From: gggeek Date: Sat, 11 Apr 2015 22:44:39 +0000 (+0100) Subject: Fix server and client: support LATIN-1 requests/responses where the charset declarati... X-Git-Tag: 4.0.0-alpha^2~102 X-Git-Url: http://git.onelab.eu/?p=plcapi.git;a=commitdiff_plain;h=7ef47445e3dc236ef5fea0d3ea5bd0492a83d2a2 Fix server and client: support LATIN-1 requests/responses where the charset declaration is in the xml prologue instead of http headers; reintroduce guess_encoding --- diff --git a/composer.json b/composer.json index 51130cb..62d1129 100644 --- a/composer.json +++ b/composer.json @@ -12,11 +12,13 @@ "phpunit/phpunit": ">=4.0.0", "phpunit/phpunit-selenium": "*", "codeclimate/php-test-reporter": "dev-master", - "ext-curl": "*" + "ext-curl": "*", + "ext-mbstring": "*" }, "suggest": { "ext-curl": "Needed for HTTPS and HTTP 1.1 support, NTLM Auth etc...", - "ext-zlib": "Needed for sending compressed requests and receiving compressed responses, if cURL is not available" + "ext-zlib": "Needed for sending compressed requests and receiving compressed responses, if cURL is not available", + "ext-mbstring": "Needed to allow reception of requests/responses in character sets other than ASCII,LATIN-1,UTF-8" }, "autoload": { "psr-4": {"PhpXmlRpc\\": "src/"} diff --git a/lib/xmlrpc.inc b/lib/xmlrpc.inc index 1572279..9106bd6 100644 --- a/lib/xmlrpc.inc +++ b/lib/xmlrpc.inc @@ -201,6 +201,7 @@ function php_xmlrpc_decode_xml($xmlVal, $options=array()) function guess_encoding($httpHeader='', $xmlChunk='', $encodingPrefs=null) { + return PhpXmlRpc\Helper\XMLParser::guessEncoding($httpHeader, $xmlChunk, $encodingPrefs); } function is_valid_charset($encoding, $validList) diff --git a/src/Encoder.php b/src/Encoder.php index 636ef35..2955f44 100644 --- a/src/Encoder.php +++ b/src/Encoder.php @@ -232,8 +232,29 @@ class Encoder */ public function decode_xml($xmlVal, $options = array()) { + // 'guestimate' encoding + $valEncoding = XMLParser::guessEncoding('', $xmlVal); + if ($valEncoding != '') { + + // 
Since parsing will fail if charset is not specified in the xml prologue, + // the encoding is not UTF8 and there are non-ascii chars in the text, we try to work round that... + // The following code might be better for mb_string enabled installs, but + // makes the lib about 200% slower... + //if (!is_valid_charset($valEncoding, array('UTF-8')) + if (!in_array($valEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($xmlVal)) { + if ($valEncoding == 'ISO-8859-1') { + $xmlVal = utf8_encode($xmlVal); + } + else { + if (extension_loaded('mbstring')) { + $xmlVal = mb_convert_encoding($xmlVal, 'UTF-8', $valEncoding); + } else { + error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of xml text: ' . $valEncoding); + } + } + } + } - /// @todo 'guestimate' encoding $parser = xml_parser_create(); xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true); // What if internal encoding is not in one of the 3 allowed? @@ -293,88 +314,4 @@ class Encoder } } - /** - * xml charset encoding guessing helper function. - * Tries to determine the charset encoding of an XML chunk received over HTTP. - * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a content-type, - * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of unconforming (legacy?) clients/servers, - * which will be most probably using UTF-8 anyway... - * - * @param string $httpHeader the http Content-type header - * @param string $xmlChunk xml content buffer - * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled) - * @return string - * - * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!! 
- */ - public static function guess_encoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null) - { - // discussion: see http://www.yale.edu/pclt/encoding/ - // 1 - test if encoding is specified in HTTP HEADERS - - //Details: - // LWS: (\13\10)?( |\t)+ - // token: (any char but excluded stuff)+ - // quoted string: " (any char but double quotes and cointrol chars)* " - // header: Content-type = ...; charset=value(; ...)* - // where value is of type token, no LWS allowed between 'charset' and value - // Note: we do not check for invalid chars in VALUE: - // this had better be done using pure ereg as below - // Note 2: we might be removing whitespace/tabs that ought to be left in if - // the received charset is a quoted string. But nobody uses such charset names... - - /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it? - $matches = array(); - if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) { - return strtoupper(trim($matches[1], " \t\"")); - } - - // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern - // (source: http://www.w3.org/TR/2000/REC-xml-20001006) - // NOTE: actually, according to the spec, even if we find the BOM and determine - // an encoding, we should check if there is an encoding specified - // in the xml declaration, and verify if they match. - /// @todo implement check as described above? - /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM) - if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) { - return 'UCS-4'; - } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) { - return 'UTF-16'; - } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) { - return 'UTF-8'; - } - - // 3 - test if encoding is specified in the xml declaration - // Details: - // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+ - // EQ: SPACE?=SPACE? 
=== [ \x9\xD\xA]*=[ \x9\xD\xA]* - if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" . - '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/", - $xmlChunk, $matches)) { - return strtoupper(substr($matches[2], 1, -1)); - } - - // 4 - if mbstring is available, let it do the guesswork - // NB: we favour finding an encoding that is compatible with what we can process - if (extension_loaded('mbstring')) { - if ($encodingPrefs) { - $enc = mb_detect_encoding($xmlChunk, $encodingPrefs); - } else { - $enc = mb_detect_encoding($xmlChunk); - } - // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII... - // IANA also likes better US-ASCII, so go with it - if ($enc == 'ASCII') { - $enc = 'US-' . $enc; - } - - return $enc; - } else { - // no encoding specified: as per HTTP1.1 assume it is iso-8859-1? - // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types - // this should be the standard. And we should be getting text/xml as request and response. - // BUT we have to be backward compatible with the lib, which always used UTF-8 as default... - return PhpXmlRpc::$xmlrpc_defencoding; - } - } } diff --git a/src/Helper/XMLParser.php b/src/Helper/XMLParser.php index e11d233..58acdaf 100644 --- a/src/Helper/XMLParser.php +++ b/src/Helper/XMLParser.php @@ -440,4 +440,119 @@ class XMLParser return true; } + /** + * xml charset encoding guessing helper function. + * Tries to determine the charset encoding of an XML chunk received over HTTP. + * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a content-type, + * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of non conforming (legacy?) clients/servers, + * which will be most probably using UTF-8 anyway... 
+ * + * @param string $httpHeader the http Content-type header + * @param string $xmlChunk xml content buffer + * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled) + * @return string + * + * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!! + */ + public static function guessEncoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null) + { + // discussion: see http://www.yale.edu/pclt/encoding/ + // 1 - test if encoding is specified in HTTP HEADERS + + //Details: + // LWS: (\13\10)?( |\t)+ + // token: (any char but excluded stuff)+ + // quoted string: " (any char but double quotes and control chars)* " + // header: Content-type = ...; charset=value(; ...)* + // where value is of type token, no LWS allowed between 'charset' and value + // Note: we do not check for invalid chars in VALUE: + // this had better be done using pure ereg as below + // Note 2: we might be removing whitespace/tabs that ought to be left in if + // the received charset is a quoted string. But nobody uses such charset names... + + /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it? + $matches = array(); + if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) { + return strtoupper(trim($matches[1], " \t\"")); + } + + // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern + // (source: http://www.w3.org/TR/2000/REC-xml-20001006) + // NOTE: actually, according to the spec, even if we find the BOM and determine + // an encoding, we should check if there is an encoding specified + // in the xml declaration, and verify if they match. + /// @todo implement check as described above? + /// @todo implement check for first bytes of string even without a BOM?
(It sure looks harder than for cases WITH a BOM) + if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) { + return 'UCS-4'; + } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) { + return 'UTF-16'; + } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) { + return 'UTF-8'; + } + + // 3 - test if encoding is specified in the xml declaration + // Details: + // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+ + // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]* + if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" . + '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/", + $xmlChunk, $matches)) { + return strtoupper(substr($matches[2], 1, -1)); + } + + // 4 - if mbstring is available, let it do the guesswork + // NB: we favour finding an encoding that is compatible with what we can process + if (extension_loaded('mbstring')) { + if ($encodingPrefs) { + $enc = mb_detect_encoding($xmlChunk, $encodingPrefs); + } else { + $enc = mb_detect_encoding($xmlChunk); + } + // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII... + // IANA also likes better US-ASCII, so go with it + if ($enc == 'ASCII') { + $enc = 'US-' . $enc; + } + + return $enc; + } else { + // no encoding specified: as per HTTP1.1 assume it is iso-8859-1? + // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types + // this should be the standard. And we should be getting text/xml as request and response. + // BUT we have to be backward compatible with the lib, which always used UTF-8 as default... 
+ return PhpXmlRpc::$xmlrpc_defencoding; + } + } + + /** + * Helper function: checks if an xml chunk has a charset declaration (BOM or in the xml declaration) + * + * @param string $xmlChunk + * @return bool + */ + public static function hasEncoding($xmlChunk) + { + // scan the first bytes of the data for a UTF-16 (or other) BOM pattern + // (source: http://www.w3.org/TR/2000/REC-xml-20001006) + if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) { + return true; + } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) { + return true; + } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) { + return true; + } + + // test if encoding is specified in the xml declaration + // Details: + // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+ + // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]* + if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" . + '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/", + $xmlChunk, $matches)) { + return true; + } + + return false; + } } diff --git a/src/Request.php b/src/Request.php index 9192b81..e6816a3 100644 --- a/src/Request.php +++ b/src/Request.php @@ -227,19 +227,30 @@ class Request } // try to 'guestimate' the character encoding of the received response - $respEncoding = Encoder::guess_encoding(@$this->httpResponse['headers']['content-type'], $data); + $respEncoding = XMLParser::guessEncoding(@$this->httpResponse['headers']['content-type'], $data); - // if response charset encoding is not known / supported, try to use - // the default encoding and parse the xml anyway, but log a warning... - if (!in_array($respEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) { - // the following code might be better for mb_string enabled installs, but - // makes the lib about 200% slower...
- //if (!is_valid_charset($respEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) + if ($respEncoding != '') { - error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received response: ' . $respEncoding); - $respEncoding = PhpXmlRpc::$xmlrpc_defencoding; + // Since parsing will fail if charset is not specified in the xml prologue, + // the encoding is not UTF8 and there are non-ascii chars in the text, we try to work round that... + // The following code might be better for mb_string enabled installs, but + // makes the lib about 200% slower... + //if (!is_valid_charset($respEncoding, array('UTF-8'))) + if (!in_array($respEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($data)) { + if ($respEncoding == 'ISO-8859-1') { + $data = utf8_encode($data); + } + else { + if (extension_loaded('mbstring')) { + $data = mb_convert_encoding($data, 'UTF-8', $respEncoding); + } else { + error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received response: ' . $respEncoding); + } + } + } } - $parser = xml_parser_create($respEncoding); + + $parser = xml_parser_create(); xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true); // G. Giunta 2005/02/13: PHP internally uses ISO-8859-1, so we have to tell // the xml parser to give us back data in the expected charset. diff --git a/src/Server.php b/src/Server.php index 113ead4..40afa9b 100644 --- a/src/Server.php +++ b/src/Server.php @@ -429,7 +429,7 @@ class Server // 'guestimate' request encoding /// @todo check if mbstring is enabled and automagic input conversion is on: it might mingle with this check??? - $reqEncoding = Encoder::guess_encoding(isset($_SERVER['CONTENT_TYPE']) ? $_SERVER['CONTENT_TYPE'] : '', + $reqEncoding = XMLParser::guessEncoding(isset($_SERVER['CONTENT_TYPE']) ? 
$_SERVER['CONTENT_TYPE'] : '', $data); return; @@ -446,34 +446,29 @@ class Server */ public function parseRequest($data, $reqEncoding = '') { - // 2005/05/07 commented and moved into caller function code - //if($data=='') - //{ - // $data=$GLOBALS['HTTP_RAW_POST_DATA']; - //} - - // G. Giunta 2005/02/13: we do NOT expect to receive html entities - // so we do not try to convert them into xml character entities - //$data = xmlrpc_html_entity_xlate($data); - // decompose incoming XML into request structure - if ($reqEncoding != '') { - if (!in_array($reqEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) { - // the following code might be better for mb_string enabled installs, but - // makes the lib about 200% slower... - //if (!is_valid_charset($reqEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) - error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received request: ' . $reqEncoding); - $reqEncoding = PhpXmlRpc::$xmlrpc_defencoding; + if ($reqEncoding != '') { + // Since parsing will fail if charset is not specified in the xml prologue, + // the encoding is not UTF8 and there are non-ascii chars in the text, we try to work round that... + // The following code might be better for mb_string enabled installs, but + // makes the lib about 200% slower... + //if (!is_valid_charset($reqEncoding, array('UTF-8'))) + if (!in_array($reqEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($data)) { + if ($reqEncoding == 'ISO-8859-1') { + $data = utf8_encode($data); + } + else { + if (extension_loaded('mbstring')) { + $data = mb_convert_encoding($data, 'UTF-8', $reqEncoding); + } else { + error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received request: ' . $reqEncoding); + } + } } - /// @BUG this will fail on PHP 5 if charset is not specified in the xml prologue, - // the encoding is not UTF8 and there are non-ascii chars in the text... - /// @todo use an empty string for php 5 ??? 
- $parser = xml_parser_create($reqEncoding); - } else { - $parser = xml_parser_create(); } + $parser = xml_parser_create(); xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true); // G. Giunta 2005/02/13: PHP internally uses ISO-8859-1, so we have to tell // the xml parser to give us back data in the expected charset