From: gggeek <giunta.gaetano@gmail.com>
Date: Sat, 11 Apr 2015 22:44:39 +0000 (+0100)
Subject: Fix server and client: support LATIN-1 requests/responses where the charset declarati... 
X-Git-Tag: 4.0.0-alpha^2~102
X-Git-Url: http://git.onelab.eu/?p=plcapi.git;a=commitdiff_plain;h=7ef47445e3dc236ef5fea0d3ea5bd0492a83d2a2

Fix server and client: support LATIN-1 requests/responses where the charset declaration is in the xml prologue instead of http headers; reintroduce guess_encoding
---

diff --git a/composer.json b/composer.json
index 51130cb..62d1129 100644
--- a/composer.json
+++ b/composer.json
@@ -12,11 +12,13 @@
         "phpunit/phpunit": ">=4.0.0",
         "phpunit/phpunit-selenium": "*",
         "codeclimate/php-test-reporter": "dev-master",
-        "ext-curl": "*"
+        "ext-curl": "*",
+        "ext-mbstring": "*"
     },
     "suggest": {
         "ext-curl": "Needed for HTTPS and HTTP 1.1 support, NTLM Auth etc...",
-        "ext-zlib": "Needed for sending compressed requests and receiving compressed responses, if cURL is not available"
+        "ext-zlib": "Needed for sending compressed requests and receiving compressed responses, if cURL is not available",
+        "ext-mbstring": "Needed to allow reception of requests/responses in character sets other than ASCII,LATIN-1,UTF-8"
     },
     "autoload": {
         "psr-4": {"PhpXmlRpc\\": "src/"}
diff --git a/lib/xmlrpc.inc b/lib/xmlrpc.inc
index 1572279..9106bd6 100644
--- a/lib/xmlrpc.inc
+++ b/lib/xmlrpc.inc
@@ -201,6 +201,7 @@ function php_xmlrpc_decode_xml($xmlVal, $options=array())
 
 function guess_encoding($httpHeader='', $xmlChunk='', $encodingPrefs=null)
 {
+    return PhpXmlRpc\Helper\XMLParser::guessEncoding($httpHeader, $xmlChunk, $encodingPrefs);
 }
 
 function is_valid_charset($encoding, $validList)
diff --git a/src/Encoder.php b/src/Encoder.php
index 636ef35..2955f44 100644
--- a/src/Encoder.php
+++ b/src/Encoder.php
@@ -232,8 +232,29 @@ class Encoder
      */
     public function decode_xml($xmlVal, $options = array())
     {
+        // 'guestimate' encoding
+        $valEncoding = XMLParser::guessEncoding('', $xmlVal);
+        if ($valEncoding != '') {
+
+            // Since parsing will fail if charset is not specified in the xml prologue,
+            // the encoding is not UTF8 and there are non-ascii chars in the text, we try to work round that...
+            // The following code might be better for mb_string enabled installs, but
+            // makes the lib about 200% slower...
+            //if (!is_valid_charset($valEncoding, array('UTF-8')))
+            if (!in_array($valEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($xmlVal)) {
+                if ($valEncoding == 'ISO-8859-1') {
+                    $xmlVal = utf8_encode($xmlVal);
+                }
+                else {
+                    if (extension_loaded('mbstring')) {
+                        $xmlVal = mb_convert_encoding($xmlVal, 'UTF-8', $valEncoding);
+                    } else {
+                        error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of xml text: ' . $valEncoding);
+                    }
+                }
+            }
+        }
 
-        /// @todo 'guestimate' encoding
         $parser = xml_parser_create();
         xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true);
         // What if internal encoding is not in one of the 3 allowed?
@@ -293,88 +314,4 @@ class Encoder
         }
     }
 
-    /**
-     * xml charset encoding guessing helper function.
-     * Tries to determine the charset encoding of an XML chunk received over HTTP.
-     * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a content-type,
-     * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of unconforming (legacy?) clients/servers,
-     * which will be most probably using UTF-8 anyway...
-     *
-     * @param string $httpHeader the http Content-type header
-     * @param string $xmlChunk xml content buffer
-     * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled)
-     * @return string
-     *
-     * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!!
-     */
-    public static function guess_encoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null)
-    {
-        // discussion: see http://www.yale.edu/pclt/encoding/
-        // 1 - test if encoding is specified in HTTP HEADERS
-
-        //Details:
-        // LWS:           (\13\10)?( |\t)+
-        // token:         (any char but excluded stuff)+
-        // quoted string: " (any char but double quotes and cointrol chars)* "
-        // header:        Content-type = ...; charset=value(; ...)*
-        //   where value is of type token, no LWS allowed between 'charset' and value
-        // Note: we do not check for invalid chars in VALUE:
-        //   this had better be done using pure ereg as below
-        // Note 2: we might be removing whitespace/tabs that ought to be left in if
-        //   the received charset is a quoted string. But nobody uses such charset names...
-
-        /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it?
-        $matches = array();
-        if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) {
-            return strtoupper(trim($matches[1], " \t\""));
-        }
-
-        // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern
-        //     (source: http://www.w3.org/TR/2000/REC-xml-20001006)
-        //     NOTE: actually, according to the spec, even if we find the BOM and determine
-        //     an encoding, we should check if there is an encoding specified
-        //     in the xml declaration, and verify if they match.
-        /// @todo implement check as described above?
-        /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM)
-        if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
-            return 'UCS-4';
-        } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
-            return 'UTF-16';
-        } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
-            return 'UTF-8';
-        }
-
-        // 3 - test if encoding is specified in the xml declaration
-        // Details:
-        // SPACE:         (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
-        // EQ:            SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
-        if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
-            '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
-            $xmlChunk, $matches)) {
-            return strtoupper(substr($matches[2], 1, -1));
-        }
-
-        // 4 - if mbstring is available, let it do the guesswork
-        // NB: we favour finding an encoding that is compatible with what we can process
-        if (extension_loaded('mbstring')) {
-            if ($encodingPrefs) {
-                $enc = mb_detect_encoding($xmlChunk, $encodingPrefs);
-            } else {
-                $enc = mb_detect_encoding($xmlChunk);
-            }
-            // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII...
-            // IANA also likes better US-ASCII, so go with it
-            if ($enc == 'ASCII') {
-                $enc = 'US-' . $enc;
-            }
-
-            return $enc;
-        } else {
-            // no encoding specified: as per HTTP1.1 assume it is iso-8859-1?
-            // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types
-            // this should be the standard. And we should be getting text/xml as request and response.
-            // BUT we have to be backward compatible with the lib, which always used UTF-8 as default...
-            return PhpXmlRpc::$xmlrpc_defencoding;
-        }
-    }
 }
diff --git a/src/Helper/XMLParser.php b/src/Helper/XMLParser.php
index e11d233..58acdaf 100644
--- a/src/Helper/XMLParser.php
+++ b/src/Helper/XMLParser.php
@@ -440,4 +440,119 @@ class XMLParser
         return true;
     }
 
+    /**
+     * xml charset encoding guessing helper function.
+     * Tries to determine the charset encoding of an XML chunk received over HTTP.
+     * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a charset parameter,
+     * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of non conforming (legacy?) clients/servers,
+     * which will be most probably using UTF-8 anyway...
+     *
+     * @param string $httpHeader the http Content-type header
+     * @param string $xmlChunk xml content buffer
+     * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled)
+     * @return string
+     *
+     * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!!
+     */
+    public static function guessEncoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null)
+    {
+        // discussion: see http://www.yale.edu/pclt/encoding/
+        // 1 - test if encoding is specified in HTTP HEADERS
+
+        //Details:
+        // LWS:           (\13\10)?( |\t)+
+        // token:         (any char but excluded stuff)+
+        // quoted string: " (any char but double quotes and control chars)* "
+        // header:        Content-type = ...; charset=value(; ...)*
+        //   where value is of type token, no LWS allowed between 'charset' and value
+        // Note: we do not check for invalid chars in VALUE:
+        //   this had better be done using pure ereg as below
+        // Note 2: we might be removing whitespace/tabs that ought to be left in if
+        //   the received charset is a quoted string. But nobody uses such charset names...
+
+        /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it?
+        $matches = array();
+        if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) {
+            return strtoupper(trim($matches[1], " \t\""));
+        }
+
+        // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern
+        //     (source: http://www.w3.org/TR/2000/REC-xml-20001006)
+        //     NOTE: actually, according to the spec, even if we find the BOM and determine
+        //     an encoding, we should check if there is an encoding specified
+        //     in the xml declaration, and verify if they match.
+        /// @todo implement check as described above?
+        /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM)
+        if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
+            return 'UCS-4';
+        } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
+            return 'UTF-16';
+        } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
+            return 'UTF-8';
+        }
+
+        // 3 - test if encoding is specified in the xml declaration
+        // Details:
+        // SPACE:         (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
+        // EQ:            SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
+        if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
+            '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
+            $xmlChunk, $matches)) {
+            return strtoupper(substr($matches[2], 1, -1));
+        }
+
+        // 4 - if mbstring is available, let it do the guesswork
+        // NB: we favour finding an encoding that is compatible with what we can process
+        if (extension_loaded('mbstring')) {
+            if ($encodingPrefs) {
+                $enc = mb_detect_encoding($xmlChunk, $encodingPrefs);
+            } else {
+                $enc = mb_detect_encoding($xmlChunk);
+            }
+            // NB: mb_detect_encoding calls it ASCII, while the xml parser calls it US-ASCII...
+            // IANA also prefers US-ASCII, so go with it
+            if ($enc == 'ASCII') {
+                $enc = 'US-' . $enc;
+            }
+
+            return $enc;
+        } else {
+            // no encoding specified: as per HTTP1.1 assume it is iso-8859-1?
+            // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types
+            // this should be the standard. And we should be getting text/xml as request and response.
+            // BUT we have to be backward compatible with the lib, which always used UTF-8 as default...
+            return PhpXmlRpc::$xmlrpc_defencoding;
+        }
+    }
+
+    /**
+     * Helper function: checks if an xml chunk has a charset declaration (BOM or in the xml declaration)
+     *
+     * @param string $xmlChunk
+     * @return bool
+     */
+    public static function hasEncoding($xmlChunk)
+    {
+        // scan the first bytes of the data for a UTF-16 (or other) BOM pattern
+        //     (source: http://www.w3.org/TR/2000/REC-xml-20001006)
+        if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
+            return true;
+        } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
+            return true;
+        } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
+            return true;
+        }
+
+        // test if encoding is specified in the xml declaration
+        // Details:
+        // SPACE:         (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
+        // EQ:            SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
+        if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
+            '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
+            $xmlChunk, $matches)) {
+            return true;
+        }
+
+        return false;
+    }
 }
diff --git a/src/Request.php b/src/Request.php
index 9192b81..e6816a3 100644
--- a/src/Request.php
+++ b/src/Request.php
@@ -227,19 +227,30 @@ class Request
         }
 
         // try to 'guestimate' the character encoding of the received response
-        $respEncoding = Encoder::guess_encoding(@$this->httpResponse['headers']['content-type'], $data);
+        $respEncoding = XMLParser::guessEncoding(@$this->httpResponse['headers']['content-type'], $data);
 
-        // if response charset encoding is not known / supported, try to use
-        // the default encoding and parse the xml anyway, but log a warning...
-        if (!in_array($respEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) {
-            // the following code might be better for mb_string enabled installs, but
-            // makes the lib about 200% slower...
-            //if (!is_valid_charset($respEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')))
+        if ($respEncoding != '') {
 
-            error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received response: ' . $respEncoding);
-            $respEncoding = PhpXmlRpc::$xmlrpc_defencoding;
+            // Since parsing will fail if charset is not specified in the xml prologue,
+            // the encoding is not UTF8 and there are non-ascii chars in the text, we try to work round that...
+            // The following code might be better for mb_string enabled installs, but
+            // makes the lib about 200% slower...
+            //if (!is_valid_charset($respEncoding, array('UTF-8')))
+            if (!in_array($respEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($data)) {
+                if ($respEncoding == 'ISO-8859-1') {
+                    $data = utf8_encode($data);
+                }
+                else {
+                    if (extension_loaded('mbstring')) {
+                        $data = mb_convert_encoding($data, 'UTF-8', $respEncoding);
+                    } else {
+                        error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received response: ' . $respEncoding);
+                    }
+                }
+            }
         }
-        $parser = xml_parser_create($respEncoding);
+
+        $parser = xml_parser_create();
         xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true);
         // G. Giunta 2005/02/13: PHP internally uses ISO-8859-1, so we have to tell
         // the xml parser to give us back data in the expected charset.
diff --git a/src/Server.php b/src/Server.php
index 113ead4..40afa9b 100644
--- a/src/Server.php
+++ b/src/Server.php
@@ -429,7 +429,7 @@ class Server
 
         // 'guestimate' request encoding
         /// @todo check if mbstring is enabled and automagic input conversion is on: it might mingle with this check???
-        $reqEncoding = Encoder::guess_encoding(isset($_SERVER['CONTENT_TYPE']) ? $_SERVER['CONTENT_TYPE'] : '',
+        $reqEncoding = XMLParser::guessEncoding(isset($_SERVER['CONTENT_TYPE']) ? $_SERVER['CONTENT_TYPE'] : '',
             $data);
 
         return;
@@ -446,34 +446,29 @@ class Server
      */
     public function parseRequest($data, $reqEncoding = '')
     {
-        // 2005/05/07 commented and moved into caller function code
-        //if($data=='')
-        //{
-        //    $data=$GLOBALS['HTTP_RAW_POST_DATA'];
-        //}
-
-        // G. Giunta 2005/02/13: we do NOT expect to receive html entities
-        // so we do not try to convert them into xml character entities
-        //$data = xmlrpc_html_entity_xlate($data);
-
         // decompose incoming XML into request structure
-        if ($reqEncoding != '') {
-            if (!in_array($reqEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) {
-                // the following code might be better for mb_string enabled installs, but
-                // makes the lib about 200% slower...
-                //if (!is_valid_charset($reqEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')))
 
-                error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received request: ' . $reqEncoding);
-                $reqEncoding = PhpXmlRpc::$xmlrpc_defencoding;
+        if ($reqEncoding != '') {
+            // Since parsing will fail if charset is not specified in the xml prologue,
+            // the encoding is not UTF8 and there are non-ascii chars in the text, we try to work round that...
+            // The following code might be better for mb_string enabled installs, but
+            // makes the lib about 200% slower...
+            //if (!is_valid_charset($reqEncoding, array('UTF-8')))
+            if (!in_array($reqEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($data)) {
+                if ($reqEncoding == 'ISO-8859-1') {
+                    $data = utf8_encode($data);
+                }
+                else {
+                    if (extension_loaded('mbstring')) {
+                        $data = mb_convert_encoding($data, 'UTF-8', $reqEncoding);
+                    } else {
+                        error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received request: ' . $reqEncoding);
+                    }
+                }
             }
-            /// @BUG this will fail on PHP 5 if charset is not specified in the xml prologue,
-            // the encoding is not UTF8 and there are non-ascii chars in the text...
-            /// @todo use an empty string for php 5 ???
-            $parser = xml_parser_create($reqEncoding);
-        } else {
-            $parser = xml_parser_create();
         }
 
+        $parser = xml_parser_create();
         xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true);
         // G. Giunta 2005/02/13: PHP internally uses ISO-8859-1, so we have to tell
         // the xml parser to give us back data in the expected charset