From 2c0f47e778a796891fc7190c31dc73a9ea7f8289 Mon Sep 17 00:00:00 2001 From: gggeek Date: Mon, 15 Dec 2014 00:33:41 +0000 Subject: [PATCH 1/1] WIP - more fixes: system methods in server, charset guessing --- src/Encoder.php | 174 ++++++++++++++++++++++++------------------------ src/Request.php | 3 +- src/Server.php | 19 +++--- 3 files changed, 99 insertions(+), 97 deletions(-) diff --git a/src/Encoder.php b/src/Encoder.php index d687467..371237b 100644 --- a/src/Encoder.php +++ b/src/Encoder.php @@ -132,7 +132,7 @@ class Encoder * * @param mixed $php_val the value to be converted into an xmlrpcval object * @param array $options can include 'encode_php_objs', 'auto_dates', 'null_extension' or 'extension_api' - * @return xmlrpcval + * @return \PhpXmlrpc\Value */ function encode($php_val, $options=array()) { @@ -319,102 +319,102 @@ class Encoder } -/** - * xml charset encoding guessing helper function. - * Tries to determine the charset encoding of an XML chunk received over HTTP. - * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a content-type, - * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of unconforming (legacy?) clients/servers, - * which will be most probably using UTF-8 anyway... - * - * @param string $httpheader the http Content-type header - * @param string $xmlchunk xml content buffer - * @param string $encoding_prefs comma separated list of character encodings to be used as default (when mb extension is enabled) - * @return string - * - * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!! - */ -function guess_encoding($httpheader='', $xmlchunk='', $encoding_prefs=null) -{ - // discussion: see http://www.yale.edu/pclt/encoding/ - // 1 - test if encoding is specified in HTTP HEADERS - - //Details: - // LWS: (\13\10)?( |\t)+ - // token: (any char but excluded stuff)+ - // quoted string: " (any char but double quotes and cointrol chars)* " - // header: Content-type = ...; charset=value(; ...)* - // where value is of type token, no LWS allowed between 'charset' and value - // Note: we do not check for invalid chars in VALUE: - // this had better be done using pure ereg as below - // Note 2: we might be removing whitespace/tabs that ought to be left in if - // the received charset is a quoted string. But nobody uses such charset names... - - /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it? - $matches = array(); - if(preg_match('/;\s*charset\s*=([^;]+)/i', $httpheader, $matches)) + /** + * xml charset encoding guessing helper function. + * Tries to determine the charset encoding of an XML chunk received over HTTP. + * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a content-type, + * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of unconforming (legacy?) clients/servers, + * which will be most probably using UTF-8 anyway... + * + * @param string $httpheader the http Content-type header + * @param string $xmlchunk xml content buffer + * @param string $encoding_prefs comma separated list of character encodings to be used as default (when mb extension is enabled) + * @return string + * + * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!! + */ + static function guess_encoding($httpheader='', $xmlchunk='', $encoding_prefs=null) { - return strtoupper(trim($matches[1], " \t\"")); - } + // discussion: see http://www.yale.edu/pclt/encoding/ + // 1 - test if encoding is specified in HTTP HEADERS - // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern - // (source: http://www.w3.org/TR/2000/REC-xml-20001006) - // NOTE: actually, according to the spec, even if we find the BOM and determine - // an encoding, we should check if there is an encoding specified - // in the xml declaration, and verify if they match. - /// @todo implement check as described above? - /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM) - if(preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlchunk)) - { - return 'UCS-4'; - } - elseif(preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlchunk)) - { - return 'UTF-16'; - } - elseif(preg_match('/^(\xEF\xBB\xBF)/', $xmlchunk)) - { - return 'UTF-8'; - } + //Details: + // LWS: (\13\10)?( |\t)+ + // token: (any char but excluded stuff)+ + // quoted string: " (any char but double quotes and cointrol chars)* " + // header: Content-type = ...; charset=value(; ...)* + // where value is of type token, no LWS allowed between 'charset' and value + // Note: we do not check for invalid chars in VALUE: + // this had better be done using pure ereg as below + // Note 2: we might be removing whitespace/tabs that ought to be left in if + // the received charset is a quoted string. But nobody uses such charset names... - // 3 - test if encoding is specified in the xml declaration - // Details: - // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+ - // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]* - if (preg_match('/^<\?xml\s+version\s*=\s*'. "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))". - '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/", - $xmlchunk, $matches)) - { - return strtoupper(substr($matches[2], 1, -1)); - } + /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it? + $matches = array(); + if(preg_match('/;\s*charset\s*=([^;]+)/i', $httpheader, $matches)) + { + return strtoupper(trim($matches[1], " \t\"")); + } - // 4 - if mbstring is available, let it do the guesswork - // NB: we favour finding an encoding that is compatible with what we can process - if(extension_loaded('mbstring')) - { - if($encoding_prefs) + // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern + // (source: http://www.w3.org/TR/2000/REC-xml-20001006) + // NOTE: actually, according to the spec, even if we find the BOM and determine + // an encoding, we should check if there is an encoding specified + // in the xml declaration, and verify if they match. + /// @todo implement check as described above? + /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM) + if(preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlchunk)) { - $enc = mb_detect_encoding($xmlchunk, $encoding_prefs); + return 'UCS-4'; } - else + elseif(preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlchunk)) { - $enc = mb_detect_encoding($xmlchunk); + return 'UTF-16'; } - // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII... - // IANA also likes better US-ASCII, so go with it - if($enc == 'ASCII') + elseif(preg_match('/^(\xEF\xBB\xBF)/', $xmlchunk)) { - $enc = 'US-'.$enc; + return 'UTF-8'; + } + + // 3 - test if encoding is specified in the xml declaration + // Details: + // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+ + // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]* + if (preg_match('/^<\?xml\s+version\s*=\s*'. "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))". + '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/", + $xmlchunk, $matches)) + { + return strtoupper(substr($matches[2], 1, -1)); + } + + // 4 - if mbstring is available, let it do the guesswork + // NB: we favour finding an encoding that is compatible with what we can process + if(extension_loaded('mbstring')) + { + if($encoding_prefs) + { + $enc = mb_detect_encoding($xmlchunk, $encoding_prefs); + } + else + { + $enc = mb_detect_encoding($xmlchunk); + } + // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII... + // IANA also likes better US-ASCII, so go with it + if($enc == 'ASCII') + { + $enc = 'US-'.$enc; + } + return $enc; + } + else + { + // no encoding specified: as per HTTP1.1 assume it is iso-8859-1? + // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types + // this should be the standard. And we should be getting text/xml as request and response. + // BUT we have to be backward compatible with the lib, which always used UTF-8 as default... + return PhpXmlRpc::$xmlrpc_defencoding; } - return $enc; - } - else - { - // no encoding specified: as per HTTP1.1 assume it is iso-8859-1? - // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types - // this should be the standard. And we should be getting text/xml as request and response. - // BUT we have to be backward compatible with the lib, which always used UTF-8 as default... - return PhpXmlRpc::$xmlrpc_defencoding; } -} } \ No newline at end of file diff --git a/src/Request.php b/src/Request.php index 8b4a113..00db29c 100644 --- a/src/Request.php +++ b/src/Request.php @@ -4,6 +4,7 @@ namespace PhpXmlRpc; use PhpXmlRpc\Helper\Http; use PhpXmlRpc\Helper\XMLParser; +use PhpXmlRpc\Helper\Encoder; class Request { @@ -463,7 +464,7 @@ class Request } // try to 'guestimate' the character encoding of the received response - $resp_encoding = guess_encoding(@$this->httpResponse['headers']['content-type'], $data); + $resp_encoding = Encoder::guess_encoding(@$this->httpResponse['headers']['content-type'], $data); // if response charset encoding is not known / supported, try to use // the default encoding and parse the xml anyway, but log a warning... diff --git a/src/Server.php b/src/Server.php index c4d16a8..56fd6e8 100644 --- a/src/Server.php +++ b/src/Server.php @@ -4,6 +4,7 @@ namespace PhpXmlRpc; use PhpXmlRpc\Helper\XMLParser; use PhpXmlRpc\Helper\Charset; +use PhpXmlRpc\Helper\Encoder; /** * Error handler used to track errors that occur during server-side execution of PHP code. @@ -225,7 +226,7 @@ class Server * Execute the xmlrpc request, printing the response * @param string $data the request body. If null, the http POST request will be examined * @param bool $return_payload When true, return the response but do not echo it or any http header - * @return xmlrpcresp the response object (usually not used by caller...) + * @return Response the response object (usually not used by caller...) */ function service($data=null, $return_payload=false) { @@ -532,7 +533,7 @@ class Server // 'guestimate' request encoding /// @todo check if mbstring is enabled and automagic input conversion is on: it might mingle with this check??? - $req_encoding = guess_encoding(isset($_SERVER['CONTENT_TYPE']) ? $_SERVER['CONTENT_TYPE'] : '', + $req_encoding = Encoder::guess_encoding(isset($_SERVER['CONTENT_TYPE']) ? $_SERVER['CONTENT_TYPE'] : '', $data); return null; @@ -673,7 +674,7 @@ class Server $methName = $m; } $sysCall = $this->allow_system_funcs && (strpos($methName, "system.") === 0); - $dmap = $sysCall ? $GLOBALS['_xmlrpcs_dmap'] : $this->dmap; + $dmap = $sysCall ? $this->getSystemDispatchMap() : $this->dmap; if(!isset($dmap[$methName]['function'])) { @@ -865,7 +866,7 @@ class Server /* Functions that implement system.XXX methods of xmlrpc servers */ - protected function getSystemDispatchMap() + public function getSystemDispatchMap() { return array( 'system.listMethods' => array( @@ -944,7 +945,7 @@ class Server } if($server->allow_system_funcs) { - foreach($GLOBALS['_xmlrpcs_dmap'] as $key => $val) + foreach($server->getSystemDispatchMap() as $key => $val) { $outAr[]=new Value($key, 'string'); } @@ -966,11 +967,11 @@ class Server } if(strpos($methName, "system.") === 0) { - $dmap=$GLOBALS['_xmlrpcs_dmap']; $sysCall=1; + $dmap=$server->getSystemDispatchMap(); } else { - $dmap=$server->dmap; $sysCall=0; + $dmap=$server->dmap; } if(isset($dmap[$methName])) { @@ -1016,11 +1017,11 @@ class Server } if(strpos($methName, "system.") === 0) { - $dmap=$GLOBALS['_xmlrpcs_dmap']; $sysCall=1; + $dmap=$server->getSystemDispatchMap(); } else { - $dmap=$server->dmap; $sysCall=0; + $dmap=$server->dmap; } if(isset($dmap[$methName])) { -- 2.43.0