"phpunit/phpunit": ">=4.0.0",
"phpunit/phpunit-selenium": "*",
"codeclimate/php-test-reporter": "dev-master",
- "ext-curl": "*"
+ "ext-curl": "*",
+ "ext-mbstring": "*"
},
"suggest": {
"ext-curl": "Needed for HTTPS and HTTP 1.1 support, NTLM Auth etc...",
- "ext-zlib": "Needed for sending compressed requests and receiving compressed responses, if cURL is not available"
+ "ext-zlib": "Needed for sending compressed requests and receiving compressed responses, if cURL is not available",
+ "ext-mbstring": "Needed to allow reception of requests/responses in character sets other than ASCII,LATIN-1,UTF-8"
},
"autoload": {
"psr-4": {"PhpXmlRpc\\": "src/"}
function guess_encoding($httpHeader='', $xmlChunk='', $encodingPrefs=null)
{
+ return PhpXmlRpc\Helper\XMLParser::guessEncoding($httpHeader, $xmlChunk, $encodingPrefs); // thin wrapper: forwards all three arguments unchanged and returns the result as-is
}
function is_valid_charset($encoding, $validList)
*/
public function decode_xml($xmlVal, $options = array())
{
+ // 'guestimate' encoding
+ $valEncoding = XMLParser::guessEncoding('', $xmlVal);
+ if ($valEncoding != '') {
+
+ // Parsing fails when all of these hold: no charset is declared in the xml prologue,
+ // the encoding is not UTF-8, and the text contains non-ascii chars. We work around that by converting to UTF-8 first.
+ // The following code might be better for mb_string enabled installs, but
+ // makes the lib about 200% slower...
+ //if (!is_valid_charset($valEncoding, array('UTF-8'))
+ if (!in_array($valEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($xmlVal)) {
+ if ($valEncoding == 'ISO-8859-1') {
+ $xmlVal = utf8_encode($xmlVal);
+ }
+ else {
+ if (extension_loaded('mbstring')) {
+ $xmlVal = mb_convert_encoding($xmlVal, 'UTF-8', $valEncoding);
+ } else {
+ error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of xml text: ' . $valEncoding);
+ }
+ }
+ }
+ }
- /// @todo 'guestimate' encoding
$parser = xml_parser_create();
xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true);
// What if internal encoding is not in one of the 3 allowed?
}
}
- /**
- * xml charset encoding guessing helper function.
- * Tries to determine the charset encoding of an XML chunk received over HTTP.
- * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a content-type,
- * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of unconforming (legacy?) clients/servers,
- * which will be most probably using UTF-8 anyway...
- *
- * @param string $httpHeader the http Content-type header
- * @param string $xmlChunk xml content buffer
- * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled)
- * @return string
- *
- * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!!
- */
- public static function guess_encoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null)
- {
- // discussion: see http://www.yale.edu/pclt/encoding/
- // 1 - test if encoding is specified in HTTP HEADERS
-
- //Details:
- // LWS: (\13\10)?( |\t)+
- // token: (any char but excluded stuff)+
- // quoted string: " (any char but double quotes and cointrol chars)* "
- // header: Content-type = ...; charset=value(; ...)*
- // where value is of type token, no LWS allowed between 'charset' and value
- // Note: we do not check for invalid chars in VALUE:
- // this had better be done using pure ereg as below
- // Note 2: we might be removing whitespace/tabs that ought to be left in if
- // the received charset is a quoted string. But nobody uses such charset names...
-
- /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it?
- $matches = array();
- if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) {
- return strtoupper(trim($matches[1], " \t\""));
- }
-
- // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern
- // (source: http://www.w3.org/TR/2000/REC-xml-20001006)
- // NOTE: actually, according to the spec, even if we find the BOM and determine
- // an encoding, we should check if there is an encoding specified
- // in the xml declaration, and verify if they match.
- /// @todo implement check as described above?
- /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM)
- if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
- return 'UCS-4';
- } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
- return 'UTF-16';
- } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
- return 'UTF-8';
- }
-
- // 3 - test if encoding is specified in the xml declaration
- // Details:
- // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
- // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
- if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
- '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
- $xmlChunk, $matches)) {
- return strtoupper(substr($matches[2], 1, -1));
- }
-
- // 4 - if mbstring is available, let it do the guesswork
- // NB: we favour finding an encoding that is compatible with what we can process
- if (extension_loaded('mbstring')) {
- if ($encodingPrefs) {
- $enc = mb_detect_encoding($xmlChunk, $encodingPrefs);
- } else {
- $enc = mb_detect_encoding($xmlChunk);
- }
- // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII...
- // IANA also likes better US-ASCII, so go with it
- if ($enc == 'ASCII') {
- $enc = 'US-' . $enc;
- }
-
- return $enc;
- } else {
- // no encoding specified: as per HTTP1.1 assume it is iso-8859-1?
- // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types
- // this should be the standard. And we should be getting text/xml as request and response.
- // BUT we have to be backward compatible with the lib, which always used UTF-8 as default...
- return PhpXmlRpc::$xmlrpc_defencoding;
- }
- }
}
return true;
}
+ /**
+ * xml charset encoding guessing helper function.
+ * Tries to determine the charset encoding of an XML chunk received over HTTP.
+ *
+ * The checks below are tried in order, and the first one that succeeds decides the result:
+ *  1. a "charset=..." parameter found in $httpHeader
+ *  2. a byte order mark (4-byte, UTF-16 or UTF-8) at the very start of $xmlChunk
+ *  3. the "encoding" pseudo-attribute of an xml declaration at the very start of $xmlChunk
+ *  4. when the mbstring extension is loaded, whatever mb_detect_encoding() reports for $xmlChunk;
+ *     otherwise the library default, PhpXmlRpc::$xmlrpc_defencoding
+ *
+ * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a charset parameter,
+ * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of non conforming (legacy?) clients/servers,
+ * which will be most probably using UTF-8 anyway...
+ *
+ * @param string $httpHeader the http Content-type header
+ * @param string $xmlChunk xml content buffer
+ * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled);
+ *                              when non-empty it is handed to mb_detect_encoding() as the list of candidate encodings
+ * @return string the charset name. Names taken from the http header or the xml declaration are upper-cased;
+ *                BOM matches yield 'UCS-4', 'UTF-16' or 'UTF-8'; 'ASCII' from mbstring is reported as 'US-ASCII'.
+ *                NOTE(review): the result of mb_detect_encoding() is returned as-is, and the PHP manual documents it
+ *                as returning false when detection fails - confirm that callers cope with a non-string result
+ *
+ * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!!
+ */
+ public static function guessEncoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null)
+ {
+ // discussion: see http://www.yale.edu/pclt/encoding/
+ // 1 - test if encoding is specified in HTTP HEADERS
+
+ //Details:
+ // LWS: (\13\10)?( |\t)+
+ // token: (any char but excluded stuff)+
+ // quoted string: " (any char but double quotes and control chars)* "
+ // header: Content-type = ...; charset=value(; ...)*
+ // where value is of type token, no LWS allowed between 'charset' and value
+ // Note: we do not check for invalid chars in VALUE:
+ // this had better be done using pure ereg as below
+ // Note 2: we might be removing whitespace/tabs that ought to be left in if
+ // the received charset is a quoted string. But nobody uses such charset names...
+
+ /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it?
+ $matches = array();
+ if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) {
+ return strtoupper(trim($matches[1], " \t\""));
+ }
+
+ // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern
+ // (source: http://www.w3.org/TR/2000/REC-xml-20001006)
+ // NOTE: actually, according to the spec, even if we find the BOM and determine
+ // an encoding, we should check if there is an encoding specified
+ // in the xml declaration, and verify if they match.
+ /// @todo implement check as described above?
+ /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM)
+ if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
+ return 'UCS-4';
+ } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
+ return 'UTF-16';
+ } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
+ return 'UTF-8';
+ }
+
+ // 3 - test if encoding is specified in the xml declaration (the pattern expects 'version' first, then 'encoding')
+ // Details:
+ // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
+ // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
+ if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
+ '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
+ $xmlChunk, $matches)) {
+ return strtoupper(substr($matches[2], 1, -1)); // substr() drops the surrounding quote characters captured by group 2
+ }
+
+ // 4 - if mbstring is available, let it do the guesswork
+ // NB: we favour finding an encoding that is compatible with what we can process
+ if (extension_loaded('mbstring')) {
+ if ($encodingPrefs) {
+ $enc = mb_detect_encoding($xmlChunk, $encodingPrefs);
+ } else {
+ $enc = mb_detect_encoding($xmlChunk);
+ }
+ // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII...
+ // IANA also likes better US-ASCII, so go with it
+ if ($enc == 'ASCII') {
+ $enc = 'US-' . $enc;
+ }
+
+ return $enc;
+ } else {
+ // no encoding specified: as per HTTP1.1 assume it is iso-8859-1?
+ // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types
+ // this should be the standard. And we should be getting text/xml as request and response.
+ // BUT we have to be backward compatible with the lib, which always used UTF-8 as default...
+ return PhpXmlRpc::$xmlrpc_defencoding;
+ }
+ }
+
+ /**
+ * Helper function: checks if an xml chunk has a charset declaration (BOM or in the xml declaration)
+ *
+ * Only the very start of the chunk is examined: the BOM patterns and the xml declaration pattern are all
+ * anchored at the first byte, so a declaration preceded by anything else (e.g. whitespace) is not detected.
+ * The xml declaration is only recognised when it carries a 'version' followed by an 'encoding' pseudo-attribute.
+ *
+ * @param string $xmlChunk
+ * @return bool true if the chunk starts with one of the recognised BOMs or with an xml declaration that
+ *              specifies an encoding, false otherwise
+ */
+ public static function hasEncoding($xmlChunk)
+ {
+ // scan the first bytes of the data for a BOM pattern: 4-byte ones first, then UTF-16, then UTF-8
+ // (source: http://www.w3.org/TR/2000/REC-xml-20001006)
+ if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
+ return true;
+ } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
+ return true;
+ } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
+ return true;
+ }
+
+ // test if encoding is specified in the xml declaration
+ // Details:
+ // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
+ // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
+ if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
+ '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
+ $xmlChunk, $matches)) { // NOTE(review): $matches is filled in by preg_match but never read in this method
+ return true;
+ }
+
+ return false;
+ }
}
}
// try to 'guestimate' the character encoding of the received response
- $respEncoding = Encoder::guess_encoding(@$this->httpResponse['headers']['content-type'], $data);
+ $respEncoding = XMLParser::guessEncoding(@$this->httpResponse['headers']['content-type'], $data);
- // if response charset encoding is not known / supported, try to use
- // the default encoding and parse the xml anyway, but log a warning...
- if (!in_array($respEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) {
- // the following code might be better for mb_string enabled installs, but
- // makes the lib about 200% slower...
- //if (!is_valid_charset($respEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')))
+ if ($respEncoding != '') {
- error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received response: ' . $respEncoding);
- $respEncoding = PhpXmlRpc::$xmlrpc_defencoding;
+ // Parsing fails when all of these hold: no charset is declared in the xml prologue,
+ // the encoding is not UTF-8, and the text contains non-ascii chars. We work around that by converting to UTF-8 first.
+ // The following code might be better for mb_string enabled installs, but
+ // makes the lib about 200% slower...
+ //if (!is_valid_charset($respEncoding, array('UTF-8')))
+ if (!in_array($respEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($data)) {
+ if ($respEncoding == 'ISO-8859-1') {
+ $data = utf8_encode($data);
+ }
+ else {
+ if (extension_loaded('mbstring')) {
+ $data = mb_convert_encoding($data, 'UTF-8', $respEncoding);
+ } else {
+ error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received response: ' . $respEncoding);
+ }
+ }
+ }
}
- $parser = xml_parser_create($respEncoding);
+
+ $parser = xml_parser_create();
xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true);
// G. Giunta 2005/02/13: PHP internally uses ISO-8859-1, so we have to tell
// the xml parser to give us back data in the expected charset.
// 'guestimate' request encoding
/// @todo check if mbstring is enabled and automagic input conversion is on: it might mingle with this check???
- $reqEncoding = Encoder::guess_encoding(isset($_SERVER['CONTENT_TYPE']) ? $_SERVER['CONTENT_TYPE'] : '',
+ $reqEncoding = XMLParser::guessEncoding(isset($_SERVER['CONTENT_TYPE']) ? $_SERVER['CONTENT_TYPE'] : '',
$data);
return;
*/
public function parseRequest($data, $reqEncoding = '')
{
- // 2005/05/07 commented and moved into caller function code
- //if($data=='')
- //{
- // $data=$GLOBALS['HTTP_RAW_POST_DATA'];
- //}
-
- // G. Giunta 2005/02/13: we do NOT expect to receive html entities
- // so we do not try to convert them into xml character entities
- //$data = xmlrpc_html_entity_xlate($data);
-
// decompose incoming XML into request structure
- if ($reqEncoding != '') {
- if (!in_array($reqEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII'))) {
- // the following code might be better for mb_string enabled installs, but
- // makes the lib about 200% slower...
- //if (!is_valid_charset($reqEncoding, array('UTF-8', 'ISO-8859-1', 'US-ASCII')))
- error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received request: ' . $reqEncoding);
- $reqEncoding = PhpXmlRpc::$xmlrpc_defencoding;
+ if ($reqEncoding != '') {
+ // Parsing fails when all of these hold: no charset is declared in the xml prologue,
+ // the encoding is not UTF-8, and the text contains non-ascii chars. We work around that by converting to UTF-8 first.
+ // The following code might be better for mb_string enabled installs, but
+ // makes the lib about 200% slower...
+ //if (!is_valid_charset($reqEncoding, array('UTF-8')))
+ if (!in_array($reqEncoding, array('UTF-8', 'US-ASCII')) && !XMLParser::hasEncoding($data)) {
+ if ($reqEncoding == 'ISO-8859-1') {
+ $data = utf8_encode($data);
+ }
+ else {
+ if (extension_loaded('mbstring')) {
+ $data = mb_convert_encoding($data, 'UTF-8', $reqEncoding);
+ } else {
+ error_log('XML-RPC: ' . __METHOD__ . ': invalid charset encoding of received request: ' . $reqEncoding);
+ }
+ }
}
- /// @BUG this will fail on PHP 5 if charset is not specified in the xml prologue,
- // the encoding is not UTF8 and there are non-ascii chars in the text...
- /// @todo use an empty string for php 5 ???
- $parser = xml_parser_create($reqEncoding);
- } else {
- $parser = xml_parser_create();
}
+ $parser = xml_parser_create();
xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, true);
// G. Giunta 2005/02/13: PHP internally uses ISO-8859-1, so we have to tell
// the xml parser to give us back data in the expected charset