X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=src%2FHelper%2FXMLParser.php;h=b62bf4873f1aec55914f2fa2b0d148a86ce6719e;hb=b24b6acde8360a1a0f8ac23b64c19a8dc61cc3ba;hp=734ada7e9876d0fec5c0ebc93d1eb0ea2f30e49e;hpb=35d2340eea9a168983b8f20d54c399422790f816;p=plcapi.git diff --git a/src/Helper/XMLParser.php b/src/Helper/XMLParser.php index 734ada7..b62bf48 100644 --- a/src/Helper/XMLParser.php +++ b/src/Helper/XMLParser.php @@ -18,8 +18,8 @@ class XMLParser // valuestack - array used for parsing arrays and structs // lv - used to indicate "looking for a value": implements // the logic to allow values with no types to be strings - // isf - used to indicate a parsing fault (2) or xmlrpcresp fault (1) - // isf_reason - used for storing xmlrpcresp fault string + // isf - used to indicate a parsing fault (2) or xmlrpc response fault (1) + // isf_reason - used for storing xmlrpc response fault string // method - used to store method name // params - used to store parameters in method calls // pt - used to store the type of each received parameter. Useful if parameters are automatically decoded to php values @@ -61,7 +61,7 @@ class XMLParser /** * xml parser handler function for opening element tags. 
*/ - public function xmlrpc_se($parser, $name, $attrs, $accept_single_vals = false) + public function xmlrpc_se($parser, $name, $attrs, $acceptSingleVals = false) { // if invalid xmlrpc already detected, skip all processing if ($this->_xh['isf'] < 2) { @@ -71,7 +71,7 @@ class XMLParser /// there is only a single top level element in xml anyway if (count($this->_xh['stack']) == 0) { if ($name != 'METHODRESPONSE' && $name != 'METHODCALL' && ( - $name != 'VALUE' && !$accept_single_vals) + $name != 'VALUE' && !$acceptSingleVals) ) { $this->_xh['isf'] = 2; $this->_xh['isf_reason'] = 'missing top level xmlrpc element'; @@ -126,15 +126,15 @@ class XMLParser return; } // create an empty array to hold child values, and push it onto appropriate stack - $cur_val = array(); - $cur_val['values'] = array(); - $cur_val['type'] = $name; + $curVal = array(); + $curVal['values'] = array(); + $curVal['type'] = $name; // check for out-of-band information to rebuild php objs // and in case it is found, save it if (@isset($attrs['PHP_CLASS'])) { - $cur_val['php_class'] = $attrs['PHP_CLASS']; + $curVal['php_class'] = $attrs['PHP_CLASS']; } - $this->_xh['valuestack'][] = $cur_val; + $this->_xh['valuestack'][] = $curVal; $this->_xh['vt'] = 'data'; // be prepared for a data element next break; case 'DATA': @@ -209,14 +209,14 @@ class XMLParser /** * xml parser handler function for close element tags. */ - public function xmlrpc_ee($parser, $name, $rebuild_xmlrpcvals = true) + public function xmlrpc_ee($parser, $name, $rebuildXmlrpcvals = true) { if ($this->_xh['isf'] < 2) { // push this element name from stack // NB: if XML validates, correct opening/closing is guaranteed and - // we do not have to check for $name == $curr_elem. + // we do not have to check for $name == $currElem. // we also checked for proper nesting at start of elements... 
- $curr_elem = array_pop($this->_xh['stack']); + $currElem = array_pop($this->_xh['stack']); switch ($name) { case 'VALUE': @@ -226,7 +226,7 @@ class XMLParser $this->_xh['vt'] = Value::$xmlrpcString; } - if ($rebuild_xmlrpcvals) { + if ($rebuildXmlrpcvals) { // build the xmlrpc val out of the data received, and substitute it $temp = new Value($this->_xh['value'], $this->_xh['vt']); // in case we got info about underlying php class, save it @@ -271,7 +271,7 @@ class XMLParser $this->_xh['value'] = $this->_xh['ac']; } elseif ($name == 'DATETIME.ISO8601') { if (!preg_match('/^[0-9]{8}T[0-9]{2}:[0-9]{2}:[0-9]{2}$/', $this->_xh['ac'])) { - error_log('XML-RPC: invalid value received in DATETIME: ' . $this->_xh['ac']); + error_log('XML-RPC: ' . __METHOD__ . ': invalid value received in DATETIME: ' . $this->_xh['ac']); } $this->_xh['vt'] = Value::$xmlrpcDateTime; $this->_xh['value'] = $this->_xh['ac']; @@ -290,7 +290,7 @@ class XMLParser } else { // log if receiving something strange, even though we set the value to false anyway if ($this->_xh['ac'] != '0' && strcasecmp($this->_xh['ac'], 'false') != 0) { - error_log('XML-RPC: invalid value received in BOOLEAN: ' . $this->_xh['ac']); + error_log('XML-RPC: ' . __METHOD__ . ': invalid value received in BOOLEAN: ' . $this->_xh['ac']); } $this->_xh['value'] = false; } @@ -300,7 +300,7 @@ class XMLParser // NOTE: regexp could be much stricter than this... if (!preg_match('/^[+-eE0123456789 \t.]+$/', $this->_xh['ac'])) { /// @todo: find a better way of throwing an error than this! - error_log('XML-RPC: non numeric value received in DOUBLE: ' . $this->_xh['ac']); + error_log('XML-RPC: ' . __METHOD__ . ': non numeric value received in DOUBLE: ' . 
$this->_xh['ac']); $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND'; } else { // it's ok, add it on @@ -311,7 +311,7 @@ class XMLParser // we must check that only 0123456789- are characters here if (!preg_match('/^[+-]?[0123456789 \t]+$/', $this->_xh['ac'])) { /// @todo find a better way of throwing an error than this! - error_log('XML-RPC: non numeric value received in INT: ' . $this->_xh['ac']); + error_log('XML-RPC: ' . __METHOD__ . ': non numeric value received in INT: ' . $this->_xh['ac']); $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND'; } else { // it's ok, add it on @@ -332,7 +332,7 @@ class XMLParser $vscount = count($this->_xh['valuestack']); $this->_xh['valuestack'][$vscount - 1]['values'][$this->_xh['valuestack'][$vscount - 1]['name']] = $this->_xh['value']; } else { - error_log('XML-RPC: missing VALUE inside STRUCT in received xml'); + error_log('XML-RPC: ' . __METHOD__ . ': missing VALUE inside STRUCT in received xml'); } break; case 'DATA': @@ -342,11 +342,11 @@ class XMLParser case 'STRUCT': case 'ARRAY': // fetch out of stack array of values, and promote it to current value - $curr_val = array_pop($this->_xh['valuestack']); - $this->_xh['value'] = $curr_val['values']; + $currVal = array_pop($this->_xh['valuestack']); + $this->_xh['value'] = $currVal['values']; $this->_xh['vt'] = strtolower($name); - if (isset($curr_val['php_class'])) { - $this->_xh['php_class'] = $curr_val['php_class']; + if (isset($currVal['php_class'])) { + $this->_xh['php_class'] = $currVal['php_class']; } break; case 'PARAM': @@ -356,7 +356,7 @@ class XMLParser $this->_xh['params'][] = $this->_xh['value']; $this->_xh['pt'][] = $this->_xh['vt']; } else { - error_log('XML-RPC: missing VALUE inside PARAM in received xml'); + error_log('XML-RPC: ' . __METHOD__ . ': missing VALUE inside PARAM in received xml'); } break; case 'METHODNAME': @@ -439,4 +439,125 @@ class XMLParser return true; } + + /** + * xml charset encoding guessing helper function. 
+ * Tries to determine the charset encoding of an XML chunk received over HTTP. + * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a charset parameter, + * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of non conforming (legacy?) clients/servers, + * which will be most probably using UTF-8 anyway... + * In order of importance checks: + * 1. http headers + * 2. BOM + * 3. XML declaration + * 4. guesses using mb_detect_encoding() + * + * @param string $httpHeader the http Content-type header + * @param string $xmlChunk xml content buffer + * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled) + * @return string + * + * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!! + */ + public static function guessEncoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null) + { + // discussion: see http://www.yale.edu/pclt/encoding/ + // 1 - test if encoding is specified in HTTP HEADERS + + //Details: + // LWS: (\13\10)?( |\t)+ + // token: (any char but excluded stuff)+ + // quoted string: " (any char but double quotes and control chars)* " + // header: Content-type = ...; charset=value(; ...)* + // where value is of type token, no LWS allowed between 'charset' and value + // Note: we do not check for invalid chars in VALUE: + // this had better be done using pure ereg as below + // Note 2: we might be removing whitespace/tabs that ought to be left in if + // the received charset is a quoted string. But nobody uses such charset names... + + /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it?
+ $matches = array(); + if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) { + return strtoupper(trim($matches[1], " \t\"")); + } + + // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern + // (source: http://www.w3.org/TR/2000/REC-xml-20001006) + // NOTE: actually, according to the spec, even if we find the BOM and determine + // an encoding, we should check if there is an encoding specified + // in the xml declaration, and verify if they match. + /// @todo implement check as described above? + /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM) + if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) { + return 'UCS-4'; + } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) { + return 'UTF-16'; + } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) { + return 'UTF-8'; + } + + // 3 - test if encoding is specified in the xml declaration + // Details: + // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+ + // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]* + if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" . + '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/", + $xmlChunk, $matches)) { + return strtoupper(substr($matches[2], 1, -1)); + } + + // 4 - if mbstring is available, let it do the guesswork + // NB: we favour finding an encoding that is compatible with what we can process + if (extension_loaded('mbstring')) { + if ($encodingPrefs) { + $enc = mb_detect_encoding($xmlChunk, $encodingPrefs); + } else { + $enc = mb_detect_encoding($xmlChunk); + } + // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII... + // IANA also likes better US-ASCII, so go with it + if ($enc == 'ASCII') { + $enc = 'US-' . 
$enc; + } + + return $enc; + } else { + // no encoding specified: as per HTTP1.1 assume it is iso-8859-1? + // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types + // this should be the standard. And we should be getting text/xml as request and response. + // BUT we have to be backward compatible with the lib, which always used UTF-8 as default... + return PhpXmlRpc::$xmlrpc_defencoding; + } + } + + /** + * Helper function: checks if an xml chunk has a charset declaration (BOM or in the xml declaration) + * + * @param string $xmlChunk + * @return bool + */ + public static function hasEncoding($xmlChunk) + { + // scan the first bytes of the data for a UTF-16 (or other) BOM pattern + // (source: http://www.w3.org/TR/2000/REC-xml-20001006) + if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) { + return true; + } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) { + return true; + } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) { + return true; + } + + // test if encoding is specified in the xml declaration + // Details: + // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+ + // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]* + if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" . + '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/", + $xmlChunk, $matches)) { + return true; + } + + return false; + } }