X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=src%2FHelper%2FXMLParser.php;h=2d3296d4ee6eae2dad2f8617cda1d743ba0dd749;hb=25b0d7c6c540615e854f941dc457f556bfca2408;hp=b62bf4873f1aec55914f2fa2b0d148a86ce6719e;hpb=b24b6acde8360a1a0f8ac23b64c19a8dc61cc3ba;p=plcapi.git diff --git a/src/Helper/XMLParser.php b/src/Helper/XMLParser.php index b62bf48..2d3296d 100644 --- a/src/Helper/XMLParser.php +++ b/src/Helper/XMLParser.php @@ -7,30 +7,40 @@ use PhpXmlRpc\Value; /** * Deals with parsing the XML. + * @see http://xmlrpc.com/spec.md */ class XMLParser { - // used to store state during parsing - // quick explanation of components: + const RETURN_XMLRPCVALS = 'xmlrpcvals'; + const RETURN_EPIVALS = 'epivals'; + const RETURN_PHP = 'phpvals'; + + const ACCEPT_REQUEST = 1; + const ACCEPT_RESPONSE = 2; + const ACCEPT_VALUE = 4; + const ACCEPT_FAULT = 8; + + // Used to store state during parsing. + // Quick explanation of components: + // private: // ac - used to accumulate values - // stack - array with genealogy of xml elements names: - // used to validate nesting of xmlrpc elements + // stack - array with genealogy of xml elements names used to validate nesting of xmlrpc elements // valuestack - array used for parsing arrays and structs - // lv - used to indicate "looking for a value": implements - // the logic to allow values with no types to be strings - // isf - used to indicate a parsing fault (2) or xmlrpc response fault (1) + // lv - used to indicate "looking for a value": implements the logic to allow values with no types to be strings + // public: + // isf - used to indicate an xml parsing fault (3), invalid xmlrpc fault (2) or xmlrpc response fault (1) // isf_reason - used for storing xmlrpc response fault string // method - used to store method name // params - used to store parameters in method calls // pt - used to store the type of each received parameter. Useful if parameters are automatically decoded to php values - // rt - 'methodcall or 'methodresponse' + // rt - 'methodcall', 'methodresponse', 'value' or 'fault' (the last one used only in EPI emulation mode) public $_xh = array( 'ac' => '', 'stack' => array(), 'valuestack' => array(), 'isf' => 0, 'isf_reason' => '', - 'method' => false, // so we can check later if we got a methodname or not + 'method' => false, 'params' => array(), 'pt' => array(), 'rt' => '', @@ -40,6 +50,8 @@ class XMLParser 'VALUE' => array('MEMBER', 'DATA', 'PARAM', 'FAULT'), 'BOOLEAN' => array('VALUE'), 'I4' => array('VALUE'), + 'I8' => array('VALUE'), + 'EX:I8' => array('VALUE'), 'INT' => array('VALUE'), 'STRING' => array('VALUE'), 'DOUBLE' => array('VALUE'), @@ -58,27 +70,128 @@ class XMLParser 'EX:NIL' => array('VALUE'), // only used when extension activated ); + /** @var array $parsing_options */ + protected $parsing_options = array(); + /** @var int $accept self::ACCEPT_REQUEST | self::ACCEPT_RESPONSE by default */ + protected $accept = 3; + /** @var int $maxChunkLength 4 MB by default. Any value below 10MB should be good */ + protected $maxChunkLength = 4194304; + + /** + * @param array $options passed to the xml parser + */ + public function __construct(array $options = array()) + { + $this->parsing_options = $options; + } + + /** + * @param string $data + * @param string $returnType + * @param int $accept a bit-combination of self::ACCEPT_REQUEST, self::ACCEPT_RESPONSE, self::ACCEPT_VALUE + * @return string + */ + public function parse($data, $returnType = self::RETURN_XMLRPCVALS, $accept = 3) + { + $this->_xh = array( + 'ac' => '', + 'stack' => array(), + 'valuestack' => array(), + 'isf' => 0, + 'isf_reason' => '', + 'method' => false, // so we can check later if we got a methodname or not + 'params' => array(), + 'pt' => array(), + 'rt' => '', + ); + + $len = strlen($data); + + // we test for empty documents here to save on resource allocation and simply the chunked-parsing loop below + if ($len == 0) { + $this->_xh['isf'] = 3; + $this->_xh['isf_reason'] = 'XML error 5: empty document'; + return; + } + + $parser = xml_parser_create(); + + foreach ($this->parsing_options as $key => $val) { + xml_parser_set_option($parser, $key, $val); + } + // always set this, in case someone tries to disable it via options... + xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 1); + + xml_set_object($parser, $this); + + switch($returnType) { + case self::RETURN_PHP: + xml_set_element_handler($parser, 'xmlrpc_se', 'xmlrpc_ee_fast'); + break; + case self::RETURN_EPIVALS: + xml_set_element_handler($parser, 'xmlrpc_se', 'xmlrpc_ee_epi'); + break; + default: + xml_set_element_handler($parser, 'xmlrpc_se', 'xmlrpc_ee'); + } + + xml_set_character_data_handler($parser, 'xmlrpc_cd'); + xml_set_default_handler($parser, 'xmlrpc_dh'); + + $this->accept = $accept; + + // @see ticket #70 - we have to parse big xml docks in chunks to avoid errors + for ($offset = 0; $offset < $len; $offset += $this->maxChunkLength) { + $chunk = substr($data, $offset, $this->maxChunkLength); + // error handling: xml not well formed + if (!xml_parse($parser, $chunk, $offset + $this->maxChunkLength >= $len)) { + $errCode = xml_get_error_code($parser); + $errStr = sprintf('XML error %s: %s at line %d, column %d', $errCode, xml_error_string($errCode), + xml_get_current_line_number($parser), xml_get_current_column_number($parser)); + + $this->_xh['isf'] = 3; + $this->_xh['isf_reason'] = $errStr; + break; + } + } + + xml_parser_free($parser); + } + /** * xml parser handler function for opening element tags. + * @internal + * @param resource $parser + * @param string $name + * @param $attrs + * @param bool $acceptSingleVals DEPRECATED use the $accept parameter instead */ public function xmlrpc_se($parser, $name, $attrs, $acceptSingleVals = false) { // if invalid xmlrpc already detected, skip all processing if ($this->_xh['isf'] < 2) { + // check for correct element nesting - // top level element can only be of 2 types - /// @todo optimization creep: save this check into a bool variable, instead of using count() every time: - /// there is only a single top level element in xml anyway if (count($this->_xh['stack']) == 0) { - if ($name != 'METHODRESPONSE' && $name != 'METHODCALL' && ( - $name != 'VALUE' && !$acceptSingleVals) - ) { + // top level element can only be of 2 types + /// @todo optimization creep: save this check into a bool variable, instead of using count() every time: + /// there is only a single top level element in xml anyway + // BC + if ($acceptSingleVals === false) { + $accept = $this->accept; + } else { + $accept = self::ACCEPT_REQUEST | self::ACCEPT_RESPONSE | self::ACCEPT_VALUE; + } + if (($name == 'METHODCALL' && ($accept & self::ACCEPT_REQUEST)) || + ($name == 'METHODRESPONSE' && ($accept & self::ACCEPT_RESPONSE)) || + ($name == 'VALUE' && ($accept & self::ACCEPT_VALUE)) || + ($name == 'FAULT' && ($accept & self::ACCEPT_FAULT))) { + $this->_xh['rt'] = strtolower($name); + } else { $this->_xh['isf'] = 2; - $this->_xh['isf_reason'] = 'missing top level xmlrpc element'; + $this->_xh['isf_reason'] = 'missing top level xmlrpc element. Found: ' . $name; return; - } else { - $this->_xh['rt'] = strtolower($name); } } else { // not top level element: see if parent is OK @@ -100,6 +213,16 @@ class XMLParser $this->_xh['lv'] = 1; $this->_xh['php_class'] = null; break; + case 'I8': + case 'EX:I8': + if (PHP_INT_SIZE === 4) { + // INVALID ELEMENT: RAISE ISF so that it is later recognized!!! + $this->_xh['isf'] = 2; + $this->_xh['isf_reason'] = "Received i8 element but php is compiled in 32 bit mode"; + + return; + } + // fall through voluntarily case 'I4': case 'INT': case 'STRING': @@ -108,7 +231,7 @@ class XMLParser case 'DATETIME.ISO8601': case 'BASE64': if ($this->_xh['vt'] != 'value') { - //two data elements inside a value: an error occurred! + // two data elements inside a value: an error occurred! $this->_xh['isf'] = 2; $this->_xh['isf_reason'] = "$name element following a {$this->_xh['vt']} element inside a single value"; @@ -119,7 +242,7 @@ class XMLParser case 'STRUCT': case 'ARRAY': if ($this->_xh['vt'] != 'value') { - //two data elements inside a value: an error occurred! + // two data elements inside a value: an error occurred! $this->_xh['isf'] = 2; $this->_xh['isf_reason'] = "$name element following a {$this->_xh['vt']} element inside a single value"; @@ -139,7 +262,7 @@ class XMLParser break; case 'DATA': if ($this->_xh['vt'] != 'data') { - //two data elements inside a value: an error occurred! + // two data elements inside a value: an error occurred! $this->_xh['isf'] = 2; $this->_xh['isf_reason'] = "found two data elements inside an array element"; @@ -159,7 +282,8 @@ class XMLParser $this->_xh['isf'] = 1; break; case 'MEMBER': - $this->_xh['valuestack'][count($this->_xh['valuestack']) - 1]['name'] = ''; // set member name to null, in case we do not find in the xml later on + // set member name to null, in case we do not find in the xml later on + $this->_xh['valuestack'][count($this->_xh['valuestack']) - 1]['name'] = ''; //$this->_xh['ac']=''; // Drop trough intentionally case 'PARAM': @@ -170,7 +294,7 @@ class XMLParser case 'EX:NIL': if (PhpXmlRpc::$xmlrpc_null_extension) { if ($this->_xh['vt'] != 'value') { - //two data elements inside a value: an error occurred! + // two data elements inside a value: an error occurred! $this->_xh['isf'] = 2; $this->_xh['isf_reason'] = "$name element following a {$this->_xh['vt']} element inside a single value"; @@ -182,7 +306,7 @@ class XMLParser // we do not support the extension, so // drop through intentionally default: - /// INVALID ELEMENT: RAISE ISF so that it is later recognized!!! + // INVALID ELEMENT: RAISE ISF so that it is later recognized!!! $this->_xh['isf'] = 2; $this->_xh['isf_reason'] = "found not-xmlrpc xml element $name"; break; @@ -199,7 +323,12 @@ class XMLParser } /** - * Used in decoding xml chunks that might represent single xmlrpc values. + * xml parser handler function for opening element tags. + * Used in decoding xml chunks that might represent single xmlrpc values as well as requests, responses. + * @deprecated + * @param resource $parser + * @param $name + * @param $attrs */ public function xmlrpc_se_any($parser, $name, $attrs) { @@ -208,8 +337,12 @@ class XMLParser /** * xml parser handler function for close element tags. + * @internal + * @param resource $parser + * @param string $name + * @param int $rebuildXmlrpcvals >1 for rebuilding xmlrpcvals, 0 for rebuilding php values, -1 for xmlrpc-extension compatibility */ - public function xmlrpc_ee($parser, $name, $rebuildXmlrpcvals = true) + public function xmlrpc_ee($parser, $name, $rebuildXmlrpcvals = 1) { if ($this->_xh['isf'] < 2) { // push this element name from stack @@ -226,7 +359,7 @@ class XMLParser $this->_xh['vt'] = Value::$xmlrpcString; } - if ($rebuildXmlrpcvals) { + if ($rebuildXmlrpcvals > 0) { // build the xmlrpc val out of the data received, and substitute it $temp = new Value($this->_xh['value'], $this->_xh['vt']); // in case we got info about underlying php class, save it @@ -234,31 +367,39 @@ class XMLParser if (isset($this->_xh['php_class'])) { $temp->_php_class = $this->_xh['php_class']; } - // check if we are inside an array or struct: - // if value just built is inside an array, let's move it into array on the stack - $vscount = count($this->_xh['valuestack']); - if ($vscount && $this->_xh['valuestack'][$vscount - 1]['type'] == 'ARRAY') { - $this->_xh['valuestack'][$vscount - 1]['values'][] = $temp; - } else { - $this->_xh['value'] = $temp; + $this->_xh['value'] = $temp; + } elseif ($rebuildXmlrpcvals < 0) { + if ($this->_xh['vt'] == Value::$xmlrpcDateTime) { + $this->_xh['value'] = (object)array( + 'xmlrpc_type' => 'datetime', + 'scalar' => $this->_xh['value'], + 'timestamp' => \PhpXmlRpc\Helper\Date::iso8601Decode($this->_xh['value']) + ); + } elseif ($this->_xh['vt'] == Value::$xmlrpcBase64) { + $this->_xh['value'] = (object)array( + 'xmlrpc_type' => 'base64', + 'scalar' => $this->_xh['value'] + ); } } else { - /// @todo this needs to treat correctly php-serialized objects, + /// @todo this should handle php-serialized objects, /// since std deserializing is done by php_xmlrpc_decode, /// which we will not be calling... - if (isset($this->_xh['php_class'])) { - } + //if (isset($this->_xh['php_class'])) { + //} + } - // check if we are inside an array or struct: - // if value just built is inside an array, let's move it into array on the stack - $vscount = count($this->_xh['valuestack']); - if ($vscount && $this->_xh['valuestack'][$vscount - 1]['type'] == 'ARRAY') { - $this->_xh['valuestack'][$vscount - 1]['values'][] = $this->_xh['value']; - } + // check if we are inside an array or struct: + // if value just built is inside an array, let's move it into array on the stack + $vscount = count($this->_xh['valuestack']); + if ($vscount && $this->_xh['valuestack'][$vscount - 1]['type'] == 'ARRAY') { + $this->_xh['valuestack'][$vscount - 1]['values'][] = $this->_xh['value']; } break; case 'BOOLEAN': case 'I4': + case 'I8': + case 'EX:I8': case 'INT': case 'STRING': case 'DOUBLE': @@ -271,7 +412,7 @@ class XMLParser $this->_xh['value'] = $this->_xh['ac']; } elseif ($name == 'DATETIME.ISO8601') { if (!preg_match('/^[0-9]{8}T[0-9]{2}:[0-9]{2}:[0-9]{2}$/', $this->_xh['ac'])) { - error_log('XML-RPC: ' . __METHOD__ . ': invalid value received in DATETIME: ' . $this->_xh['ac']); + Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': invalid value received in DATETIME: ' . $this->_xh['ac']); } $this->_xh['vt'] = Value::$xmlrpcDateTime; $this->_xh['value'] = $this->_xh['ac']; @@ -290,7 +431,7 @@ class XMLParser } else { // log if receiving something strange, even though we set the value to false anyway if ($this->_xh['ac'] != '0' && strcasecmp($this->_xh['ac'], 'false') != 0) { - error_log('XML-RPC: ' . __METHOD__ . ': invalid value received in BOOLEAN: ' . $this->_xh['ac']); + Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': invalid value received in BOOLEAN: ' . $this->_xh['ac']); } $this->_xh['value'] = false; } @@ -300,43 +441,40 @@ class XMLParser // NOTE: regexp could be much stricter than this... if (!preg_match('/^[+-eE0123456789 \t.]+$/', $this->_xh['ac'])) { /// @todo: find a better way of throwing an error than this! - error_log('XML-RPC: ' . __METHOD__ . ': non numeric value received in DOUBLE: ' . $this->_xh['ac']); + Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': non numeric value received in DOUBLE: ' . $this->_xh['ac']); $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND'; } else { // it's ok, add it on $this->_xh['value'] = (double)$this->_xh['ac']; } } else { - // we have an I4/INT + // we have an I4/I8/INT // we must check that only 0123456789- are characters here if (!preg_match('/^[+-]?[0123456789 \t]+$/', $this->_xh['ac'])) { /// @todo find a better way of throwing an error than this! - error_log('XML-RPC: ' . __METHOD__ . ': non numeric value received in INT: ' . $this->_xh['ac']); + Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': non numeric value received in INT: ' . $this->_xh['ac']); $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND'; } else { // it's ok, add it on $this->_xh['value'] = (int)$this->_xh['ac']; } } - //$this->_xh['ac']=''; // is this necessary? $this->_xh['lv'] = 3; // indicate we've found a value break; case 'NAME': $this->_xh['valuestack'][count($this->_xh['valuestack']) - 1]['name'] = $this->_xh['ac']; break; case 'MEMBER': - //$this->_xh['ac']=''; // is this necessary? // add to array in the stack the last element built, // unless no VALUE was found if ($this->_xh['vt']) { $vscount = count($this->_xh['valuestack']); $this->_xh['valuestack'][$vscount - 1]['values'][$this->_xh['valuestack'][$vscount - 1]['name']] = $this->_xh['value']; } else { - error_log('XML-RPC: ' . __METHOD__ . ': missing VALUE inside STRUCT in received xml'); + Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': missing VALUE inside STRUCT in received xml'); } break; case 'DATA': - //$this->_xh['ac']=''; // is this necessary? $this->_xh['vt'] = null; // reset this to check for 2 data elements in a row - even if they're empty break; case 'STRUCT': @@ -356,7 +494,7 @@ class XMLParser $this->_xh['params'][] = $this->_xh['value']; $this->_xh['pt'][] = $this->_xh['vt']; } else { - error_log('XML-RPC: ' . __METHOD__ . ': missing VALUE inside PARAM in received xml'); + Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': missing VALUE inside PARAM in received xml'); } break; case 'METHODNAME': @@ -386,14 +524,31 @@ class XMLParser /** * Used in decoding xmlrpc requests/responses without rebuilding xmlrpc Values. + * @internal + * @param resource $parser + * @param string $name */ public function xmlrpc_ee_fast($parser, $name) { - $this->xmlrpc_ee($parser, $name, false); + $this->xmlrpc_ee($parser, $name, 0); + } + + /** + * Used in decoding xmlrpc requests/responses while building xmlrpc-extension Values (plain php for all but base64 and datetime). + * @internal + * @param resource $parser + * @param string $name + */ + public function xmlrpc_ee_epi($parser, $name) + { + $this->xmlrpc_ee($parser, $name, -1); } /** * xml parser handler function for character data. + * @internal + * @param resource $parser + * @param string $data */ public function xmlrpc_cd($parser, $data) { @@ -402,18 +557,6 @@ class XMLParser // "lookforvalue==3" means that we've found an entire value // and should discard any further character data if ($this->_xh['lv'] != 3) { - // G. Giunta 2006-08-23: useless change of 'lv' from 1 to 2 - //if($this->_xh['lv']==1) - //{ - // if we've found text and we're just in a then - // say we've found a value - //$this->_xh['lv']=2; - //} - // we always initialize the accumulator before starting parsing, anyway... - //if(!@isset($this->_xh['ac'])) - //{ - // $this->_xh['ac'] = ''; - //} $this->_xh['ac'] .= $data; } } @@ -422,22 +565,20 @@ class XMLParser /** * xml parser handler function for 'other stuff', ie. not char data or * element start/end tag. In fact it only gets called on unknown entities... + * @internal + * @param $parser + * @param string data */ public function xmlrpc_dh($parser, $data) { // skip processing if xml fault already detected if ($this->_xh['isf'] < 2) { if (substr($data, 0, 1) == '&' && substr($data, -1, 1) == ';') { - // G. Giunta 2006-08-25: useless change of 'lv' from 1 to 2 - //if($this->_xh['lv']==1) - //{ - // $this->_xh['lv']=2; - //} $this->_xh['ac'] .= $data; } } - return true; + //return true; } /** @@ -454,8 +595,10 @@ class XMLParser * * @param string $httpHeader the http Content-type header * @param string $xmlChunk xml content buffer - * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled) - * @return string + * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled). + * This can also be set globally using PhpXmlRpc::$xmlrpc_detectencodings + * @return string the encoding determined. Null if it can't be determined and mbstring is enabled, + * PhpXmlRpc::$xmlrpc_defencoding if it can't be determined and mbstring is not enabled * * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!! */ @@ -464,10 +607,10 @@ class XMLParser // discussion: see http://www.yale.edu/pclt/encoding/ // 1 - test if encoding is specified in HTTP HEADERS - //Details: + // Details: // LWS: (\13\10)?( |\t)+ // token: (any char but excluded stuff)+ - // quoted string: " (any char but double quotes and cointrol chars)* " + // quoted string: " (any char but double quotes and control chars)* " // header: Content-type = ...; charset=value(; ...)* // where value is of type token, no LWS allowed between 'charset' and value // Note: we do not check for invalid chars in VALUE: @@ -507,8 +650,10 @@ class XMLParser } // 4 - if mbstring is available, let it do the guesswork - // NB: we favour finding an encoding that is compatible with what we can process if (extension_loaded('mbstring')) { + if ($encodingPrefs == null && PhpXmlRpc::$xmlrpc_detectencodings != null) { + $encodingPrefs = PhpXmlRpc::$xmlrpc_detectencodings; + } if ($encodingPrefs) { $enc = mb_detect_encoding($xmlChunk, $encodingPrefs); } else {