-
- /**
-  * xml charset encoding guessing helper function.
-  * Tries to determine the charset encoding of an XML chunk received over HTTP.
-  *
-  * The following checks are tried in order, and the first one that gives a result wins:
-  *  1. a charset parameter found in the given HTTP header;
-  *  2. a byte order mark at the very beginning of the xml chunk (UCS-4, UTF-16 or UTF-8);
-  *  3. the encoding pseudo-attribute of the xml declaration;
-  *  4. mb_detect_encoding(), when the mbstring extension is loaded;
-  *  5. the library default, PhpXmlRpc::$xmlrpc_defencoding.
-  *
-  * NB: according to the spec (RFC 3023), if text/xml content is received over HTTP without a charset parameter,
-  * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of non-conforming (legacy?)
-  * clients/servers, which will most probably be using UTF-8 anyway...
-  *
-  * @param string $httpheader the http Content-type header
-  * @param string $xmlchunk xml content buffer
-  * @param string|null $encoding_prefs comma separated list of character encodings, handed to mb_detect_encoding()
-  *                                    as the list of candidates (only used when the mbstring extension is loaded
-  *                                    and the value is not empty)
-  * @return string the encoding name. It is uppercased when taken from the http header or from the xml declaration.
-  *                When nothing can be detected, PhpXmlRpc::$xmlrpc_defencoding is returned
-  *
-  * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!!
-  */
- static function guess_encoding($httpheader='', $xmlchunk='', $encoding_prefs=null)
- {
-     // discussion: see http://www.yale.edu/pclt/encoding/
-
-     // 1 - test if encoding is specified in HTTP HEADERS
-
-     // Details:
-     // LWS: (\13\10)?( |\t)+
-     // token: (any char but excluded stuff)+
-     // quoted string: " (any char but double quotes and control chars)* "
-     // header: Content-type = ...; charset=value(; ...)*
-     // where value is of type token, no LWS allowed between 'charset' and value
-     // Note: we do not check for invalid chars in VALUE:
-     // this had better be done using a stricter regexp, as done below for the xml declaration
-     // Note 2: we might be removing whitespace/tabs that ought to be left in if
-     // the received charset is a quoted string. But nobody uses such charset names...
-
-     /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it?
-     $matches = array();
-     if(preg_match('/;\s*charset\s*=([^;]+)/i', $httpheader, $matches))
-     {
-         // strip surrounding blanks and double quotes, as well as the CR/LF which terminates
-         // the header line, in case it was left in by the caller
-         $charset = trim($matches[1], " \t\r\n\"");
-         // an empty value (eg. charset="") declares nothing: carry on with the other checks
-         if($charset !== '')
-         {
-             return strtoupper($charset);
-         }
-     }
-
-     // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern
-     // (source: http://www.w3.org/TR/2000/REC-xml-20001006)
-     // NOTE: actually, according to the spec, even if we find the BOM and determine
-     // an encoding, we should check if there is an encoding specified
-     // in the xml declaration, and verify if they match.
-     /// @todo implement check as described above?
-     /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM)
-     // NB: the 4-byte patterns have to be tested first, since two of them start with a 2-byte UTF-16 BOM
-     if(preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlchunk))
-     {
-         return 'UCS-4';
-     }
-     elseif(preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlchunk))
-     {
-         return 'UTF-16';
-     }
-     elseif(preg_match('/^(\xEF\xBB\xBF)/', $xmlchunk))
-     {
-         return 'UTF-8';
-     }
-
-     // 3 - test if encoding is specified in the xml declaration
-     // Details:
-     // SPACE: (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
-     // EQ: SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
-     if (preg_match('/^<\?xml\s+version\s*=\s*'. "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))".
-         '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
-         $xmlchunk, $matches))
-     {
-         // $matches[2] is the encoding name including its quotes: drop the first and last char
-         return strtoupper(substr($matches[2], 1, -1));
-     }
-
-     // 4 - if mbstring is available, let it do the guesswork
-     // NB: we favour finding an encoding that is compatible with what we can process
-     if(extension_loaded('mbstring'))
-     {
-         if($encoding_prefs)
-         {
-             $enc = mb_detect_encoding($xmlchunk, $encoding_prefs);
-         }
-         else
-         {
-             $enc = mb_detect_encoding($xmlchunk);
-         }
-         // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII...
-         // IANA also likes better US-ASCII, so go with it
-         if($enc === 'ASCII')
-         {
-             return 'US-ASCII';
-         }
-         // mb_detect_encoding() returns false when no candidate encoding fits the data:
-         // in that case do not hand a boolean to the caller, use the default below instead
-         if(is_string($enc) && $enc !== '')
-         {
-             return $enc;
-         }
-     }
-
-     // 5 - no encoding specified: as per HTTP1.1 assume it is iso-8859-1?
-     // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types
-     // this should be the standard. And we should be getting text/xml as request and response.
-     // BUT we have to be backward compatible with the lib, which always used UTF-8 as default...
-     return PhpXmlRpc::$xmlrpc_defencoding;
- }
-
-}
\ No newline at end of file