X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=src%2FHelper%2FCharset.php;h=a72016e4b9dbe4a93882b35bc226b83b2f037255;hb=5d63c263e763338952815b6724ffa4469061afc4;hp=8fdecb1dc3461825686ef04c0d5630220c3c079e;hpb=dbcb9c168e60ad868fba442452bbddafee7cf684;p=plcapi.git diff --git a/src/Helper/Charset.php b/src/Helper/Charset.php index 8fdecb1..a72016e 100644 --- a/src/Helper/Charset.php +++ b/src/Helper/Charset.php @@ -103,16 +103,21 @@ class Charset /** * Convert a string to the correct XML representation in a target charset. + * This involves: + * - character transformation for all characters which have a different representation in source and dest charsets + * - using 'charset entity' representation for all characters which are outside of the target charset * * To help correct communication of non-ascii chars inside strings, regardless of the charset used when sending * requests, parsing them, sending responses and parsing responses, an option is to convert all non-ascii chars * present in the message into their equivalent 'charset entity'. Charset entities enumerated this way are * independent of the charset encoding used to transmit them, and all XML parsers are bound to understand them. - * Note that in the std case we are not sending a charset encoding mime type along with http headers, so we are - * bound by RFC 3023 to emit strict us-ascii. + * + * Note that when not sending a charset encoding mime type along with http headers, we are bound by RFC 3023 to emit + * strict us-ascii for 'text/xml' payloads (but we should review RFC 7303, which seems to have changed the rules...) * * @todo do a bit of basic benchmarking (strtr vs. str_replace) - * @todo make usage of iconv() or recode_string() or mb_string() where available + * @todo make usage of iconv() or mb_string() where available + * @todo support aliases for charset names, eg ASCII, LATIN1, ISO-88591 (see f.e. polyfill-iconv for a list) * * @param string $data * @param string $srcEncoding @@ -163,7 +168,14 @@ class Charset $ch = $data[$nn]; $ii = ord($ch); // 7 bits: 0bbbbbbb (127) - if ($ii < 128) { + if ($ii < 32) { + if ($conversion == 'UTF-8_' || $conversion == 'UTF-8_US-ASCII') { + $escapedData .= sprintf('&#%d;', $ii); + } else { + $escapedData .= $ch; + } + } + else if ($ii < 128) { /// @todo shall we replace this with a (supposedly) faster str_replace? switch ($ii) { case 34: @@ -187,35 +199,26 @@ class Charset } // 11 bits: 110bbbbb 10bbbbbb (2047) elseif ($ii >> 5 == 6) { $b1 = ($ii & 31); - $ii = ord($data[$nn + 1]); - $b2 = ($ii & 63); + $b2 = (ord($data[$nn + 1]) & 63); $ii = ($b1 * 64) + $b2; - $ent = sprintf('&#%d;', $ii); - $escapedData .= $ent; + $escapedData .= sprintf('&#%d;', $ii); $nn += 1; } // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb elseif ($ii >> 4 == 14) { $b1 = ($ii & 15); - $ii = ord($data[$nn + 1]); - $b2 = ($ii & 63); - $ii = ord($data[$nn + 2]); - $b3 = ($ii & 63); + $b2 = (ord($data[$nn + 1]) & 63); + $b3 = (ord($data[$nn + 2]) & 63); $ii = ((($b1 * 64) + $b2) * 64) + $b3; - $ent = sprintf('&#%d;', $ii); - $escapedData .= $ent; + $escapedData .= sprintf('&#%d;', $ii); $nn += 2; } // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb elseif ($ii >> 3 == 30) { $b1 = ($ii & 7); - $ii = ord($data[$nn + 1]); - $b2 = ($ii & 63); - $ii = ord($data[$nn + 2]); - $b3 = ($ii & 63); - $ii = ord($data[$nn + 3]); - $b4 = ($ii & 63); + $b2 = (ord($data[$nn + 1]) & 63); + $b3 = (ord($data[$nn + 2]) & 63); + $b4 = (ord($data[$nn + 3]) & 63); $ii = ((((($b1 * 64) + $b2) * 64) + $b3) * 64) + $b4; - $ent = sprintf('&#%d;', $ii); - $escapedData .= $ent; + $escapedData .= sprintf('&#%d;', $ii); $nn += 3; } }