From d8e180b0ca87bf675d5fda2099d49c9d0043cd42 Mon Sep 17 00:00:00 2001 From: gggeek Date: Sun, 27 Mar 2016 00:17:45 +0000 Subject: [PATCH] Fix for issue #33: excessive usage of numeric charset entities when converting utf8 to latin-1 --- NEWS | 5 ++- src/Client.php | 2 +- src/Helper/Charset.php | 22 +++++++--- tests/0CharsetTest.php | 92 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 7 deletions(-) create mode 100644 tests/0CharsetTest.php diff --git a/NEWS b/NEWS index 4d11f6e..93b4402 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,10 @@ -XML-RPC for PHP version 4.0.1 - 2016/??/?? +XML-RPC for PHP version 4.0.1 - 2016/3/27 * improved: all of the API documentation has been moved out of the manual and into the source code phpdoc comments +* fixed: when the internal character set is set to UTF-8 and the client sends requests (or the server responses), too + many characters were encoded as numeric entities, whereas some, like åäö, did not need to be + * fixed: the 'valtyp' property of Response was not present in all cases; the ValType property had been added by error and has been removed diff --git a/src/Client.php b/src/Client.php index af17679..d31e9f5 100644 --- a/src/Client.php +++ b/src/Client.php @@ -83,7 +83,7 @@ class Client /** * The charset encoding that will be used for serializing request sent by the client. - * If defaults to NULL, which means using US-ASCII and encoding all characters outside of the ASCII range using + * It defaults to NULL, which means using US-ASCII and encoding all characters outside of the ASCII range using * their xml character entity representation (this has the benefit that line end characters will not be mangled in * the transfer, a CR-LF will be preserved as well as a singe LF).
* Valid values are 'US-ASCII', 'UTF-8' and 'ISO-8859-1' diff --git a/src/Helper/Charset.php b/src/Helper/Charset.php index 0ec7de4..4f1103b 100644 --- a/src/Helper/Charset.php +++ b/src/Helper/Charset.php @@ -8,6 +8,7 @@ class Charset { // tables used for transcoding different charsets into us-ascii xml protected $xml_iso88591_Entities = array("in" => array(), "out" => array()); + protected $xml_iso88591_utf8 = array("in" => array(), "out" => array()); /// @todo add to iso table the characters from cp_1252 range, i.e. 128 to 159? /// These will NOT be present in true ISO-8859-1, but will save the unwary @@ -93,16 +94,19 @@ class Charset $srcEncoding = PhpXmlRpc::$xmlrpc_internalencoding; } - switch (strtoupper($srcEncoding . '_' . $destEncoding)) { + $conversion = strtoupper($srcEncoding . '_' . $destEncoding); + switch ($conversion) { case 'ISO-8859-1_': case 'ISO-8859-1_US-ASCII': $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data); $escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData); break; + case 'ISO-8859-1_UTF-8': $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data); $escapedData = utf8_encode($escapedData); break; + case 'ISO-8859-1_ISO-8859-1': case 'US-ASCII_US-ASCII': case 'US-ASCII_UTF-8': @@ -112,6 +116,7 @@ class Charset //case 'CP1252_CP1252': $escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $data); break; + case 'UTF-8_': case 'UTF-8_US-ASCII': case 'UTF-8_ISO-8859-1': @@ -123,7 +128,7 @@ class Charset for ($nn = 0; $nn < $ns; $nn++) { $ch = $data[$nn]; $ii = ord($ch); - //1 7 0bbbbbbb (127) + // 7 bits: 0bbbbbbb (127) if ($ii < 128) { /// @todo shall we replace this with a (supposedly) faster str_replace?
switch ($ii) { @@ -145,7 +150,7 @@ class Charset default: $escapedData .= $ch; } // switch - } //2 11 110bbbbb 10bbbbbb (2047) + } // 11 bits: 110bbbbb 10bbbbbb (2047) elseif ($ii >> 5 == 6) { $b1 = ($ii & 31); $ii = ord($data[$nn + 1]); @@ -154,7 +159,7 @@ class Charset $ent = sprintf('&#%d;', $ii); $escapedData .= $ent; $nn += 1; - } //3 16 1110bbbb 10bbbbbb 10bbbbbb + } // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb elseif ($ii >> 4 == 14) { $b1 = ($ii & 15); $ii = ord($data[$nn + 1]); @@ -165,7 +170,7 @@ class Charset $ent = sprintf('&#%d;', $ii); $escapedData .= $ent; $nn += 2; - } //4 21 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb + } // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb elseif ($ii >> 3 == 30) { $b1 = ($ii & 7); $ii = ord($data[$nn + 1]); @@ -180,7 +185,13 @@ class Charset $nn += 3; } } + + // when converting to latin-1, do not be so eager with using entities for characters 160-255 + if ($conversion == 'UTF-8_ISO-8859-1') { + $escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData); + } break; + /* case 'CP1252_': case 'CP1252_US-ASCII': @@ -200,6 +211,7 @@ class Charset $escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData); break; */ + default: $escapedData = ''; error_log('XML-RPC: ' . __METHOD__ . 
": Converting from $srcEncoding to $destEncoding: not supported..."); diff --git a/tests/0CharsetTest.php b/tests/0CharsetTest.php new file mode 100644 index 0000000..8a62506 --- /dev/null +++ b/tests/0CharsetTest.php @@ -0,0 +1,92 @@ +latinString = "\n\r\t"; + for($i = 32; $i < 127; $i++) { + $this->latinString .= chr($i); + } + for($i = 160; $i < 256; $i++) { + $this->latinString .= chr($i); + } + } + + protected function utfToLatin($data) + { + return Charset::instance()->encodeEntities( + $data, + 'UTF-8', + 'ISO-8859-1' + ); + } + + public function testUtf8ToLatin1All() + { + /*$this->assertEquals( + 'ISO-8859-1', + mb_detect_encoding($this->latinString, 'ISO-8859-1, UTF-8, WINDOWS-1251, ASCII', true), + 'Setup latinString is not ISO-8859-1 encoded...' + );*/ + $string = utf8_encode($this->latinString); + $encoded = $this->utfToLatin($string); + $this->assertEquals(str_replace(array('&', '"', "'", '<', '>'), array('&amp;', '&quot;', '&apos;', '&lt;', '&gt;'), $this->latinString), $encoded); + } + + public function testUtf8ToLatin1EuroSymbol() + { + $string = 'a.b.c.å.ä.ö.€.'; + $encoded = $this->utfToLatin($string); + $this->assertEquals(utf8_decode('a.b.c.å.ä.ö.&#8364;.'), $encoded); + } + + public function testUtf8ToLatin1Runes() + { + $string = $this->runes; + $encoded = $this->utfToLatin($string); + $this->assertEquals('&#5792;&#5831;&#5819;&#5867;&#5842;&#5862;&#5798;&#5867;&#5792;&#5809;&#5801;&#5792;&#5794;&#5809;&#5867;&#5792;&#5825;&#5809;&#5802;&#5867;&#5815;&#5846;&#5819;&#5817;&#5862;&#5850;&#5811;&#5794;&#5847;', $encoded); + } + + public function testUtf8ToLatin1Greek() + { + $string = $this->greek; + $encoded = $this->utfToLatin($string); + $this->assertEquals('&#932;&#8052; &#947;&#955;&#8182;&#963;&#963;&#945; &#956;&#959;&#8166; &#7956;&#948;&#969;&#963;&#945;&#957; &#7953;&#955;&#955;&#951;&#957;&#953;&#954;&#8052;', $encoded); + } + + public function testUtf8ToLatin1Russian() + { + $string = $this->russian; + $encoded = $this->utfToLatin($string); + $this->assertEquals('&#1056;&#1077;&#1082;&#1072; &#1085;&#1077;&#1089;&#1083;&#1072;&#1089;&#1103;; &#1073;&#1077;&#1076;&#1085;&#1099;&#1081; &#1095;&#1105;&#1083;&#1085;', $encoded); + } + + public function testUtf8ToLatin1Chinese() + { + $string = $this->chinese; + $encoded = $this->utfToLatin($string); + $this->assertEquals('&#25105;&#33021;&#21534;&#19979;&#29627;&#29827;&#32780;&#19981;&#20260;&#36523;&#20307;&#12290;', $encoded); + } +} -- 2.43.0