From 5d63c263e763338952815b6724ffa4469061afc4 Mon Sep 17 00:00:00 2001 From: gggeek Date: Tue, 12 Jan 2021 00:20:41 +0000 Subject: [PATCH] improve encoding utf8->ascii for non-printable chars --- NEWS | 6 ++++++ src/Helper/Charset.php | 36 +++++++++++++++++------------------- tests/0CharsetTest.php | 12 +++++++++++- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/NEWS b/NEWS index 3d2f11d..eff5673 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +XML-RPC for PHP version 4.XX - unreleased + +* improved: when encoding utf8 text into us-ascii xml, use character entity references for characters number 0-31 + (ascii non printable characters), as we were already doing when encoding iso-8859-1 text into us-ascii xml + + XML-RPC for PHP version 4.5.2 - 2021/1/11 * improved: better phpdocs in the the php code generated by the Wrapper class diff --git a/src/Helper/Charset.php b/src/Helper/Charset.php index eca3e46..a72016e 100644 --- a/src/Helper/Charset.php +++ b/src/Helper/Charset.php @@ -168,7 +168,14 @@ class Charset $ch = $data[$nn]; $ii = ord($ch); // 7 bits: 0bbbbbbb (127) - if ($ii < 128) { + if ($ii < 32) { + if ($conversion == 'UTF-8_' || $conversion == 'UTF-8_US-ASCII') { + $escapedData .= sprintf('&#%d;', $ii); + } else { + $escapedData .= $ch; + } + } + else if ($ii < 128) { /// @todo shall we replace this with a (supposedly) faster str_replace? switch ($ii) { case 34: @@ -192,35 +199,26 @@ class Charset } // 11 bits: 110bbbbb 10bbbbbb (2047) elseif ($ii >> 5 == 6) { $b1 = ($ii & 31); - $ii = ord($data[$nn + 1]); - $b2 = ($ii & 63); + $b2 = (ord($data[$nn + 1]) & 63); $ii = ($b1 * 64) + $b2; - $ent = sprintf('&#%d;', $ii); - $escapedData .= $ent; + $escapedData .= sprintf('&#%d;', $ii); $nn += 1; } // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb elseif ($ii >> 4 == 14) { $b1 = ($ii & 15); - $ii = ord($data[$nn + 1]); - $b2 = ($ii & 63); - $ii = ord($data[$nn + 2]); - $b3 = ($ii & 63); + $b2 = (ord($data[$nn + 1]) & 63); + $b3 = (ord($data[$nn + 2]) & 63); $ii = ((($b1 * 64) + $b2) * 64) + $b3; - $ent = sprintf('&#%d;', $ii); - $escapedData .= $ent; + $escapedData .= sprintf('&#%d;', $ii); $nn += 2; } // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb elseif ($ii >> 3 == 30) { $b1 = ($ii & 7); - $ii = ord($data[$nn + 1]); - $b2 = ($ii & 63); - $ii = ord($data[$nn + 2]); - $b3 = ($ii & 63); - $ii = ord($data[$nn + 3]); - $b4 = ($ii & 63); + $b2 = (ord($data[$nn + 1]) & 63); + $b3 = (ord($data[$nn + 2]) & 63); + $b4 = (ord($data[$nn + 3]) & 63); $ii = ((((($b1 * 64) + $b2) * 64) + $b3) * 64) + $b4; - $ent = sprintf('&#%d;', $ii); - $escapedData .= $ent; + $escapedData .= sprintf('&#%d;', $ii); $nn += 3; } } diff --git a/tests/0CharsetTest.php b/tests/0CharsetTest.php index f94e3f4..549b0f4 100644 --- a/tests/0CharsetTest.php +++ b/tests/0CharsetTest.php @@ -15,8 +15,9 @@ use PhpXmlRpc\Helper\Charset; * chcp 28591 (latin1) * chcp 65001 (utf8) * - * @todo add tests for conversion: utf8 -> ascii + * @todo add tests for conversion: utf8 -> ascii (incl. chars 0-31) * @todo add tests for conversion: latin1 -> utf8 + * @todo add tests for conversion: latin1 -> ascii */ class CharsetTest extends PhpXmlRpc_PolyfillTestCase { @@ -49,6 +50,15 @@ class CharsetTest extends PhpXmlRpc_PolyfillTestCase ); } + protected function utf8ToAscii($data) + { + return Charset::instance()->encodeEntities( + $data, + 'UTF-8', + 'US-ASCII' + ); + } + public function testUtf8ToLatin1All() { /*$this->assertEquals( -- 2.43.0