// tables used for transcoding different charsets into us-ascii xml
protected $xml_iso88591_Entities = array("in" => array(), "out" => array());
- /// @todo add to iso table the characters from cp_1252 range, i.e. 128 to 159?
- /// These will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
- /// (though no luck when receiving them...)
- /*
- protected $xml_cp1252_Entities = array('in' => array(), out' => array(
- '€', '?', '‚', 'ƒ',
- '„', '…', '†', '‡',
- 'ˆ', '‰', 'Š', '‹',
- 'Œ', '?', 'Ž', '?',
- '?', '‘', '’', '“',
- '”', '•', '–', '—',
- '˜', '™', 'š', '›',
- 'œ', '?', 'ž', 'Ÿ'
- ));
- */
+ /// @todo should we add to the latin-1 table the characters from cp_1252 range, i.e. 128 to 159 ?
+ /// Those will NOT be present in true ISO-8859-1, but will save the unwary windows user from sending junk
+ /// (though no luck when receiving them...)
+ /// Note also that, apparently, while 'ISO/IEC 8859-1' has no characters defined for bytes 128 to 159,
+ /// IANA ISO-8859-1 does have well-defined 'C1' control codes for those - wikipedia's page on latin-1 says:
+ /// "ISO-8859-1 is the IANA preferred name for this standard when supplemented with the C0 and C1 control codes from ISO/IEC 6429."
+ /// Check what mbstring/iconv do by default with those?
+ //
+ //protected $xml_cp1252_Entities = array('in' => array(), 'out' => array());
protected $charset_supersets = array(
'US-ASCII' => array('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
/**
* This class is singleton for performance reasons.
+ * @todo can't we just make $xml_iso88591_Entities a static variable instead ?
*
* @return Charset
*/
return self::$instance;
}
- private function __construct()
+ /**
+ * Force usage as singleton
+ *
+ * The body is intentionally empty: the xml_iso88591_Entities table is no longer filled here, it is built on demand
+ * by buildConversionTable() right before the conversions that need it.
+ */
+ protected function __construct()
{
- for ($i = 0; $i < 32; $i++) {
- $this->xml_iso88591_Entities["in"][] = chr($i);
- $this->xml_iso88591_Entities["out"][] = "&#{$i};";
- }
+ }
- for ($i = 160; $i < 256; $i++) {
- $this->xml_iso88591_Entities["in"][] = chr($i);
- $this->xml_iso88591_Entities["out"][] = "&#{$i};";
- }
+ /**
+ * Fills the named conversion table the first time it is requested; if the table already has entries, returns
+ * without touching it.
+ *
+ * Only 'xml_iso88591_Entities' is handled at the moment (the cp1252 case below is commented out). That table maps
+ * the single bytes 0-31 and 160-255 ('in') to the numeric character references "&#0;".."&#31;" and
+ * "&#160;".."&#255;" ('out'), in that order.
+ *
+ * @param string $tableName name of the table to build
+ * @return void
+ * @throws \Exception for unsupported $tableName
+ */
+ protected function buildConversionTable($tableName)
+ {
+ switch($tableName) {
+ case 'xml_iso88591_Entities':
+ if (count($this->xml_iso88591_Entities['in'])) { // already built by a previous call
+ return;
+ }
+ for ($i = 0; $i < 32; $i++) { // bytes 0-31 go first: the UTF-8 => ISO-8859-1 conversion uses array_slice(..., 32) to skip exactly these entries
+ $this->xml_iso88591_Entities["in"][] = chr($i);
+ $this->xml_iso88591_Entities["out"][] = "&#{$i};";
+ }
- /*for ($i = 128; $i < 160; $i++)
- {
- $this->xml_cp1252_Entities['in'][] = chr($i);
- }*/
+ for ($i = 160; $i < 256; $i++) { // bytes 160-255 (128-159 are not added to this table)
+ $this->xml_iso88591_Entities["in"][] = chr($i);
+ $this->xml_iso88591_Entities["out"][] = "&#{$i};";
+ }
+ break;
+ /*case 'xml_cp1252_Entities':
+ if (count($this->xml_cp1252_Entities['in'])) {
+ return;
+ }
+ for ($i = 128; $i < 160; $i++)
+ {
+ $this->xml_cp1252_Entities['in'][] = chr($i);
+ }
+ $this->xml_cp1252_Entities['out'] = array(
+ '€', '?', '‚', 'ƒ',
+ '„', '…', '†', '‡',
+ 'ˆ', '‰', 'Š', '‹',
+ 'Œ', '?', 'Ž', '?',
+ '?', '‘', '’', '“',
+ '”', '•', '–', '—',
+ '˜', '™', 'š', '›',
+ 'œ', '?', 'ž', 'Ÿ'
+ );
+ $this->buildConversionTable('xml_iso88591_Entities');
+ break;*/
+ default:
+ throw new \Exception('Unsupported table: ' . $tableName);
+ }
}
/**
switch ($conversion) {
case 'ISO-8859-1_':
case 'ISO-8859-1_US-ASCII':
+ $this->buildConversionTable('xml_iso88591_Entities');
$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);
$escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
break;
// when converting to latin-1, do not be so eager with using entities for characters 160-255
if ($conversion == 'UTF-8_ISO-8859-1') {
+ $this->buildConversionTable('xml_iso88591_Entities');
$escapedData = str_replace(array_slice($this->xml_iso88591_Entities['out'], 32), array_slice($this->xml_iso88591_Entities['in'], 32), $escapedData);
}
break;
/*
case 'CP1252_':
case 'CP1252_US-ASCII':
+ $this->buildConversionTable('xml_cp1252_Entities');
$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);
$escapedData = str_replace($this->xml_iso88591_Entities['in'], $this->xml_iso88591_Entities['out'], $escapedData);
$escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
break;
case 'CP1252_UTF-8':
+ $this->buildConversionTable('xml_cp1252_Entities');
$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);
- /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf_8 encode all allone will NOT convert them)
+ /// @todo we could use real UTF8 chars here instead of xml entities... (note that utf8_encode() on its own will NOT convert them)
$escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);
$escapedData = utf8_encode($escapedData);
break;
case 'CP1252_ISO-8859-1':
+ $this->buildConversionTable('xml_cp1252_Entities');
$escapedData = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $data);
// we might as well replace all funky chars with a '?' here, but we are kind and leave it to the receiving application layer to decide what to do with these weird entities...
$escapedData = str_replace($this->xml_cp1252_Entities['in'], $this->xml_cp1252_Entities['out'], $escapedData);