Posted By

dom111 on 02/15/10


Tagged

xml unicode xmlentities xmlentitydecode


Versions (?)

xmlentities and xml_entity_decode


 / Published in: PHP
 

URL: http://www.dom111.co.uk/blog/coding/xml-entities-in-php/224

Because htmlentities just doesn't cut it.

I recently needed this to work with third-party software and ended up having to make it pretty robust, so I thought I'd share!

  1. <?php
  /**
   * unicode_ord
   *
   * Decodes the UTF-8 encoded character that starts at byte offset $i of $c
   * and returns its Unicode code point.
   *
   * On success for a multi-byte character, $i is advanced so that it points
   * at the LAST byte of the sequence; a surrounding for-loop's own $i++ then
   * moves on to the next character. On failure $i is left untouched.
   *
   * @param string $c The source string (UTF-8 bytes); it is only read, never modified
   * @param integer $i Byte offset of the character's first byte (passed by reference for use in a loop)
   * @return integer|false The code point, or false if the offset is out of
   *                       range or the bytes at that offset are not a
   *                       well-formed UTF-8 sequence
   * @author kerry at shetline dot com
   * @author Dom Hastings - modified to suit my needs
   * @see http://www.php.net/manual/en/function.ord.php#78032
   */
  function unicode_ord(&$c, &$i = 0) {
      $length = strlen($c);
      $index = $i;

      // reject offsets outside the string
      if ($index < 0 || $index >= $length) {
          return false;
      }

      $lead = ord($c[$index]);

      // single-byte (ASCII)
      if ($lead <= 0x7F) {
          return $lead;
      }

      // Work out, from the lead byte, how many continuation bytes follow,
      // the payload bits carried by the lead byte, and the smallest code
      // point that legitimately needs a sequence of this length.
      if ($lead >= 0xC2 && $lead <= 0xDF) {
          $extra = 1;
          $code = $lead & 0x1F;
          $minimum = 0x80;

      } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
          $extra = 2;
          $code = $lead & 0x0F;
          $minimum = 0x800;

      } elseif ($lead >= 0xF0 && $lead <= 0xF4) {
          $extra = 3;
          $code = $lead & 0x07;
          $minimum = 0x10000;

      // stray continuation byte (0x80-0xBF), overlong lead (0xC0, 0xC1)
      // or a lead byte beyond the Unicode range (0xF5-0xFF)
      } else {
          return false;
      }

      // the sequence is truncated by the end of the string
      if ($index + $extra >= $length) {
          return false;
      }

      // every continuation byte must look like 10xxxxxx
      for ($k = 1; $k <= $extra; $k++) {
          $byte = ord($c[$index + $k]);

          if (($byte & 0xC0) !== 0x80) {
              return false;
          }

          $code = ($code << 6) | ($byte & 0x3F);
      }

      // reject overlong encodings, UTF-16 surrogates and values past U+10FFFF
      if ($code < $minimum || $code > 0x10FFFF || ($code >= 0xD800 && $code <= 0xDFFF)) {
          return false;
      }

      // only consume the extra bytes once the sequence is known to be good
      $i += $extra;

      return $code;
  }
  56.  
  /**
   * unicode_chr
   *
   * Encodes a Unicode code point as a UTF-8 byte string.
   *
   * @param integer|string $c The code point, as an integer or a numeric
   *                          string; it is taken by reference but never modified
   * @return string|false The UTF-8 encoding of the code point, or false if
   *                      $c is not numeric, is negative, is above U+10FFFF
   *                      or is a UTF-16 surrogate (U+D800 - U+DFFF)
   * @author Miguel Perez
   * @see http://www.php.net/manual/en/function.chr.php#77911
   */
  function unicode_chr(&$c) {
      // Range-check before casting: chr() would wrap a negative value into
      // an arbitrary byte, and very large floats do not cast to int safely.
      if (!is_numeric($c) || $c < 0 || $c > 0x10FFFF) {
          return false;
      }

      // work on a local copy so the caller's variable is left untouched
      $code = (int) $c;

      // surrogates are not characters and have no valid UTF-8 encoding
      if ($code >= 0xD800 && $code <= 0xDFFF) {
          return false;
      }

      // one byte: 0xxxxxxx
      if ($code <= 0x7F) {
          return chr($code);
      }

      // two bytes: 110xxxxx 10xxxxxx
      if ($code <= 0x7FF) {
          return chr(0xC0 | ($code >> 6))
              . chr(0x80 | ($code & 0x3F));
      }

      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      if ($code <= 0xFFFF) {
          return chr(0xE0 | ($code >> 12))
              . chr(0x80 | (($code >> 6) & 0x3F))
              . chr(0x80 | ($code & 0x3F));
      }

      // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return chr(0xF0 | ($code >> 18))
          . chr(0x80 | (($code >> 12) & 0x3F))
          . chr(0x80 | (($code >> 6) & 0x3F))
          . chr(0x80 | ($code & 0x3F));
  }
  82.  
  /**
   * xmlentities
   *
   * Makes the specified string XML-safe.
   *
   * - & < > ' " become the five predefined XML entities.
   * - Tab, line feed, carriage return and printable ASCII (32 - 126) are
   *   copied through unchanged.
   * - Every other character that unicode_ord() can decode is written as a
   *   numeric character reference. This includes vertical tab and form feed,
   *   which are not legal as literal characters in an XML document.
   * - NUL bytes and bytes that unicode_ord() cannot decode are dropped.
   *
   * NOTE: references to C0 control characters (e.g. &#x1;) are accepted by
   * XML 1.1 parsers but rejected by XML 1.0 parsers.
   *
   * @param string $s
   * @param boolean $hex Whether or not to make hexadecimal entities (as opposed to decimal)
   * @return string The XML-safe result (an empty() input is returned as-is)
   * @author Dom Hastings
   * @dependencies unicode_ord()
   * @see http://www.w3.org/TR/REC-xml/#sec-predefined-ent
   */
  function xmlentities($s, $hex = true) {
      // nothing to escape
      if (empty($s)) {
          return $s;
      }

      // byte offsets below only make sense on a string
      $s = (string) $s;

      // the five predefined entities, keyed by code point
      $predefined = array(
          34 => '&quot;',
          38 => '&amp;',
          39 => '&apos;',
          60 => '&lt;',
          62 => '&gt;'
      );

      $r = '';
      $l = strlen($s);

      // unicode_ord() advances $i past the extra bytes of a multi-byte
      // character; the loop's own $i++ then steps to the next character
      for ($i = 0; $i < $l; $i++) {
          $o = unicode_ord($s, $i);

          // undecodable byte or NUL: nothing sensible to emit
          if ($o === false || $o === 0) {
              continue;
          }

          // & < > ' "
          if (isset($predefined[$o])) {
              $r .= $predefined[$o];

          // \t \n \r and printable ASCII are safe as literals
          } elseif ($o === 9 || $o === 10 || $o === 13 || ($o >= 32 && $o <= 126)) {
              $r .= chr($o);

          // everything else goes out as a numeric character reference
          } elseif ($hex) {
              $r .= '&#x'.strtoupper(dechex($o)).';';

          } else {
              $r .= '&#'.$o.';';
          }
      }

      return $r;
  }
  177.  
  /**
   * xml_entity_decode
   *
   * Converts XML entity encoded data back to a unicode (UTF-8) string.
   *
   * Recognises the five predefined entities (amp, lt, gt, apos, quot), any
   * extra named entities supplied by the caller, and numeric character
   * references in decimal (&#233;) or hexadecimal (&#xE9;) form.
   *
   * An entity that is not terminated by a semi-colon is taken to run to the
   * end of the string.
   *
   * @param string $s The XML encoded string
   * @param array $entities Additional entities to decode, as name => replacement (optional).
   *                        The five predefined entities always take precedence.
   * @return string The decoded string (an empty() input is returned as-is)
   * @throws Exception If $entities is not an array, if an entity name is
   *                   unknown, or if a numeric character reference is
   *                   malformed or does not denote an encodable code point
   * @dependencies unicode_chr()
   * @author Dom Hastings
   */
  function xml_entity_decode($s, $entities = array()) {
      // if the string is empty, just return it
      if (empty($s)) {
          return $s;
      }

      // check that entities is an array
      if (!is_array($entities)) {
          throw new Exception('xml_entity_decode expects argument 2 to be array.');
      }

      // byte offsets below only make sense on a string
      $s = (string) $s;
      $r = '';
      $l = strlen($s);

      // merge the entities with the defaults (amp, lt, gt, apos and quot MUST take precedence)
      $entities = array_merge($entities, array(
          'amp' => '&',
          'lt' => '<',
          'gt' => '>',
          'apos' => '\'',
          'quot' => '"'
      ));

      $i = 0;

      while ($i < $l) {
          // copy everything up to the next ampersand verbatim
          $amp = strpos($s, '&', $i);

          if ($amp === false) {
              $r .= substr($s, $i);
              break;
          }

          $r .= (string) substr($s, $i, $amp - $i);

          // the entity body runs up to the next semi-colon, or to the end
          // of the string when there is none
          $end = strpos($s, ';', $amp + 1);

          if ($end === false) {
              $end = $l;
          }

          // the cast covers PHP versions where substr() returns false
          // instead of '' for an empty slice at the end of the string
          $e = (string) substr($s, $amp + 1, $end - $amp - 1);
          $i = $end + 1;

          // a leading # marks a numeric character reference
          if ($e !== '' && $e[0] === '#') {
              // Validate the digits explicitly: hexdec() silently skips
              // non-hex characters and is_numeric() accepts floats, signs
              // and exponents, none of which are valid in a reference.
              // Leading zeros are skipped; the digit caps keep the value
              // small enough to convert without overflow surprises.
              if (preg_match('/\A#x0*([0-9A-Fa-f]{1,8})\z/', $e, $m) === 1) {
                  $code = hexdec($m[1]);

              } elseif (preg_match('/\A#0*([0-9]{1,10})\z/', $e, $m) === 1) {
                  $code = (int) $m[1];

              } else {
                  throw new Exception('Invalid character reference "&'.$e.';"');
              }

              // unicode_chr() signals an unencodable code point with false
              $char = unicode_chr($code);

              if ($char === false) {
                  throw new Exception('Invalid character reference "&'.$e.';"');
              }

              $r .= $char;

          // a named entity we know about
          } elseif (array_key_exists($e, $entities)) {
              $r .= $entities[$e];

          // we don't know what to do with this
          } else {
              throw new Exception('Unknown entity "'.$e.'"');
          }
      }

      return $r;
  }

Report this snippet  

You need to login to post a comment.