Posted By

Pazuzu on 04/06/11


Tagged

php xml json parser unicode utf-8


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

vitor_np


XML to JSON (Simple) Parser


 / Published in: PHP
 

The unicode decoding functions were adapted from the Decoder.php source from the Zend Framework.

  1. <?php
  2. header("Content-type: text/html; charset=utf-8"); // declares the response body as UTF-8 encoded HTML
  3.  
  4. //http://framework.zend.com/svn/framework/standard/trunk/library/Zend/Json/Decoder.php
  /**
   * Convert a single UTF-16 (big-endian) character to its UTF-8 encoding.
   *
   * Adapted from the Zend Framework's Zend_Json_Decoder.
   *
   * @param string $utf16 Raw UTF-16 bytes, high byte first (normally the two
   *                      bytes of one 16-bit code unit).
   * @return string The UTF-8 bytes; '' when the manual fallback receives fewer
   *                than two bytes.
   */
  function utf162utf8($utf16) {
      // Check for mb extension otherwise do by hand.
      if (function_exists('mb_convert_encoding')) {
          // Name the byte order explicitly: the manual fallback below is
          // strictly big-endian, and plain 'UTF-16' may instead treat the
          // leading bytes as a byte-order mark.
          return mb_convert_encoding($utf16, 'UTF-8', 'UTF-16BE');
      }

      // The manual path reads offsets 0 and 1, so it needs a full code unit.
      if (strlen($utf16) < 2) {
          return '';
      }

      // Square-bracket offsets: the {} offset syntax was removed in PHP 8.
      $codeUnit = (ord($utf16[0]) << 8) | ord($utf16[1]);

      if ($codeUnit <= 0x7F) {
          // ASCII range: one byte, unchanged.
          // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
          return chr($codeUnit);
      }

      if ($codeUnit <= 0x07FF) {
          // 2-byte UTF-8 character: 110xxxxx 10xxxxxx
          return chr(0xC0 | (($codeUnit >> 6) & 0x1F))
               . chr(0x80 | ($codeUnit & 0x3F));
      }

      // Everything else that fits in 16 bits is a 3-byte UTF-8 character:
      // 1110xxxx 10xxxxxx 10xxxxxx
      return chr(0xE0 | (($codeUnit >> 12) & 0x0F))
           . chr(0x80 | (($codeUnit >> 6) & 0x3F))
           . chr(0x80 | ($codeUnit & 0x3F));
  }
  36.  
  /**
   * Replace JSON-style \uXXXX escapes in a string with raw UTF-8 characters.
   *
   * Printable ASCII and existing multi-byte UTF-8 sequences are copied through
   * unchanged. Bytes that match none of the handled classes (control bytes
   * below 0x20, stray continuation bytes, 0xFE/0xFF) are dropped.
   *
   * Decoded characters are emitted raw, so an escape such as \u0022 becomes a
   * bare double quote in the result.
   *
   * @param string $chrs Text that may contain \uXXXX escapes (e.g. json_encode() output).
   * @return string The same text with the escapes decoded to UTF-8.
   */
  function decodeUnicodeString($chrs)
  {
      // One \uXXXX escape at the very start of the inspected 6-byte window.
      $escapePattern = '/^\\\\u([0-9A-F]{4})/i';

      $utf8 = '';
      $length = strlen($chrs);

      for ($i = 0; $i < $length; $i++) {
          $char = $chrs[$i];
          $ord = ord($char);

          if ($char === '\\') {
              if (preg_match($escapePattern, substr($chrs, $i, 6), $high)) {
                  $unit = hexdec($high[1]);

                  // A high surrogate followed by an escaped low surrogate is
                  // ONE character above U+FFFF; decode the pair together.
                  if ($unit >= 0xD800 && $unit <= 0xDBFF
                      && preg_match($escapePattern, substr($chrs, $i + 6, 6), $low)
                  ) {
                      $lowUnit = hexdec($low[1]);
                      if ($lowUnit >= 0xDC00 && $lowUnit <= 0xDFFF) {
                          $codePoint = 0x10000
                                     + (($unit - 0xD800) << 10)
                                     + ($lowUnit - 0xDC00);
                          // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                          $utf8 .= chr(0xF0 | ($codePoint >> 18))
                                 . chr(0x80 | (($codePoint >> 12) & 0x3F))
                                 . chr(0x80 | (($codePoint >> 6) & 0x3F))
                                 . chr(0x80 | ($codePoint & 0x3F));
                          $i += 11; // both 6-byte escapes consumed
                          continue;
                      }
                  }

                  // single, escaped unicode character
                  $utf8 .= utf162utf8(chr($unit >> 8) . chr($unit & 0xFF));
                  $i += 5;
                  continue;
              }

              if ($i + 1 < $length && $chrs[$i + 1] === '\\') {
                  // Escaped backslash: copy both bytes and skip the second so
                  // that text like \\u0041 is not mistaken for a \u escape.
                  $utf8 .= '\\\\';
                  $i++;
                  continue;
              }
              // Any other backslash is copied by the printable-ASCII branch.
          }

          if ($ord >= 0x20 && $ord <= 0x7F) {
              $utf8 .= $char;
              continue;
          }

          // Lead byte of a multi-byte sequence: work out how long it is.
          // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
          if (($ord & 0xE0) == 0xC0) {
              $sequenceLength = 2; // U-00000080 - U-000007FF, mask 110XXXXX
          } elseif (($ord & 0xF0) == 0xE0) {
              $sequenceLength = 3; // U-00000800 - U-0000FFFF, mask 1110XXXX
          } elseif (($ord & 0xF8) == 0xF0) {
              $sequenceLength = 4; // U-00010000 - U-001FFFFF, mask 11110XXX
          } elseif (($ord & 0xFC) == 0xF8) {
              $sequenceLength = 5; // U-00200000 - U-03FFFFFF, mask 111110XX
          } elseif (($ord & 0xFE) == 0xFC) {
              $sequenceLength = 6; // U-04000000 - U-7FFFFFFF, mask 1111110X
          } else {
              continue; // unhandled byte: dropped
          }

          $utf8 .= substr($chrs, $i, $sequenceLength);
          $i += $sequenceLength - 1;
      }

      return $utf8;
  }
  94.  
  95.  
  // Load the XML document; simplexml_load_file() returns false on failure,
  // which would otherwise be encoded and echoed as the literal text "false".
  $xml = simplexml_load_file("the_file.xml");
  if ($xml === false) {
      exit("Unable to load or parse the_file.xml");
  }

  // json_encode() also signals failure by returning false.
  $json = json_encode($xml);
  if ($json === false) {
      exit("Unable to encode the_file.xml as JSON");
  }

  // Strip raw tab characters, then turn the \uXXXX escapes into UTF-8.
  // NOTE(review): json_encode() normally writes tabs as the two-character
  // escape \t, so this strip may never match anything - confirm intent.
  echo decodeUnicodeString( str_replace("\t", "", $json) );
  98.  
  99. ?>

Report this snippet  

You need to log in to post a comment.