Posted By

aadsm on 04/10/10


Tagged

utf8 reader


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

wirenaught


Read a UTF8 String from a "byte" source


 / Published in: JavaScript
 

This is useful if you have a string where each character represents a byte, like the one returned by getStringAt() in BinaryAjax.

  /**
   * Decodes UTF-8 data held in a "byte string" (a string in which every
   * character's code is one byte value) into a regular JavaScript string.
   *
   * A leading UTF-8 byte order mark (EF BB BF) is skipped. Bytes that cannot
   * start a UTF-8 sequence (0x80-0xC1 and 0xF5-0xFF) are silently dropped.
   * If the input ends in the middle of a multi-byte sequence, decoding stops
   * and the text decoded so far is returned instead of throwing.
   * Continuation bytes are not validated; only their low six bits are used.
   *
   * @param {string} bytes Source in which each character represents one byte.
   * @returns {string} The decoded text; code points above U+FFFF are emitted
   *     as UTF-16 surrogate pairs.
   */
  function readUTF8String(bytes) {
    var length = bytes.length;
    var ix = 0;

    // Skip the UTF-8 byte order mark when present.
    if (bytes.slice(0, 3) == "\xEF\xBB\xBF") {
      ix = 3;
    }

    var string = "";
    while (ix < length) {
      var byte1 = bytes[ix].charCodeAt(0);

      // The lead byte determines how many bytes the sequence occupies.
      var size;
      if (byte1 < 0x80) {
        size = 1;
      } else if (byte1 >= 0xC2 && byte1 < 0xE0) {
        size = 2;
      } else if (byte1 >= 0xE0 && byte1 < 0xF0) {
        size = 3;
      } else if (byte1 >= 0xF0 && byte1 < 0xF5) {
        size = 4;
      } else {
        // Not a valid lead byte: drop it and resynchronise on the next byte.
        ix++;
        continue;
      }

      // Truncated trailing sequence: stop rather than index past the end.
      if (ix + size > length) {
        break;
      }

      if (size === 1) {
        string += String.fromCharCode(byte1);
      } else if (size === 2) {
        var b2 = bytes[ix + 1].charCodeAt(0);
        string += String.fromCharCode(((byte1 & 0x1F) << 6) | (b2 & 0x3F));
      } else if (size === 3) {
        var c2 = bytes[ix + 1].charCodeAt(0);
        var c3 = bytes[ix + 2].charCodeAt(0);
        // A 3-byte lead carries 4 payload bits, so the mask is 0x0F. A wider
        // mask only appears to work because String.fromCharCode truncates
        // its argument to 16 bits.
        string += String.fromCharCode(
          ((byte1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F)
        );
      } else {
        var d2 = bytes[ix + 1].charCodeAt(0);
        var d3 = bytes[ix + 2].charCodeAt(0);
        var d4 = bytes[ix + 3].charCodeAt(0);
        var codepoint = ((byte1 & 0x07) << 18) | ((d2 & 0x3F) << 12) |
                        ((d3 & 0x3F) << 6) | (d4 & 0x3F);
        // Encode the supplementary-plane code point as a surrogate pair.
        codepoint -= 0x10000;
        string += String.fromCharCode(
          (codepoint >> 10) + 0xD800,
          (codepoint & 0x3FF) + 0xDC00
        );
      }

      ix += size;
    }

    return string;
  }

Report this snippet  

You need to login to post a comment.