Posted By

kendsnyder on 08/18/11


Tagged

htmlentitydecode


Versions (?)

Convert HTML to Text


 / Published in: PHP
 

  1. <?php
  2. // strip javascript, styles, html tags, normalize entities and spaces
  3. // based on http://www.php.net/manual/en/function.strip-tags.php#68757
  /**
   * Convert an HTML fragment to single-line plain text.
   *
   * Steps, in order: remove <script> and <style> elements together with their
   * content, remove HTML comments (including CDATA sections wrapped in
   * comments), remove all remaining tags, normalize common typographic
   * entities via normalizeEntities(), decode every other entity, then collapse
   * runs of whitespace to one space and trim the ends.
   *
   * SECURITY: the result is NOT safe to echo into HTML. Entities are decoded
   * after the tags have been stripped, so input such as
   *   &lt;script&gt;alert('XSS')&lt;/script&gt;
   * comes out as
   *   <script>alert('XSS')</script>
   * Always escape the return value again before writing it into a page.
   *
   * @param string $html Markup to convert; the patterns run in UTF-8 mode.
   * @return string Plain text, or '' when PCRE rejects the input (for example
   *                because it is not valid UTF-8).
   */
  function html2text($html) {
      static $search = array(
          // \b keeps "<script"/"<style" from matching longer tag names, and
          // \s* accepts the "</script >" spelling that browsers also honour.
          '@<script\b.*?</script\s*>@usi', // Strip out javascript content
          '@<style\b.*?</style\s*>@usi',   // Strip style content
          // .*? (not .+?) so that an empty comment "<!---->" closes on itself
          // instead of swallowing the text up to the next "-->".
          '@<!--.*?-->@us',                // Strip multi-line comments including CDATA
          '@</?[a-z].*?>@usi',             // Strip out HTML tags
      );

      $text = preg_replace($search, ' ', (string) $html);
      if ($text === null) {
          // preg_replace() returns null on error (e.g. malformed UTF-8 with the
          // /u modifier). Stop here rather than feed null to string functions.
          return '';
      }

      // normalize common entities
      $text = normalizeEntities($text);
      // decode other entities
      $text = html_entity_decode($text, ENT_QUOTES, 'utf-8');

      // normalize possibly repeated newlines, tabs, spaces to spaces
      $text = preg_replace('/\s+/u', ' ', $text);
      if ($text === null) {
          return '';
      }

      return trim($text);
  }
  27.  
  28. // replace encoded and double-encoded entities (and some literal typographic characters) with a plain ASCII or Unicode equivalent
  29. // also see /app/bookmarkletPopup.js
  /**
   * Replace common typographic characters and their HTML entities with plain
   * equivalents.
   *
   * For every row of the map below, each listed form is replaced by the row's
   * first element. A form can be a literal UTF-8 character, a named entity
   * (&rsquo;), a decimal entity (&#8217;) or a hex entity (&#x2019;). Entity
   * forms are also matched when double-encoded (&amp;rsquo;), and numeric
   * forms tolerate leading zeros.
   *
   * Literal characters are replaced byte-wise with strtr() and the entity
   * patterns are pure ASCII without the /u modifier, so text containing
   * malformed UTF-8 is still processed instead of being discarded.
   *
   * @param string $text Text that may contain entities or typographic characters.
   * @return string The text with every listed form replaced.
   */
  function normalizeEntities($text) {
      // Lookup tables are built on the first call and reused afterwards.
      static $entityFind = array(); // regexes matching the entity forms
      static $entityRepl = array(); // replacement for the regex at the same index
      static $charMap = array();    // literal UTF-8 character => replacement

      if (!$entityFind) {
          // Row layout: replacement, then every form that should become it.
          // Non-ASCII characters are written as UTF-8 byte escapes so they stay
          // visible and survive a change of source-file encoding.
          $map = array(
              array("'", 'apos', 39, 'x27'),                              // Apostrophe
              array("'", "\xE2\x80\x98", 'lsquo', 8216, 'x2018'),         // U+2018 open single quote
              array("'", "\xE2\x80\x99", 'rsquo', 8217, 'x2019'),         // U+2019 close single quote
              array('"', "\xE2\x80\x9C", 'ldquo', 8220, 'x201C'),         // U+201C open double quotes
              array('"', "\xE2\x80\x9D", 'rdquo', 8221, 'x201D'),         // U+201D close double quotes
              array("'", "\xE2\x80\x9A", 'sbquo', 8218, 'x201A'),         // U+201A single low-9 quote
              array('"', "\xE2\x80\x9E", 'bdquo', 8222, 'x201E'),         // U+201E double low-9 quote
              array("'", "\xE2\x80\xB2", 'prime', 8242, 'x2032'),         // U+2032 prime/minutes/feet
              array('"', "\xE2\x80\xB3", 'Prime', 8243, 'x2033'),         // U+2033 double prime/seconds/inches
              array(' ', 'nbsp', 160, 'xA0'),                             // Non-breaking space
              array('-', "\xE2\x80\x90", 8208, 'x2010'),                  // U+2010 hyphen
              array('-', "\xE2\x80\x93", 'ndash', 8211, 150, 'x2013'),    // U+2013 en dash
              array('--', "\xE2\x80\x94", 'mdash', 8212, 151, 'x2014'),   // U+2014 em dash
              array(' ', "\xE2\x80\x82", 'ensp', 8194, 'x2002'),          // U+2002 en space
              array(' ', "\xE2\x80\x83", 'emsp', 8195, 'x2003'),          // U+2003 em space
              array(' ', "\xE2\x80\x89", 'thinsp', 8201, 'x2009'),        // U+2009 thin space
              array('*', "\xE2\x80\xA2", 'bull', 8226, 'x2022'),          // U+2022 bullet
              array('*', "\xE2\x80\xA3", 8227, 'x2023'),                  // U+2023 triangular bullet
              array('...', "\xE2\x80\xA6", 'hellip', 8230, 'x2026'),      // U+2026 horizontal ellipsis
              array("\xC2\xB0", 'deg', 176, 'xB0'),                       // U+00B0 degree
              array("\xE2\x82\xAC", 'euro', 8364, 'x20AC'),               // U+20AC euro
              array("\xC2\xA5", 'yen', 165, 'xA5'),                       // U+00A5 yen
              array("\xC2\xA3", 'pound', 163, 'xA3'),                     // U+00A3 British pound
              array("\xC2\xA9", 'copy', 169, 'xA9'),                      // U+00A9 copyright sign
              array("\xC2\xAE", 'reg', 174, 'xAE'),                       // U+00AE registered sign
              array("\xE2\x84\xA2", 'trade', 8482, 'x2122'),              // U+2122 TM sign
          );

          foreach ($map as $row) {
              $replacement = $row[0];
              foreach (array_slice($row, 1) as $code) {
                  if (is_int($code)) {
                      // decimal numeric entity, optionally double-encoded
                      $entityFind[] = '/&(amp;)?#0*' . $code . ';/';
                  } elseif (preg_match('/^.$/u', $code)) {
                      // exactly one Unicode character: replaced literally, no regex
                      $charMap[$code] = $replacement;
                      continue;
                  } elseif (preg_match('/^x([0-9A-F]{2}){1,2}$/i', $code)) {
                      // hex numeric entity; /i accepts &#X..; and lower-case digits
                      $entityFind[] = '/&(amp;)?#x0*' . substr($code, 1) . ';/i';
                  } else {
                      // named entity; names are case-sensitive (prime vs Prime)
                      $entityFind[] = '/&(amp;)?' . preg_quote($code, '/') . ';/';
                  }
                  $entityRepl[] = $replacement;
              }
          }
      } // end first time build

      $text = (string) $text;
      $normalized = preg_replace($entityFind, $entityRepl, $text);
      if ($normalized === null) {
          // PCRE error (e.g. backtrack limit): keep the input rather than lose it.
          $normalized = $text;
      }

      // No replacement contains '&' or ';', so handling the literal characters
      // after the entities cannot create or destroy an entity match.
      return strtr($normalized, $charMap);
  }

Report this snippet  

You need to login to post a comment.