Posted By

liuran on 10/22/08


Tagged

html parser


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

nerdfiles


PHP HTML Parser


 / Published in: PHP
 

  /**
   * Parses well-formed (X)HTML with PHP's XML (expat/libxml) extension.
   *
   * parse() returns a tree of tags. Each tag is an array with the keys
   * "name" (upper-cased by case folding), "attr" and "level", plus the
   * optional keys "tagData" (non-blank text directly inside the tag),
   * "innerhtml" (re-serialised markup between the open and close tag) and
   * "children" (list of nested tag arrays).
   *
   * While parsing, the markup is re-serialised twice:
   *  - $_html    : the plain re-serialised document (see get_html()).
   *  - $_outline : the same markup, but the open/close tags of elements that
   *                are not finished yet are prefixed with their nesting
   *                level, so tagClosed() can locate the matching pair with a
   *                regular expression and extract the inner HTML.
   */
  class HTML_Parser {
      // Private properties
      var $_parser;
      var $_tags = array();
      var $_html = '';
      var $output = array();
      var $strXmlData;
      var $_level = 0;
      var $_outline = '';
      var $_tagcount = array();
      var $xml_error = false;
      var $xml_error_code;
      var $xml_error_string;
      var $xml_error_line_number;

      /**
       * Returns the markup re-serialised by the most recent parse() call.
       *
       * @return string
       */
      function get_html () {
          return $this->_html;
      }

      /**
       * Parses a document and returns its tag tree.
       *
       * On failure the xml_error* properties describe the problem.
       *
       * @param string $strInputXML Well-formed (X)HTML source.
       * @return array|false Tag tree (root tag at index 0), or false on a parse error.
       */
      function parse($strInputXML) {
          // Start from a clean slate so repeated calls do not append to the
          // previous document or report a stale error.
          $this->output = array();
          $this->_tags = array();
          $this->_html = '';
          $this->_outline = '';
          $this->_level = 0;
          $this->xml_error = false;
          $this->xml_error_code = null;
          $this->xml_error_string = null;
          $this->xml_error_line_number = null;

          // Translate named HTML entities to numeric ones: the XML parser
          // only knows the five predefined XML entities.
          $strInputXML = $this->translate_entities($strInputXML);

          $this->_parser = xml_parser_create();
          xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);

          // Array callables bind the handlers to this object without
          // xml_set_object(), which is deprecated in recent PHP versions.
          xml_set_element_handler(
              $this->_parser,
              array($this, 'tagOpen'),
              array($this, 'tagClosed')
          );
          xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

          // The whole document is passed in one piece, so mark it as final;
          // otherwise truncated input is not reported as an error.
          $this->strXmlData = xml_parse($this->_parser, $strInputXML, true);

          if (!$this->strXmlData) {
              $this->xml_error = true;
              $this->xml_error_code = xml_get_error_code($this->_parser);
              $this->xml_error_string = xml_error_string($this->xml_error_code);
              $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
              return false;
          }

          return $this->output;
      }

      /**
       * Start-element handler: pushes the tag on the output stack and
       * appends its markup to the HTML and the outline.
       *
       * @param mixed  $parser XML parser handle (unused).
       * @param string $name   Tag name as reported by the parser.
       * @param array  $attr   Attribute name => value map.
       */
      function tagOpen($parser, $name, $attr) {
          // Increase level
          $this->_level++;

          // Create tag markup
          $newtag = $this->create_tag($name, $attr);

          // Build tag record
          $tag = array("name"=>$name, "attr"=>$attr, "level"=>$this->_level);

          // Add tag to the stack of open tags
          array_push($this->output, $tag);

          // Remember which tag is open at this level
          $this->_tags[$this->_level] = $tag;

          // Add to HTML
          $this->_html .= $newtag;

          // Add to outline, prefixed with the nesting level
          $this->_outline .= $this->_level . $newtag;
      }

      /**
       * Renders an opening tag in lower case with escaped attribute values.
       * "br" and "input" are rendered self-closing ("<br />").
       *
       * @param string $name Tag name.
       * @param array  $attr Attribute name => value map.
       * @return string
       */
      function create_tag ($name, $attr) {
          $lowerName = strtolower($name);

          # Begin with name
          $tag = '<' . $lowerName . ' ';

          # Create attribute list
          foreach ($attr as $key=>$val) {
              $tag .= strtolower($key) . '="' . htmlentities($val) . '" ';
          }

          # Drop the trailing separator
          $tag = trim($tag);

          # Void elements are self-closed
          switch ($lowerName) {
              case 'br':
              case 'input':
                  $tag .= ' /';
                  break;
          }

          $tag .= '>';

          return $tag;
      }

      /**
       * Character-data handler: stores non-blank text on the innermost open
       * tag and appends the escaped text to the HTML and the outline.
       *
       * @param mixed  $parser  XML parser handle (unused).
       * @param string $tagData Text chunk; one text node may arrive in several chunks.
       */
      function tagData($parser, $tagData) {
          $last = count($this->output) - 1;

          // Compare against '' explicitly: a bare truthiness test would
          // discard the text "0".
          if ($last >= 0 && trim($tagData) !== '') {
              if (isset($this->output[$last]['tagData'])) {
                  $this->output[$last]['tagData'] .= $tagData;
              } else {
                  $this->output[$last]['tagData'] = $tagData;
              }
          }

          $escaped = htmlentities($tagData);
          $this->_html .= $escaped;
          $this->_outline .= $escaped;
      }

      /**
       * End-element handler: closes the markup, extracts the inner HTML and
       * moves the finished tag from the stack into its parent's "children".
       *
       * @param mixed  $parser XML parser handle (unused).
       * @param string $name   Tag name as reported by the parser.
       */
      function tagClosed($parser, $name) {
          $closeTag = '</' . strtolower($name) . '>';

          // Add to HTML and outline (void elements were already self-closed)
          switch (strtolower($name)) {
              case 'br':
              case 'input':
                  break;
              default:
                  $this->_outline .= $this->_level . $closeTag;
                  $this->_html .= $closeTag;
          }

          // Re-render the opening tag that belongs to this end tag
          $tag = $this->_tags[$this->_level];
          $tag = $this->create_tag($tag['name'], $tag['attr']);

          // Try to get innerHTML: everything between the level-prefixed
          // open tag and the level-prefixed close tag in the outline.
          $regex = '%' . preg_quote($this->_level . $tag, '%') . '(.*?)' . preg_quote($this->_level . $closeTag, '%') . '%is';
          preg_match($regex, $this->_outline, $matches);

          // Remove level identifiers so later matches at this level only
          // see the element that is still open.
          $this->_outline = str_replace($this->_level . $tag, $tag, $this->_outline);
          $this->_outline = str_replace($this->_level . $closeTag, $closeTag, $this->_outline);

          $last = count($this->output) - 1;

          // Add innerHTML (absent for void elements, which have no close tag
          // in the outline)
          if ($last >= 0 && isset($matches[1])) {
              $this->output[$last]['innerhtml'] = $matches[1];
          }

          // Fix tree: hang the finished tag under its parent. The root tag
          // has no parent and stays on the stack as the result.
          if ($last >= 1) {
              $this->output[$last - 1]['children'][] = $this->output[$last];
              array_pop($this->output);
          }

          // Decrease level
          $this->_level--;
      }

      /**
       * Converts named HTML entities (e.g. "&nbsp;") to numeric character
       * references (e.g. "&#160;"), or back when $reverse is true.
       * The entities for & " < > are left untouched.
       *
       * @param string $xmlSource Source text.
       * @param bool   $reverse   Convert numeric references back to named entities.
       * @return string
       */
      function translate_entities($xmlSource, $reverse =FALSE) {
          static $literal2NumericEntity;

          if (empty($literal2NumericEntity)) {
              $literal2NumericEntity = array();

              // Request the table in UTF-8 explicitly so every character can
              // be decoded to its real code point regardless of ini settings.
              $transTbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');

              foreach ($transTbl as $char => $entity) {
                  $char = (string) $char;

                  // Entities XML already understands are kept as they are
                  if (in_array($char, array('&', '"', '<', '>'), true)) continue;

                  $literal2NumericEntity[$entity] = '&#' . $this->_utf8_codepoint($char) . ';';
              }
          }

          if ($reverse) {
              return strtr($xmlSource, array_flip($literal2NumericEntity));
          } else {
              return strtr($xmlSource, $literal2NumericEntity);
          }
      }

      /**
       * Private helper: returns the Unicode code point of the first UTF-8
       * encoded character in $char. ord() alone only yields the lead byte.
       *
       * @param string $char Non-empty UTF-8 string.
       * @return int
       */
      function _utf8_codepoint($char) {
          $lead = ord($char[0]);

          if ($lead < 0x80) {
              return $lead;
          }

          if (($lead & 0xE0) === 0xC0) {
              $code = $lead & 0x1F;
              $extra = 1;
          } elseif (($lead & 0xF0) === 0xE0) {
              $code = $lead & 0x0F;
              $extra = 2;
          } else {
              $code = $lead & 0x07;
              $extra = 3;
          }

          // Fold in the low six bits of each continuation byte
          for ($i = 1; $i <= $extra && isset($char[$i]); $i++) {
              $code = ($code << 6) | (ord($char[$i]) & 0x3F);
          }

          return $code;
      }
  }

Report this snippet  

You need to login to post a comment.