Posted By

jerryvig on 11/04/11


Tagged

php html DOM simple scraping


Versions (?)

First Code 2 - Merrill Alegado


Published in: PHP
 

http://www.poetiv.com/

  1. <?php
  // Third-party Simple HTML DOM parser; it provides the simple_html_dom class
  // used by every scraping stage in this script.
  require_once './simple_html_dom.php';

  // The scrape runs as three stages that hand data to each other through CSV
  // files: getAuthorURLs() writes AuthorURLs.csv, getPoemURLs() reads it and
  // writes PoemURLs.csv, and getPoemTexts() reads that and writes Poems.csv.
  // Only the last stage is enabled here; uncomment the earlier calls to
  // regenerate their CSV files.
  //getAuthorURLs();
  //getPoemURLs();
  getPoemTexts();
  7.  
  /**
   * Downloads the poetiv.com front page and writes one CSV row per link
   * (link text, href) to AuthorURLs.csv in the current working directory.
   *
   * Links are taken from the elements matching "#fleft" first and then from
   * the elements matching "#fright", so rows appear in that order. The raw
   * page is left on disk as poetiv.html.
   *
   * @return void
   * @throws RuntimeException If a local file cannot be opened or the download fails.
   */
  function getAuthorURLs() {
      $url = "http://www.poetiv.com";
      $html_path = "./poetiv.html";

      $ch = curl_init($url);
      if ($ch === false) {
          throw new RuntimeException("Unable to initialise cURL for " . $url);
      }

      $fp = fopen($html_path, "w");
      if ($fp === false) {
          throw new RuntimeException("Unable to open " . $html_path . " for writing");
      }

      // Stream the response body straight into the local HTML file.
      curl_setopt($ch, CURLOPT_FILE, $fp);
      curl_setopt($ch, CURLOPT_HEADER, 0);
      $downloaded = curl_exec($ch);
      $curl_error = curl_error($ch);
      fclose($fp);

      if ($downloaded === false) {
          // Stop before AuthorURLs.csv is opened in "w" mode, so a failed
          // download does not wipe the results of an earlier successful run.
          throw new RuntimeException("Download of " . $url . " failed: " . $curl_error);
      }

      $csv_out = fopen("AuthorURLs.csv", "w");
      if ($csv_out === false) {
          throw new RuntimeException("Unable to open AuthorURLs.csv for writing");
      }

      $dom = new simple_html_dom();
      $dom->load_file($html_path);

      // Both containers are processed identically; iterating by value avoids
      // the dangling references left behind by foreach-by-reference.
      foreach (array("#fleft", "#fright") as $selector) {
          foreach ($dom->find($selector) as $div) {
              foreach ($div->find("a") as $anchor) {
                  fputcsv($csv_out, array($anchor->plaintext, $anchor->getAttribute("href")));
              }
          }
      }

      // Release the parsed tree, matching what findPoemURLs() does.
      $dom->clear();
      fclose($csv_out);
  }
  53.  
  /**
   * Reads (author name, author URL) rows from AuthorURLs.csv and collects the
   * poem links of every author into PoemURLs.csv.
   *
   * For each author the listed URL is scraped first, followed by the numbered
   * pages built by removing ".html" from that URL and appending "<n>.html"
   * for n = 2..$last_page. The scraping and CSV writing for each page is
   * delegated to findPoemURLs().
   *
   * @param int $last_page Highest page number to probe per author; the default
   *                       of 15 matches the previously hard-coded range 2..15.
   * @return void
   * @throws RuntimeException If either CSV file cannot be opened.
   */
  function getPoemURLs( $last_page = 15 ) {
      $input_csv = fopen("AuthorURLs.csv", "r");
      if ($input_csv === false) {
          // Without this guard fgetcsv() is handed an invalid handle: PHP 5
          // returns NULL (which is !== FALSE, so the read loop never ends) and
          // PHP 8 raises a TypeError.
          throw new RuntimeException("Unable to open AuthorURLs.csv for reading");
      }

      $authors = array();
      // Length 0 removes the per-line limit so long rows are not split.
      while (($columns = fgetcsv($input_csv, 0, ",")) !== false) {
          // Blank lines come back as a single NULL column; skip anything
          // that does not carry both a name and a URL.
          if (!is_array($columns) || count($columns) < 2) {
              continue;
          }
          // Trim once up front so the page-number splice below cannot leave
          // whitespace in the middle of a URL.
          $authors[] = array(trim($columns[0]), trim($columns[1]));
      }
      fclose($input_csv);

      $output_csv = fopen("PoemURLs.csv", "w");
      if ($output_csv === false) {
          throw new RuntimeException("Unable to open PoemURLs.csv for writing");
      }

      foreach ($authors as $author) {
          list($author_name, $author_url) = $author;

          echo "getting page(s) for " . $author_name . " " . $author_url . "\n";
          findPoemURLs($author_url, $output_csv, $author_name);

          // The base URL is the same for every numbered page of this author.
          $url_stem = str_replace(".html", "", $author_url);
          for ($page = 2; $page <= $last_page; $page++) {
              $page_url = $url_stem . $page . ".html";
              echo "getting page(s) for " . $author_name . " " . $page_url . "\n";
              findPoemURLs($page_url, $output_csv, $author_name);
          }
      }

      fclose($output_csv);
  }
  79.  
  /**
   * Downloads one author listing page and appends a CSV row
   * (author name, link text, link href) to $out for every anchor found inside
   * the "#fleft" elements and then inside the "#fright" elements.
   *
   * The page is staged in ./author.html, which is deleted again before the
   * function returns. Downloads of 1000 bytes or fewer are not parsed
   * (presumably these are error or placeholder pages - confirm against the site).
   *
   * @param string   $url         Address of the listing page to fetch.
   * @param resource $out         Open, writable handle that receives the CSV rows.
   * @param string   $author_name Value written to the first column of every row.
   * @return void
   */
  function findPoemURLs( $url, $out, $author_name ) {
      $page_file = "./author.html";

      // Let cURL write the response body directly into the staging file.
      $curl = curl_init($url);
      $sink = fopen($page_file, "w");
      curl_setopt($curl, CURLOPT_FILE, $sink);
      curl_setopt($curl, CURLOPT_HEADER, 0);
      curl_exec($curl);
      fclose($sink);

      $worth_parsing = filesize($page_file) > 1000;
      if ($worth_parsing) {
          $page = new simple_html_dom();
          $page->load_file($page_file);

          // Left column first, then right column, so row order is stable.
          foreach (array("#fleft", "#fright") as $column_selector) {
              foreach ($page->find($column_selector) as $column) {
                  foreach ($column->find("a") as $link) {
                      $row = array(
                          $author_name,
                          trim($link->plaintext),
                          trim($link->getAttribute('href')),
                      );
                      fputcsv($out, $row);
                  }
              }
          }

          $page->clear();
      }

      // Always remove the staging file, whether or not it was parsed.
      unlink($page_file);
  }
  118.  
  /**
   * Reads (author name, poem name, poem URL) rows from PoemURLs.csv, downloads
   * each poem page, extracts the text of its <pre> element and stores the
   * result both in Poems.csv and in the poetiv_poems database table.
   *
   * Each page is staged in ./poem.html; downloads of 800 bytes or fewer are
   * skipped. When a page has several <pre> elements the last one wins, as it
   * did before. Double quotes are still stripped from the poem text so that
   * the output matches earlier runs; they are no longer needed for SQL safety
   * because the insert is parameterised.
   *
   * NOTE(review): the database credentials are hard-coded below; consider
   * moving them to configuration.
   *
   * @return void
   * @throws RuntimeException If a local file cannot be opened.
   * @throws PDOException     If the database connection or statement preparation fails.
   */
  function getPoemTexts() {
      $input_csv = fopen("PoemURLs.csv", "r");
      if ($input_csv === false) {
          // Without this guard fgetcsv() is handed an invalid handle: PHP 5
          // returns NULL (which is !== FALSE, so the read loop never ends) and
          // PHP 8 raises a TypeError.
          throw new RuntimeException("Unable to open PoemURLs.csv for reading");
      }

      $poems = array();
      // Length 0 removes the per-line limit so long rows are not split.
      while (($columns = fgetcsv($input_csv, 0, ",")) !== false) {
          // Skip blank or truncated rows instead of reading missing offsets.
          if (!is_array($columns) || count($columns) < 3) {
              continue;
          }
          $poems[] = array($columns[0], $columns[1], $columns[2]);
      }
      fclose($input_csv);

      // PDO replaces the mysql_* extension (removed in PHP 7). A prepared
      // statement keeps scraped text out of the SQL string entirely.
      $dbh = new PDO('mysql:host=localhost;dbname=poetiv', 'poetiv', 'alegado');
      $dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
      $insert = $dbh->prepare(
          'INSERT INTO poetiv_poems ( author_name, poem_name, poem_text, poem_url ) VALUES ( ?, ?, ?, ? )'
      );

      $output_csv = fopen("Poems.csv", "w");
      if ($output_csv === false) {
          throw new RuntimeException("Unable to open Poems.csv for writing");
      }

      $simple = new simple_html_dom();

      foreach ($poems as $poem) {
          list($author_name, $poem_name, $poem_url) = $poem;

          $ch = curl_init(trim($poem_url));
          $fp = fopen("./poem.html", "w");
          if ($fp === false) {
              throw new RuntimeException("Unable to open ./poem.html for writing");
          }
          curl_setopt($ch, CURLOPT_FILE, $fp);
          curl_setopt($ch, CURLOPT_HEADER, 0);
          curl_exec($ch);
          fclose($fp);

          // filesize() results are cached per path and rewriting the file
          // does not invalidate them, so without this call every iteration
          // after the first would test a stale size.
          clearstatcache();
          if (filesize("./poem.html") > 800) {
              $simple->load_file("./poem.html");

              $poem_text = '';
              foreach ($simple->find('pre') as $pre) {
                  $poem_text = str_replace('"', '', trim($pre->plaintext));
              }

              echo $author_name . ", " . $poem_name . ", " . $poem_url . ", " . $poem_text . "\n";
              fputcsv($output_csv, array($author_name, $poem_name, $poem_text, $poem_url));

              try {
                  $insert->execute(array($author_name, $poem_name, $poem_text, $poem_url));
              } catch (PDOException $e) {
                  // One bad row must not abort the crawl; it is already in Poems.csv.
                  echo "insert failed for " . $poem_url . ": " . $e->getMessage() . "\n";
              }

              // Pause between parsed pages to avoid hammering the remote site.
              usleep(200000);
          }
      }

      fclose($output_csv);
      // Dropping the last references closes the PDO connection.
      $insert = null;
      $dbh = null;
  }
  169.  
  170. ?>

Report this snippet  

You need to log in to post a comment.