Posted By

jerryvig on 10/14/11


Tagged

php google geocode esther kestenbaum


Versions (?)

TrainingIndustry.com Scrape - Esther Kestenbaum 4


 / Published in: PHP
 

  1. <?php
  2. require_once('./simple_html_dom.php');
  3. require_once('./GoogleGeocode.php');
  4.  
  // Driver: each call below is one stage of the scrape; only the uncommented stage runs.
  5. //loadUrls();           // stage 1: reads URLList1.csv, writes ProfileURLs.csv
  6. //getContactLinks();    // stage 2: reads ProfileURLs.csv, writes contact_urls.csv
  7. // getContactInfo();    // stage 3: reads ProfileURLs.csv, writes contactInfo.csv
  8. getAddys(); // stage 4: reads TIDirectAddys.csv, geocodes each address, writes TIAddressParts.csv
  9.  
  /**
   * Collects company profile links from the learning-community listing pages.
   *
   * Reads one page slug per line from URLList1.csv, fetches
   * http://www.trainingindustry.com/learning-communities/<slug>, and for every
   * div.company-info on the page writes the href of its first anchor to
   * ProfileURLs.csv (one href per line). Each fetched URL is echoed for progress.
   *
   * @return void
   */
  function loadUrls() {
      $fh = fopen( "URLList1.csv", "r" );
      if ( $fh === false ) {
          echo "Unable to open URLList1.csv for reading\n";
          return;
      }

      $fh2 = fopen( "ProfileURLs.csv", "w" );
      if ( $fh2 === false ) {
          echo "Unable to open ProfileURLs.csv for writing\n";
          fclose( $fh );
          return;
      }

      $dom = new simple_html_dom();

      while (($buffer = fgets($fh, 4096)) !== false) {
          $slug = trim($buffer);
          if ( $slug === '' ) {
              // A blank line would otherwise fetch the bare directory URL.
              continue;
          }

          $urlString = "http://www.trainingindustry.com/learning-communities/" . $slug;
          echo $urlString . "\n";

          // NOTE(review): some simple_html_dom releases return false when the
          // fetch fails; skip the page in that case instead of querying a
          // document that was never loaded.
          if ( $dom->load_file( $urlString ) === false ) {
              continue;
          }

          // Iterate by value: nothing here modifies the nodes, and a by-reference
          // loop variable would stay bound to the last element after the loop.
          foreach ( $dom->find('div[class=company-info]') as $div ) {
              $aList = $div->find('a');
              if ( count($aList) < 1 ) {
                  // No link inside this company box; nothing to record.
                  continue;
              }
              fwrite( $fh2, trim($aList[0]->getAttribute("href")) . "\n" );
          }
      }

      fclose( $fh2 );
      fclose( $fh );
  }
  30.  
  /**
   * Resolves the "contact sponsor" link of every profile page.
   *
   * Reads one site-relative profile path per line from ProfileURLs.csv, loads
   * http://www.trainingindustry.com<path>, and if the page has a
   * p.btn.contact-sponsor paragraph containing an anchor, writes that anchor's
   * href (prefixed with the site host) to contact_urls.csv, one URL per line.
   * Each profile path is echoed for progress.
   *
   * @return void
   */
  function getContactLinks() {
      $fh2 = fopen( "ProfileURLs.csv", "r" );
      if ( $fh2 === false ) {
          echo "Unable to open ProfileURLs.csv for reading\n";
          return;
      }

      $fh = fopen( "contact_urls.csv", "w" );
      if ( $fh === false ) {
          echo "Unable to open contact_urls.csv for writing\n";
          fclose( $fh2 );
          return;
      }

      $dom = new simple_html_dom();

      while (($buffer = fgets($fh2, 4096)) !== false) {
          $pUrl = trim($buffer);
          echo $pUrl . "\n";

          // NOTE(review): some simple_html_dom releases return false when the
          // fetch fails; skip the profile rather than query an unloaded document.
          if ( $dom->load_file( 'http://www.trainingindustry.com' . $pUrl ) === false ) {
              continue;
          }

          $pList = $dom->find('p[class=btn contact-sponsor]');
          if ( count($pList) < 1 ) {
              continue;
          }

          $aList = $pList[0]->find('a');
          if ( count($aList) < 1 ) {
              // Contact paragraph without a link: nothing to record.
              continue;
          }

          $contact_url = 'http://www.trainingindustry.com' . trim($aList[0]->getAttribute('href')) . "\n";
          fwrite( $fh, $contact_url );
      }

      fclose( $fh );
      fclose( $fh2 );
  }
  51.  
  /**
   * Scrapes company and contact details from every profile page.
   *
   * Reads one site-relative profile path per line from ProfileURLs.csv, downloads
   * the page with wget into ./file.html, parses it, and appends one record to
   * contactInfo.csv with the columns:
   *   company name, contact name, address, phone, company description, website URL.
   * Pages without a div.company-heading are skipped. Every field is trimmed and
   * written double-quoted, with embedded double quotes doubled so the record
   * stays parseable as CSV.
   *
   * @return void
   */
  function getContactInfo () {
      $fh = fopen( "ProfileURLs.csv", "r" );
      if ( $fh === false ) {
          echo "Unable to open ProfileURLs.csv for reading\n";
          return;
      }

      $fh2 = fopen( "contactInfo.csv", "w" );
      if ( $fh2 === false ) {
          echo "Unable to open contactInfo.csv for writing\n";
          fclose( $fh );
          return;
      }

      $dom = new simple_html_dom();

      while (($buffer = fgets($fh, 4096)) !== false) {
          $url1 = trim($buffer);
          if ( $url1 === '' ) {
              continue;
          }
          echo $url1 . "\n";

          // The path comes from a scraped file, so it is passed to the shell as a
          // single escaped argument rather than spliced between double quotes.
          $cmd = '/usr/bin/wget -O file.html -t 3 -U "Mozilla/5.0" '
               . escapeshellarg( 'http://www.trainingindustry.com' . $url1 );
          echo $cmd . "\n";
          system( $cmd, $status );
          if ( $status !== 0 ) {
              // Download failed; file.html does not hold this profile.
              continue;
          }

          // NOTE(review): some simple_html_dom releases return false when the
          // file cannot be read; skip the record in that case.
          if ( $dom->load_file( './file.html' ) === false ) {
              continue;
          }

          // Reset every output field per record so that nothing leaks over from
          // the previous profile when an element is missing on this one.
          $company_name = "";
          $address = "";
          $contactName = "";
          $phone = "";
          $company_desc = "";
          $website_url = '';

          $divCList = $dom->find('div[class=company-heading]');
          if ( count($divCList) < 1 ) continue;
          $h2List = $divCList[0]->find('h2');
          if ( count($h2List) > 0 ) {
              $company_name = trim($h2List[0]->plaintext);
          }

          $descList = $dom->find('div[class=company-desc]');
          if ( count( $descList ) > 0 ) {
              $company_desc = trim($descList[0]->plaintext);
          }

          $aList = $dom->find('a[class=url]');
          if ( count($aList) > 0 ) {
              $website_url = trim($aList[0]->href);
          }

          // When a page has several v-cards, the last one that supplies a field wins.
          foreach ( $dom->find('div[class=supplier-v-card]') as $div ) {
              $contactDiv = $div->find('div[class=company-address]');
              if ( count($contactDiv) > 0 ) {
                  $address = trim($contactDiv[0]->plaintext);
              }

              $contact_name_span = $div->find('span[class=fn]');
              if ( count($contact_name_span) > 0 ) {
                  $contactName = trim($contact_name_span[0]->plaintext);
              }

              $phoneLi = $div->find('li[class=tel]');
              if ( count($phoneLi) > 0 ) {
                  echo $phoneLi[0]->plaintext . "\n";
                  // Keep the text after the first ':' and drop its leading character.
                  // NOTE(review): presumably a separator glyph after the label — confirm.
                  $pieces = explode( ":", ($phoneLi[0]->plaintext) );
                  if ( count($pieces) > 1 ) {
                      $phone = trim(substr(trim($pieces[1]), 1 ));
                  }
              }
          }

          $fields = array( $company_name, $contactName, $address, $phone, $company_desc, $website_url );
          $quoted = array();
          foreach ( $fields as $field ) {
              // Double embedded quotes; an unescaped '"' would end the field early.
              $quoted[] = '"' . str_replace( '"', '""', trim($field) ) . '"';
          }
          fwrite( $fh2, implode( ',', $quoted ) . "\n" );
      }

      fclose ( $fh );
      fclose( $fh2 );
  }
  117.  
  118.  
  /**
   * Splits free-form addresses into parts using the Google geocoder.
   *
   * Reads TIDirectAddys.csv, where column 0 is an identifying label and column 1
   * is a raw (possibly multi-line) address. Each address is flattened to one
   * line, passed through utf8_encode(), and sent to GoogleGeocode::geocode().
   * One record per input row is echoed and written to TIAddressParts.csv with
   * the columns:
   *   label, street address, city, state, zip, country
   * taken from the first placemark of the response. Parts the geocoder did not
   * return are written as empty strings. Fields are double-quoted with embedded
   * double quotes doubled.
   *
   * @return void
   */
  function getAddys() {
      $fh = fopen( "TIDirectAddys.csv", "r" );
      if ( $fh === false ) {
          echo "Unable to open TIDirectAddys.csv for reading\n";
          return;
      }

      $fh2 = fopen( "TIAddressParts.csv", "w" );
      if ( $fh2 === false ) {
          echo "Unable to open TIAddressParts.csv for writing\n";
          fclose( $fh );
          return;
      }

      // NOTE(review): credential committed in source; consider loading it from
      // configuration instead. Left unchanged here to preserve behavior.
      $apiKey = 'ABQIAAAAI1oIsi6Dv7MlmxUm1lRR_xTmarcuMJj81CoryY3grjEx5dFcyxQoeQTublWNe-B1iLVnHNrRuJD6_w';
      $geo = new GoogleGeocode( $apiKey );

      // Placemark keys in the order they appear in the output record.
      $partKeys = array( 'Thoroughfare', 'Locality', 'AdministrativeArea', 'PostalCode', 'Country' );

      // Length 0 = no line-length limit, so long multi-line records are not cut.
      while (($pieces = fgetcsv($fh, 0, ",", '"')) !== FALSE) {
          if ( $pieces === array( null ) ) {
              // fgetcsv reports a blank line as a single null field.
              continue;
          }

          $placemark = array();
          if ( isset($pieces[1]) && trim($pieces[1]) !== '' ) {
              // Flatten every newline style; the address may span several lines.
              $dirty_addy = str_replace( array( "\r\n", "\r", "\n" ), " ", $pieces[1] );
              // NOTE(review): assumes the input file is ISO-8859-1 — confirm.
              $dirty_addy = utf8_encode($dirty_addy);
              $addy_array = $geo->geocode( $dirty_addy );
              if ( isset($addy_array['Placemarks'][0]) && is_array($addy_array['Placemarks'][0]) ) {
                  $placemark = $addy_array['Placemarks'][0];
              }
              // Pause between geocoder requests.
              usleep( 750000 );
          }

          $row = array( trim($pieces[0]) );
          foreach ( $partKeys as $key ) {
              $row[] = isset($placemark[$key]) ? $placemark[$key] : '';
          }

          $quoted = array();
          foreach ( $row as $field ) {
              $quoted[] = '"' . str_replace( '"', '""', $field ) . '"';
          }
          $line = implode( ',', $quoted ) . "\n";

          echo $line;
          fwrite( $fh2, $line );
      }

      fclose( $fh );
      fclose( $fh2 );
  }
  161.  
  162. ?>

Report this snippet  

You need to log in to post a comment.