Posted By

Keef on 02/01/10


Tagged

php scrape


Versions (?)

Who likes this?

2 people have marked this snippet as a favorite

wirenaught
Terion


Screenscraper


 / Published in: PHP
 

NOTE: This is still very much a work in progress and is rather "hackish".

This class parses and extracts information from websites using xpaths and optionally also regular expressions.

Here's a quick example of its usage...

// Fetch the page; getData() reuses its on-disk copy while that copy is still fresh.
$ss = new ScreenScraper();
$ss->getData('http://progressquest.com/pemptus.php?name=Etzem');

// Every field is read from the table row that contains the cell with class "sel".
$row = '//td[@class=\'sel\']/parent::*';

// Plain-text columns in table order: the first label maps to td[1], the last to td[10].
$columns = array(
    'Rank',
    'Name',
    'Race',
    'Class',
    'Level',
    'Prime Stat',
    'Plot Stage',
    'Prized Item',
    'Specialty',
    'Motto',
);

$queries = array();
foreach ($columns as $index => $column) {
    $queries[$column] = $row. '/td['. ($index + 1). ']';
}

// The guild cell holds a link: its text is the name, and the id is pulled out
// of the href with a regular expression.
$queries['Guild_name'] = $row. '/td[11]/a';
$queries['Guild_id'] = array(
    'xpath' => $row. '/td[11]/a/@href',
    'match' => '/[^?]+\?id=(?P<id>[0-9]+)/i',
);

$data = $ss->query($queries);

foreach ($data as $label => $text) {
    echo $label, ': ', $text, "\n";
}

Outputs something like...

Rank: 42856
Name: Etzem
Race: Half Man
Class: Shiv-Knight
Level: 24
Prime Stat: CHA 62
Plot Stage: Act II
Prized Item: -2 Diamond Mail Greaves
Specialty: Good Move VI
Motto: Blue monkeys; the only monkeys for me.
Guild_name: pants
Guild_id: 1203

There are also offset and limit parameters for the found items, but they're fairly self-explanatory.

  /**
   * Downloads a web page (with a small on-disk cache) and extracts values from
   * it using XPath expressions, optionally refined by regular expressions.
   *
   * Typical use: call getData() with a URL, then query() with a map of
   * label => XPath string (or label => options array).
   */
  class ScreenScraper
  {
      /** @var string|null URL of the page to fetch. */
      public $url;

      /** @var string|false|null Raw body of the last fetched page. */
      public $data;

      /** @var mixed Not used by this class. */
      public $xpaths;

      // Download options

      /** @var string Path of the cookie jar file handed to cURL. */
      public $cookie = 'cookie.txt';

      /** @var string User-Agent header sent with each request. */
      public $useragent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.1.1) Gecko/20090715 Firefox/3.5.1 GTB5 (.NET CLR 4.0.20506)";

      /** @var string|int Maximum age of a cached page, in seconds. */
      public $cache_time = '300';

      /** @var string Directory holding cached pages; an empty value disables caching. */
      public $cache_dir = 'cache/';

      /** @var string|null Cache file used by the last getData() call, or null when caching was unavailable. */
      public $cache_file;

      /** @var string|null Optional Referer header. */
      public $referer;

      /** @var string|array|null Optional POST fields, passed to CURLOPT_POSTFIELDS. */
      public $postParams;

      /**
       * Fetches the page at $url (or at the previously set $this->url) into
       * $this->data, reusing the cached copy while it is younger than
       * $cache_time seconds.
       *
       * @param string|null $url URL to fetch; when empty, $this->url is used.
       * @return bool True when a non-empty body was obtained, false otherwise.
       */
      public function getData($url = null)
      {
          if (!empty($url)) {
              $this->url = $url;
          }

          // Make sure the cookie file exists
          if (!file_exists($this->cookie)) {
              file_put_contents($this->cookie, '');
          }

          if (empty($this->url)) {
              return false;
          }

          $this->cache_file = $this->cachePath();

          if ($this->cache_file !== null && $this->cacheIsFresh($this->cache_file)) {
              $file = file_get_contents($this->cache_file);
          } else {
              $file = $this->download();

              // Only cache real content: storing a failed or empty download
              // would make the failure stick until the cache entry expires.
              if ($this->cache_file !== null && is_string($file) && $file !== '') {
                  file_put_contents($this->cache_file, $file);
              }
          }

          $this->data = $file;

          return !empty($this->data);
      }

      /**
       * Runs a set of XPath queries against the page loaded by getData().
       *
       * Each entry of $queries is either an XPath string or an array with:
       *   - 'xpath'  (required) the XPath expression;
       *   - 'match'  (optional) a PCRE pattern applied to each found item; the
       *              first capture group is returned, or the whole match when
       *              the pattern has no capture group;
       *   - 'offset' (optional) number of leading found items to skip;
       *   - 'limit'  (optional) stop after this many found items, counted from
       *              the first found item (not from 'offset').
       *
       * @param array $queries Map of label => XPath string or options array.
       * @return array|false Map of label => null (nothing found), a single
       *                     value (one item) or a list of values (several
       *                     items); false when no page data is loaded.
       */
      public function query($queries)
      {
          if (empty($this->data)) {
              return false;
          }

          // Real-world HTML is rarely well-formed; collect libxml's parse
          // warnings internally instead of emitting them, then restore the
          // caller's setting.
          $doc = new DOMDocument();
          $previous = libxml_use_internal_errors(true);
          $doc->loadHTML($this->data);
          libxml_clear_errors();
          libxml_use_internal_errors($previous);

          $domXPath = new DOMXPath($doc);
          $data = array();

          foreach ($queries as $label => $query) {
              // Normalise the short string form to the options-array form so
              // the option lookups below never index into a string.
              $options = is_array($query) ? $query : array('xpath' => $query);
              $offset = isset($options['offset']) ? $options['offset'] : 0;
              $limit = isset($options['limit']) ? $options['limit'] : null;
              $pattern = !empty($options['match']) ? $options['match'] : null;

              $nodes = array();
              $items = $domXPath->query($options['xpath']);

              // DOMXPath::query() yields false for an invalid expression.
              if ($items !== false) {
                  $count = 0;

                  foreach ($items as $item) {
                      if ($count >= $offset) {
                          $node = trim(preg_replace("%(?:\n\r|\n|\s{2,})%is", " ", str_replace('&nbsp;', ' ', $this->__xpathHTML($item))));

                          if ($pattern === null) {
                              $nodes[] = $node;
                          } else {
                              $found = preg_match_all($pattern, $node, $matches);

                              if ($found !== false && !empty($matches)) {
                                  // Prefer the first capture group; fall back
                                  // to the whole match for group-less patterns.
                                  $hits = isset($matches[1]) ? $matches[1] : $matches[0];
                                  $nodes[] = $this->collapse($hits);
                              }
                          }
                      }

                      if ($limit !== null && ($count + 1) >= $limit) {
                          break;
                      }

                      $count++;
                  }
              }

              $data[$label] = $this->collapse($nodes);
          }

          return $data;
      }

      /**
       * Returns the inner HTML of a DOM node: the serialised markup of each of
       * its children, individually trimmed and concatenated.
       *
       * @param DOMNode $element Node whose children are serialised.
       * @return string Inner HTML; an empty string when the node has no children.
       */
      public function __xpathHTML($element)
      {
          $innerHTML = '';

          foreach ($element->childNodes as $child) {
              // Serialise each child through a throw-away document so only the
              // child's own markup is produced.
              $domdoc = new DOMDocument();
              $domdoc->appendChild($domdoc->importNode($child, true));
              $innerHTML .= trim($domdoc->saveHTML());
          }

          return $innerHTML;
      }

      /**
       * Builds the cache file path for the current request, creating the cache
       * directory when it does not exist yet.
       *
       * @return string|null Absolute cache file path, or null when caching is
       *                     disabled or the directory cannot be used.
       */
      private function cachePath()
      {
          if ($this->cache_dir === null || $this->cache_dir === '') {
              return null;
          }

          if (!is_dir($this->cache_dir) && !mkdir($this->cache_dir, 0777, true)) {
              return null;
          }

          $dir = realpath($this->cache_dir);
          if ($dir === false) {
              return null;
          }

          // Array POST fields cannot be concatenated into the key directly;
          // flatten them so different payloads get different cache entries.
          $post = is_array($this->postParams) ? http_build_query($this->postParams) : $this->postParams;

          return $dir. '/'. sha1($this->url. $this->cookie. $post. $this->useragent);
      }

      /**
       * Tells whether a cache file exists and is younger than $cache_time seconds.
       *
       * @param string $path Cache file path.
       * @return bool
       */
      private function cacheIsFresh($path)
      {
          if (!file_exists($path)) {
              return false;
          }

          $modified = filemtime($path);

          return $modified !== false && (time() - $modified) < (int) $this->cache_time;
      }

      /**
       * Downloads $this->url with cURL using the configured cookie jar, user
       * agent and, when set, POST fields and referer.
       *
       * @return string|false Response body, or false when the transfer failed.
       */
      private function download()
      {
          $ch = curl_init($this->url);
          if ($ch === false) {
              return false;
          }

          $ch_opts = array(
              CURLOPT_URL => $this->url,
              CURLOPT_FOLLOWLOCATION => true,
              CURLOPT_RETURNTRANSFER => true,
              CURLOPT_MAXREDIRS => 3,
              CURLOPT_COOKIEFILE => $this->cookie,
              CURLOPT_COOKIEJAR => $this->cookie,
              CURLOPT_USERAGENT => $this->useragent,
          );

          if (!empty($this->postParams)) {
              $ch_opts[CURLOPT_POSTFIELDS] = $this->postParams;
          }
          if (!empty($this->referer)) {
              $ch_opts[CURLOPT_REFERER] = $this->referer;
          }

          curl_setopt_array($ch, $ch_opts);

          $file = curl_exec($ch);
          curl_close($ch);

          return $file;
      }

      /**
       * Collapses a list of results to the shape query() returns: null for an
       * empty list, the sole value for a one-element list, otherwise the list.
       *
       * @param array $values Zero-indexed list of results.
       * @return mixed
       */
      private function collapse(array $values)
      {
          if (count($values) > 1) {
              return $values;
          }

          return isset($values[0]) ? $values[0] : null;
      }
  }

Report this snippet  

You need to login to post a comment.