/ Published in: PHP
NOTE: This is still very much a work in progress and is rather "hackish".
This class parses and extracts information from websites using XPath expressions and, optionally, regular expressions.
Here's a quick example of its usage...
// Usage example: download a Progress Quest ranking page and pull the columns
// out of the table row that contains the <td class="sel"> cell.
$scraper = new ScreenScraper();
$scraper->getData('http://progressquest.com/pemptus.php?name=Etzem');

// Every query is relative to the row that owns the 'sel' cell.
$row = '//td[@class=\'sel\']/parent::*';

// Plain-text columns, listed in table order (column 1 first).
$columns = array(
    'Rank',
    'Name',
    'Race',
    'Class',
    'Level',
    'Prime Stat',
    'Plot Stage',
    'Prized Item',
    'Specialty',
    'Motto',
);

$queries = array();
foreach ($columns as $index => $label) {
    $queries[$label] = $row . '/td[' . ($index + 1) . ']';
}

// Column 11 holds a link: its text is the guild name, and the numeric id is
// extracted from the link's href with a regular expression.
$queries['Guild_name'] = $row . '/td[11]/a';
$queries['Guild_id'] = array(
    'xpath' => $row . '/td[11]/a/@href',
    'match' => '/[^?]+\?id=(?P<id>[0-9]+)/i',
);

$results = $scraper->query($queries);

// Print one "Label: value" line per query.
foreach ($results as $label => $value) {
    printf("%s: %s\n", $label, $value);
}
Outputs something like...
Rank: 42856
Name: Etzem
Race: Half Man
Class: Shiv-Knight
Level: 24
Prime Stat: CHA 62
Plot Stage: Act II
Prized Item: -2 Diamond Mail Greaves
Specialty: Good Move VI
Motto: Blue monkeys; the only monkeys for me.
Guild_name: pants
Guild_id: 1203
There are also offset and limit parameters for the found items, but they're fairly self-explanatory.
Expand |
Embed | Plain Text
/**
 * Downloads a web page (with a simple file cache) and extracts values from it
 * using XPath expressions, optionally refined by a regular expression.
 *
 * NOTE(review): the published copy of this class had lost every `if (...)`
 * condition and several statements. The conditions below were reconstructed
 * from the surviving structure and from the usage notes (xpath / match /
 * offset / limit) -- confirm against the author's original where possible.
 */
class ScreenScraper
{
    /** URL of the page to download. */
    public $url;

    /** Raw HTML of the most recently loaded page; read by query(). */
    public $data;

    /** Not used by any method of this class. */
    public $xpaths;

    // Download options

    /** Path of the cookie file handed to cURL (read and written). */
    public $cookie = 'cookie.txt';

    /** User-Agent header sent with the request. */
    public $useragent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.1.1) Gecko/20090715 Firefox/3.5.1 GTB5 (.NET CLR 4.0.20506)";

    /** Maximum age of a cached page, in seconds. */
    public $cache_time = '300';

    /** Directory prefix for cached pages; keep the trailing slash. */
    public $cache_dir = 'cache/';

    /** Path of the cache file used by the last getData() call. */
    public $cache_file;

    /** Optional Referer header. */
    public $referer;

    /** Optional POST body (string or array); when set the request is a POST. */
    public $postParams;

    /**
     * Loads the page at $url (or at the previously set $this->url) into
     * $this->data, using the cached copy when it is younger than $cache_time.
     *
     * @param string|null $url Page to fetch; null re-uses $this->url.
     * @return false|null false when no URL is known or the download failed,
     *                    nothing (null) on success.
     */
    public function getData($url = null)
    {
        if ($url) {
            $this->url = $url;
        }

        // Validate first, so a call without a URL touches no files at all.
        if (!$this->url) {
            return false;
        }

        // Make sure the cookie file exists
        if (!file_exists($this->cookie)) {
            file_put_contents($this->cookie, '');
        }

        // Caching is best-effort: without a usable directory we just download.
        $dir = (string) $this->cache_dir;
        $canCache = $dir === '' || is_dir($dir) || mkdir($dir, 0777, true);

        // POST bodies are part of the key so different submissions to the
        // same URL do not share a cache entry.
        $key = $this->url;
        if ($this->postParams) {
            $key .= '|' . (is_array($this->postParams)
                ? http_build_query($this->postParams)
                : (string) $this->postParams);
        }
        $this->cache_file = $dir . md5($key);

        $file = false;
        $isFresh = is_file($this->cache_file)
            && (time() - filemtime($this->cache_file)) < (int) $this->cache_time;
        if ($isFresh) {
            $file = file_get_contents($this->cache_file);
        }

        if ($file === false) {
            $file = $this->download();
            if ($file === false) {
                // Never cache or expose a failed transfer.
                return false;
            }
            if ($canCache) {
                file_put_contents($this->cache_file, $file);
            }
        }

        $this->data = $file;
    }

    /**
     * Runs a set of XPath queries against $this->data.
     *
     * Each query is either an XPath string or an array with the keys
     * 'xpath' (required), 'match' (PCRE pattern applied to every found item;
     * the first capture group is kept, or the whole match when the pattern
     * has no group), 'offset' (number of leading items to skip) and 'limit'
     * (maximum number of items to use).
     *
     * @param array $queries Map of label => query.
     * @return array|false Map of label => value, where value is a string for
     *                     exactly one result, an array for several and null
     *                     for none; false when $queries is not an array or
     *                     no page data has been loaded.
     */
    public function query($queries)
    {
        if (!is_array($queries)) {
            return false;
        }
        if (!is_string($this->data) || $this->data === '') {
            return false;
        }

        // Real-world markup is rarely valid; collect parser errors quietly
        // instead of suppressing them with @, then restore the old setting.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($this->data);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $domXPath = new DOMXPath($doc);

        $data = array();
        foreach ($queries as $label => $query) {
            $spec = is_array($query) ? $query : array('xpath' => $query);
            $nodes = $this->collect($domXPath, $spec);

            $found = count($nodes);
            if ($found === 0) {
                $data[$label] = null;
            } elseif ($found === 1) {
                $data[$label] = $nodes[0];
            } else {
                $data[$label] = $nodes;
            }
        }

        return $data;
    }

    /**
     * Returns the inner HTML of a node: the serialised markup of all of its
     * children. A text node is returned as its own serialised text.
     *
     * @param DOMNode $element Node taken from a parsed document.
     * @return string
     */
    public function __xpathHTML($element)
    {
        $owner = $element->ownerDocument;

        if ($element instanceof DOMText) {
            return (string) $owner->saveHTML($element);
        }

        $innerHTML = '';
        if ($element->hasChildNodes()) {
            foreach ($element->childNodes as $child) {
                $innerHTML .= $owner->saveHTML($child);
            }
        }

        return $innerHTML;
    }

    /** Performs the HTTP request for $this->url; returns the body or false. */
    private function download()
    {
        $ch = curl_init($this->url);
        if ($ch === false) {
            return false;
        }

        $ch_opts = array(
            CURLOPT_URL => $this->url,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_MAXREDIRS => 3,
            CURLOPT_COOKIEFILE => $this->cookie,
            CURLOPT_COOKIEJAR => $this->cookie,
            CURLOPT_USERAGENT => $this->useragent,
        );
        if ($this->referer) {
            $ch_opts[CURLOPT_REFERER] = $this->referer;
        }
        if ($this->postParams) {
            $ch_opts[CURLOPT_POST] = true;
            $ch_opts[CURLOPT_POSTFIELDS] = $this->postParams;
        }
        curl_setopt_array($ch, $ch_opts);

        // The handle is released when $ch goes out of scope on return.
        return curl_exec($ch);
    }

    /** Evaluates one query spec and returns the list of extracted values. */
    private function collect($domXPath, $spec)
    {
        $nodes = array();
        if (!isset($spec['xpath'])) {
            return $nodes;
        }

        $items = $domXPath->query($spec['xpath']);
        if ($items === false) {
            // Malformed expression: treat as "nothing found".
            return $nodes;
        }

        $offset = isset($spec['offset']) ? max(0, (int) $spec['offset']) : 0;
        $limit = isset($spec['limit']) ? (int) $spec['limit'] : null;

        $position = 0;
        $used = 0;
        foreach ($items as $item) {
            if ($position++ < $offset) {
                continue;
            }
            if ($limit !== null && $used >= $limit) {
                break;
            }
            $used++;

            $node = $this->tidy($this->__xpathHTML($item));

            if (!isset($spec['match'])) {
                $nodes[] = $node;
                continue;
            }
            if (!preg_match_all($spec['match'], $node, $matches)) {
                continue;
            }

            // Prefer the first capture group, else the whole match; a lone
            // hit is unwrapped so callers get a string rather than an array.
            $hits = isset($matches[1]) ? $matches[1] : $matches[0];
            $nodes[] = count($hits) === 1 ? $hits[0] : $hits;
        }

        return $nodes;
    }

    /** Turns non-breaking spaces into spaces and collapses whitespace runs. */
    private function tidy($html)
    {
        // NOTE(review): the published source showed a space-for-space
        // replacement here; presumably it was a non-breaking space -- confirm.
        $html = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $html);

        // Runs are tested before single newlines so "\n  " becomes one space.
        return trim((string) preg_replace('%\s{2,}|[\r\n]%', ' ', $html));
    }
}
You need to login to post a comment.
