Published in: Perl
For a while I used this to scrape weather.com. Then they changed their HTML and my script broke.
#!/usr/bin/perl
package NoFace;

use strict;
use warnings;

#use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use Scalar::Util ();

#****************************************************************
# SYNOPSIS:                                                     *
#****************************************************************
# NOFACE is a script to grab strings off arbitrary web pages
# based on a regular expression.
#****************************************************************
# get_facts
#   Reads fact definitions, one per line, from the files named in
#   @ARGV (or from STDIN when @ARGV is empty) and returns a hash of
#   FACT => scraped string.
#****************************************************************
# headlines FACT, REGEX, URI, QUERY_STRING;
#   Can be called either as a plain sub or as a method.
# EXAMPLE:
# my $monster = NoFace->new;
# print $monster->headlines('high', '.*<b CLASS=obsTempTextA>(.\d*°F)</b>.*', 'http://www.weather.com/weather/local/11215', 'x=19&lswe=11215&lswa=WeatherLocalUndeclared&GO=GO&whatprefs=&y=7');

#****************************************************************
# METHODS                                                       *
#****************************************************************

################################################################
# _is_invocant CANDIDATE
#   Private helper: true when CANDIDATE is a blessed NoFace object
#   (or an object of a subclass). Used so the public subs work both
#   as functions and as methods.
sub _is_invocant {
    my ($candidate) = @_;
    return Scalar::Util::blessed($candidate)
        && $candidate->isa(__PACKAGE__) ? 1 : 0;
}

################################################################
# get_facts
#   If this doesn't make sense, look at the headlines() sub: every
#   non-blank input line supplies the four arguments of one
#   headlines() call.
#
#   NOTE(review): the original never parsed the input line at all, so
#   the on-disk format is not known. Tab-separated fields
#   (FACT, REGEX, URI, QUERY_STRING) are assumed here because REGEX
#   and QUERY_STRING may legitimately contain spaces -- confirm
#   against real input files.
#
#   Returns: a hash (as a list) mapping each FACT name to whatever
#   headlines() returned for that line.
sub get_facts {
    my %headline;

    LINE:
    while (my $line = <>) {
        chomp $line;
        next LINE if $line =~ /\A\s*\z/;    # skip blank lines

        # Limit of 4 keeps any stray tabs inside QUERY_STRING intact.
        my @fact = split /\t/, $line, 4;
        if (@fact < 3) {
            warn "get_facts: skipping malformed line $.: $line\n";
            next LINE;
        }

        $headline{ $fact[0] } = headlines(@fact);
    }

    return %headline;
}

################################################################
# headlines FACT, REGEX, URI, QUERY_STRING
#   POSTs QUERY_STRING to URI and matches the response body against
#   REGEX.
#
#   FACT          label for the value being scraped (not used here;
#                 get_facts uses it as the hash key)
#   REGEX         pattern applied to the page; capture group 1 is the
#                 value returned (the whole match when the pattern
#                 has no capture group)
#   URI           address to POST to
#   QUERY_STRING  request body; may be omitted
#
#   Returns: the captured string on success, otherwise a
#   "REGEX ERROR" or "BROWSER ERROR" message string.
sub headlines {
    my @args = @_;

    # Allow $obj->headlines(...) as well as headlines(...).
    shift @args if _is_invocant($args[0]);
    my ($fact, $regex, $uri, $query_string) = @args;

    my $ua = LWP::UserAgent->new;
    $ua->agent('MSIE/6.0 ' . $ua->agent);

    # Create a request
    my $req = HTTP::Request->new(POST => $uri);
    #$req->content_type('application/x-www-form-urlencoded');
    $req->content(defined $query_string ? $query_string : '');

    # Pass request to the user agent and get a response back
    my $res = $ua->request($req);

    # Check the outcome of the response
    my $webfact;
    if ($res->is_success) {
        my $page = $res->content;
        if ($page =~ /$regex/) {
            # Prefer the first capture group; fall back to the whole
            # match (via the offset arrays, avoiding $&).
            $webfact = defined $1
                ? $1
                : substr($page, $-[0], $+[0] - $-[0]);
        }
        else {
            $webfact = "REGEX ERROR:\n$uri\n did not match regex:\n $regex\n";
        }
    }
    else {
        $webfact = "BROWSER ERROR:\n$uri \nnot found!\n";
    }

    return $webfact;
}

################################################################
# new
#   Constructor. Takes no arguments beyond the class and returns an
#   empty NoFace object (a blessed hash reference).
sub new {
    my ($class) = @_;
    $class = __PACKAGE__ unless defined $class;

    my $self = {};
    #$self->{START_XML_TAG} = "";    #start regex

    return bless $self, ref($class) || $class;
}

1;
You need to log in to post a comment.
