We Recommend

Learning Perl
In this smooth, carefully paced course, a leading Perl trainer teaches you to program in the language that threatens to make C, sed, awk, and the Unix shell obsolete for many tasks. This book is the "official" guide for both formal (classroom) and informal learning. It is fully accessible to the novice programmer.


Posted By

noah on 07/03/07


Tagged

regex http html scrape 2003 agent aggregator watch track


Versions (?)


scraper


Published in: Perl 


For a while I used this to scrape weather.com. Then they changed their HTML and my script broke.

  1. #! /usr/bin/perl -w
  2. use strict;
  3. #use LWP::Simple;
  4. use LWP::UserAgent;
  5.  
  6. package NoFace;
  7.  
  8. #****************************************************************
  9. # SYNOPSIS: *
  10. #****************************************************************
  11.  
  12. # NOFACE is a script to grab strings off arbitrary web pages based on a regular expression
  13.  
  14. #****************************************************************
  15. # get_facts FILE
  16.  
  17.  
  18. #****************************************************************
  19. #HEADLINES currently cant be called as a method, only internally as a sub
  20. #which is fine.
  21.  
  22. # headlines FACT, REGEX, URI, QUERY_STRING;
  23.  
  24. # EXAMPLE:
  25. # my $monster = new NoFace;
  26. # print $monster -> headlines ('high', '.*<b CLASS=obsTempTextA>(.\d*&deg;F)</b>.*', 'http://www.weather.com/weather/local/11215', 'x=19&lswe=11215&lswa=WeatherLocalUndeclared&GO=GO&whatprefs=&y=7');
  27.  
  28.  
  29.  
  30. #****************************************************************
  31. # METHODS *
  32. #****************************************************************
  33.  
  34. ################################################################
  35. #if this doesn't make sense, look at the headlines() method
  # get_facts FILE
  #
  # Reads FILE, a tab-delimited list with one fact per line whose fields are
  # passed, in order, to headlines() (FACT, REGEX, URI, QUERY_STRING).
  #
  # Returns a hash, as a flat key/value list, mapping the first field of each
  # line (the fact name) to whatever headlines() returned for that line.
  # If FILE is missing or cannot be opened, a warning is issued and an empty
  # list is returned.
  #
  # May be called as a plain sub or as a method on a blessed object.
  sub get_facts {
      # Drop the invocant when called as $obj->get_facts(FILE); without this
      # the object itself would be used as the file name.
      require Scalar::Util;
      shift if Scalar::Util::blessed($_[0]);

      my ($file) = @_;
      my %headline;

      if (!defined $file) {
          warn "get_facts: no file name given\n";
          return %headline;
      }

      # Three-argument open on a lexical handle.  The previous
      # "$ARGV[0] = shift; while (<>)" overwrote the caller's @ARGV, kept
      # reading any further files already listed there, and let characters
      # such as '|' or '>' in the name be interpreted as an open mode.
      open(my $fh, '<', $file) or do {
          warn "Can't open $file: $!\n";
          return %headline;
      };

      while (my $line = <$fh>) {
          chomp $line;

          # Skip blank lines.  The old test compared the line to "" before
          # chomp, so it never fired and blank lines triggered a request
          # with no URI.
          next if $line !~ /\S/;

          my @fact = split /\t/, $line;
          $headline{ $fact[0] } = headlines(@fact);
      }
      close $fh;

      return %headline;
  }
  51.  
  52. ################################################################
  53.  
  # headlines FACT, REGEX, URI, QUERY_STRING
  #
  # Sends QUERY_STRING as the body of an HTTP POST to URI and applies REGEX
  # to the page that comes back.
  #
  #   FACT         - name of the fact; accepted for positional compatibility
  #                  with the tab-delimited file format but not used here.
  #   REGEX        - pattern applied case-insensitively, with '.' matching
  #                  newlines.  Every match is replaced by its first capture
  #                  group, so a pattern that spans the whole page (as in the
  #                  weather.com example) leaves only the captured text.
  #   URI          - address to POST to.
  #   QUERY_STRING - request body.
  #
  # Returns the substituted page text on success, or a string starting with
  # "REGEX ERROR:" (page fetched but pattern did not match) or
  # "BROWSER ERROR:" (request was not successful).
  #
  # May be called as a plain sub or as a method on a blessed object.
  sub headlines {
      # Drop the invocant when called as $obj->headlines(...), as the
      # example usage in this file does; otherwise every argument would be
      # shifted by one position.
      require Scalar::Util;
      shift if Scalar::Util::blessed($_[0]);

      my ($fact, $regex, $uri, $query_string) = @_;
      my $webfact;

      my $ua = LWP::UserAgent->new;
      $ua->agent("MSIE/6.0 " . $ua->agent);

      # Create a request
      my $req = HTTP::Request->new(POST => $uri);
      # NOTE(review): the content type was left disabled by the original
      # author; kept that way so the request on the wire is unchanged.
      #$req->content_type('application/x-www-form-urlencoded');
      $req->content($query_string);

      # Pass request to the user agent and get a response back
      my $res = $ua->request($req);

      # Check the outcome of the response
      if ($res->is_success) {
          my $page = $res->content;
          if ($page =~ s{$regex}{$1}igs) {
              $webfact = $page;
          }
          else {
              $webfact = "REGEX ERROR:\n$uri\n did not match regex:\n $regex\n";
          }
      }
      else {
          $webfact = "BROWSER ERROR:\n$uri \nnot found!\n";
      }

      return $webfact;
  }
  78.  
  79. ################################################################
  80.  
  # new
  #
  # Constructor: returns a fresh, empty hash-based object blessed into the
  # class it was invoked on.  Takes no further arguments.
  sub new {
      my ($class) = @_;

      # Disabled placeholder carried over from the original:
      #   $self->{START_XML_TAG} = ""; #start regex
      return bless {}, $class;
  }
  89.  
  90.  
  91.  
  92.  
  93.  
  94.  
  95.  

Report this snippet 

You need to login to post a comment.