Posted By

noah on 07/03/07


Tagged

regex http html 2003 agent aggregator watch track scraping


Versions (?)

Who likes this?

2 people have marked this snippet as a favorite

wirenaught
icecreamboyy


scraper


 / Published in: Perl
 

For a while I used this to scrape weather.com. Then they changed their HTML and my script broke.

  1. #! /usr/bin/perl -w
  2. use strict;
  3. #use LWP::Simple;
  4. use LWP::UserAgent;
  5.  
  6. package NoFace;
  7.  
  8. #****************************************************************
  9. # SYNOPSIS: *
  10. #****************************************************************
  11.  
  12. # NOFACE is a script to grab strings off arbitrary web pages based on a regular expression
  13.  
  14. #****************************************************************
  15. # get_facts FILE
  16.  
  17.  
  18. #****************************************************************
  19. #HEADLINES currently can't be called as a method, only internally as a sub
  20. #which is fine.
  21.  
  22. # headlines FACT, REGEX, URI, QUERY_STRING;
  23.  
  24. # EXAMPLE:
  25. # my $monster = new NoFace;
  26. # print $monster -> headlines ('high', '.*<b CLASS=obsTempTextA>(.\d*&deg;F)</b>.*', 'http://www.weather.com/weather/local/11215', 'x=19&lswe=11215&lswa=WeatherLocalUndeclared&GO=GO&whatprefs=&y=7');
  27.  
  28.  
  29.  
  30. #****************************************************************
  31. # METHODS *
  32. #****************************************************************
  33.  
  34. ################################################################
  35. #if this doesn't make sense, look at the headlines() method
  # get_facts FILE
  #
  # Reads FILE, which is expected to hold one tab-delimited record per line
  # (the fields are passed straight to headlines() as its argument list), and
  # returns a flat key/value list mapping the first field of every record to
  # whatever headlines() returned for that record.
  #
  # May be called as a plain sub or on an object: a leading reference in the
  # argument list is treated as the invocant and discarded.
  #
  # If FILE is missing or cannot be opened, a warning is issued and an empty
  # list is returned.
  sub get_facts {
      shift if ref $_[0];    # tolerate $obj->get_facts($file)
      my ($file) = @_;
      my %headline;

      if (!defined $file) {
          warn "get_facts: no file name given\n";
          return %headline;
      }

      # Three-argument open on a lexical handle: FILE is always treated as a
      # literal path, and the caller's @ARGV is left untouched.
      my $fh;
      if (!open($fh, '<', $file)) {
          warn "get_facts: can't open $file: $!\n";
          return %headline;
      }

      while (my $line = <$fh>) {
          # Strip the line ending first so the blank-line test below can
          # actually succeed; also drops a stray CR from CRLF files.
          $line =~ s/\r?\n\z//;
          next if $line eq '';

          # Limit -1 keeps empty trailing fields so the field positions
          # handed to headlines() stay aligned with the record layout.
          my @fact = split /\t/, $line, -1;
          $headline{ $fact[0] } = headlines(@fact);
      }
      close $fh;

      return %headline;
  }
  51.  
  52. ################################################################
  53.  
  # headlines FACT, REGEX, URI, QUERY_STRING
  #
  # Sends QUERY_STRING as the body of an HTTP POST to URI and applies
  # s{REGEX}{$1}igs to the response body, i.e. every match of REGEX is
  # replaced by its first capture group. REGEX is compiled exactly as given.
  # FACT is accepted so a whole record from get_facts() can be passed
  # through, but it is not used here.
  #
  # Returns:
  #   - the rewritten page when the request succeeds and REGEX matched;
  #   - a "REGEX ERROR:" message when REGEX is missing/empty or did not match;
  #   - a "BROWSER ERROR:" message when URI is missing/empty or the request
  #     was not successful.
  #
  # May be called as a plain sub or on an object: a leading reference in the
  # argument list is treated as the invocant and discarded, so that
  # $obj->headlines(FACT, REGEX, URI, QUERY_STRING) lines the arguments up.
  sub headlines {
      shift if ref $_[0];
      my ($fact, $regex, $uri, $query_string) = @_;

      # Guard the inputs before doing any network work. An empty pattern is
      # rejected explicitly because Perl treats an empty regex as "reuse the
      # last successful pattern", which would match against the wrong thing.
      $uri   = '' unless defined $uri;
      $regex = '' unless defined $regex;
      return "BROWSER ERROR:\n$uri \nnot found!\n" if $uri eq '';
      return "REGEX ERROR:\n$uri\n did not match regex:\n $regex\n" if $regex eq '';

      my $ua = LWP::UserAgent->new;
      $ua->agent('MSIE/6.0 ' . $ua->agent);

      # Create a request; an omitted QUERY_STRING is sent as an empty body.
      my $req = HTTP::Request->new(POST => $uri);
      #$req->content_type('application/x-www-form-urlencoded');
      $req->content(defined $query_string ? $query_string : '');

      # Pass request to the user agent and get a response back
      my $res = $ua->request($req);
      return "BROWSER ERROR:\n$uri \nnot found!\n" unless $res->is_success;

      my $page = $res->content;
      return $page if $page =~ s{$regex}{$1}igs;

      return "REGEX ERROR:\n$uri\n did not match regex:\n $regex\n";
  }
  78.  
  79. ################################################################
  80.  
  # new
  #
  # Constructor: returns an empty hash reference blessed into the class it
  # was invoked on.
  sub new {
      my ($class) = @_;

      # No attributes are set; a START_XML_TAG slot was sketched here once
      # but left disabled.
      return bless {}, $class;
  }
  89.  
  90.  
  91.  
  92.  
  93.  
  94.  
  95.  

Report this snippet  

You need to login to post a comment.