Posted By

on 09/06/08


Tagged

rss scraping girlwithaonetrackmind


Versions (?)

Girl With a One Track Mind Scraper


 / Published in: Perl
 

URL: http://girlwithaonetrackmind.blogspot.com/

I didn't bother polishing it, but it works well enough to scrape Girl With a One Track Mind.

  #!/usr/bin/perl

  # Full-text feed builder for the Girl With a One Track Mind blog.
  #
  # Downloads the blog's Atom feed, fetches every linked post page in
  # parallel, and replaces each entry's <summary> with the HTML of the
  # post body (the <div class="blogPost"> element, minus the byline and
  # any <script> elements).  The rewritten feed is printed to STDOUT as
  # UTF-8 encoded XML.
  #
  # Entries whose page cannot be fetched or parsed are left untouched and
  # a warning is written to STDERR, so one bad post does not abort the run.

  use strict;
  use warnings;
  use LWP::Simple;
  use LWP::Parallel::UserAgent;
  use HTTP::Request;
  use XML::TreeBuilder;
  use HTML::TreeBuilder 3.0;
  use Encode;

  my $feedurl = "http://girlwithaonetrackmind.blogspot.com/feeds/posts/default";

  # LWP::Simple::get() returns undef on any failure; stop here rather than
  # handing undef to the XML parser.
  my $feed_source = get($feedurl);
  die "Could not fetch feed $feedurl\n" unless defined $feed_source;

  my $xml = XML::TreeBuilder->new();
  $xml->parse($feed_source);

  # Maps each post URL to the feed <entry> element it came from.
  my %entry_for;
  my $pua = LWP::Parallel::UserAgent->new();

  foreach my $entry ($xml->look_down("_tag", "entry")) {
      # An Atom entry may carry several <link> elements (replies, edit,
      # self, ...).  The post page is the rel="alternate" one; fall back to
      # the first link only when no alternate link is present.
      my $link = $entry->look_down("_tag", "link", "rel", "alternate");
      $link = $entry->look_down("_tag", "link") unless defined $link;

      my $entryurl = defined $link ? $link->attr_get_i("href") : undef;
      unless (defined $entryurl) {
          warn "Skipping feed entry without a usable link\n";
          next;
      }

      $entry_for{$entryurl} = $entry;

      # register() returns nothing on success and an error response otherwise.
      my $request = HTTP::Request->new(GET => $entryurl);
      if (my $failure = $pua->register($request)) {
          warn "Could not register $entryurl: " . $failure->message() . "\n";
      }
  }

  # Blocks until every registered request has finished.
  my $results = $pua->wait();

  foreach my $result (values(%$results)) {
      my $response = $result->response();
      next unless defined $response;

      # Work out which feed entry this response belongs to.  The base URL
      # is tried first; the request URLs along the redirect chain are tried
      # as well, because base() changes when the server redirects or the
      # page declares its own base.
      my @candidates;
      my $base = $response->base();
      push @candidates, $base->as_string() if defined $base;
      for (my $hop = $response; defined $hop; $hop = $hop->previous()) {
          my $request = $hop->request();
          push @candidates, $request->uri()->as_string() if defined $request;
      }
      my ($url) = grep { exists $entry_for{$_} } @candidates;

      unless (defined $url) {
          warn "No feed entry matches the response for "
              . (@candidates ? $candidates[0] : "an unknown URL") . "\n";
          next;
      }

      unless ($response->is_success()) {
          warn "Fetching $url failed: " . $response->status_line() . "\n";
          next;
      }

      my $summary = $entry_for{$url}->look_down("_tag", "summary");
      unless (defined $summary) {
          warn "Feed entry for $url has no <summary> element\n";
          next;
      }

      # Parse decoded characters rather than raw bytes so that non-ASCII
      # text is not mangled; fall back to the raw body if decoding fails.
      my $markup = $response->decoded_content();
      $markup = $response->content() unless defined $markup;
      my $page = HTML::TreeBuilder->new_from_content($markup);

      my $byline = $page->look_down("_tag", "div", "class", "byline");
      $byline->delete() if defined $byline;

      foreach my $script ($page->look_down("_tag", "script")) {
          $script->delete();
      }

      # Only discard the existing summary once a replacement is in hand.
      my $post_body = $page->look_down("_tag", "div", "class", "blogPost");
      if (defined $post_body) {
          $summary->delete_content();
          $summary->push_content($post_body->as_HTML());
          $summary->attr("type", "html");
      }
      else {
          warn "No blogPost element found in $url\n";
      }

      # Release the parsed page explicitly so the tree is freed.
      $page->delete();
  }

  print encode("utf8", $xml->as_XML());

Report this snippet  

You need to login to post a comment.