Posted By

on 09/06/08


Tagged

rss scraping theonion infographic


Versions (?)

The Onion Infographic Scrape to RSS


 / Published in: Perl
 

URL: http://www.theonion.com/content/index/

Scrapes the current infographic from The Onion's front page and prints it as an RSS 2.0 feed. Old and busted.

  #!/usr/bin/perl

  # Scrape the current infographic from The Onion's front page and print a
  # one-item RSS 2.0 feed describing it on standard output.
  #
  # Flow:
  #   1. Fetch the front page and find the link inside div#infographic.
  #   2. Fetch the linked page and pull out its date, category, title and the
  #      "story text_and_image" block.
  #   3. Emit that block as the (CDATA-wrapped) description of a single item.
  #
  # The script dies with a descriptive message when a page cannot be fetched
  # or a required element is missing, instead of failing on an undefined value.

  use strict;
  use warnings;
  use LWP::Simple;
  use XML::RSS;
  use HTML::TreeBuilder 3.0;
  use Date::Manip;

  my $SITE           = "http://www.theonion.com";
  my $FRONT_PAGE_URL = "$SITE/content/index/";

  # Fetch $page_url and return it parsed as an HTML::TreeBuilder tree.
  # Dies if the page cannot be retrieved (LWP::Simple::get returns undef).
  sub fetch_tree {
      my ($page_url) = @_;

      my $html = get($page_url);
      die "Could not fetch $page_url\n" unless defined $html;

      return HTML::TreeBuilder->new_from_content($html);
  }

  # Return the first element at or below $root matching the look_down
  # @criteria. Dies, naming $what, when nothing matches.
  sub require_element {
      my ($root, $what, @criteria) = @_;

      my $found = $root->look_down(@criteria);
      die "Could not find $what; the page layout may have changed\n"
          unless defined $found;

      return $found;
  }

  my $rss = XML::RSS->new(version => '2.0');

  $rss->channel(
      title       => "Infographic: The Onion",
      link        => "http://www.theonion.com/",
      language    => "en",
      description => "",
      webmaster   => "",
      ttl         => "300",
  );

  # NOTE(review): both pairs below use the same key ("day"). If XML::RSS
  # stores these arguments in a hash, only the last one ("Saturday") will
  # survive -- verify against the installed XML::RSS version.
  $rss->skipDays(
      day => "Sunday",
      # day => "Monday",
      # day => "Tuesday",
      # day => "Wednesday",
      # day => "Thursday",
      # day => "Friday",
      day => "Saturday"
  );

  # --- Step 1: locate the infographic link on the front page ---------------

  my $frontpage = fetch_tree($FRONT_PAGE_URL);

  my $panel = require_element(
      $frontpage, "the infographic panel on the front page",
      "_tag", "div", "id", "infographic",
  );
  my $anchor = require_element(
      $panel, "the infographic image link",
      "_tag", "a", "class", "image",
  );

  # Scalar assignment: attr_get_i returns a list of values in list context.
  my $href = $anchor->attr_get_i("href");
  die "The infographic link has no href attribute\n" unless defined $href;

  my $url = $SITE . $href;

  $frontpage->delete();

  # --- Step 2: extract the item data from the infographic page -------------

  my $infographic = fetch_tree($url);

  # Date and category are optional: when they are missing or unparsable the
  # corresponding RSS field is simply left out of the item.
  my $date_meta = $infographic->look_down("_tag", "meta", "name", "date");
  my $raw_date  = defined $date_meta ? $date_meta->attr_get_i("content") : undef;
  my $date      = defined $raw_date ? UnixDate(ParseDate($raw_date), "%g") : undef;

  my $category_meta = $infographic->look_down("_tag", "meta", "name", "category");
  my $category      = defined $category_meta ? $category_meta->attr_get_i("content") : undef;

  my $element = require_element(
      $infographic, "the infographic story block",
      "_tag", "div", "class", "story text_and_image",
  );

  # Presentation tweaks applied to the embedded HTML.
  $element->attr("style", "width:500px");
  my $image = $element->look_down("_tag", "img");
  $image->attr("align", "right") if defined $image;

  my $title = require_element(
      $element, "the infographic title",
      "_tag", "h2", "class", "title",
  )->as_text();

  my $description = $element->as_HTML();

  # Undo the escaping of ampersands in the serialized HTML. The trailing
  # semicolon must be part of the match, otherwise "&amp;" turns into "&;".
  $description =~ s/&amp;/&/g;

  $infographic->delete();

  # --- Step 3: build and print the feed ------------------------------------

  my %item = (
      title       => $title,
      description => "<![CDATA[$description]]>",
      link        => $url,
      author      => "",
  );
  $item{pubDate}  = $date     if defined $date && length $date;
  $item{category} = $category if defined $category;

  $rss->add_item(%item);

  print $rss->as_string();

Report this snippet  

You need to login to post a comment.