/ Published in: Perl
URL: http://www.theonion.com/content/index/
Scrapes The Onion's front-page infographic into a one-item RSS 2.0 feed. Old and busted.
Expand |
Embed | Plain Text
#!/usr/bin/perl -w
#
# Scrape The Onion's front-page infographic and print it to STDOUT as a
# one-item RSS 2.0 feed.
#
# Steps:
#   1. Fetch the front page and locate the infographic's link.
#   2. Fetch the infographic page and read its "date" and "category"
#      <meta> tags.
#   3. Restyle the story <div> and embed its HTML as the item description.
#   4. Print the finished feed.
#
# A failed download or a missing page element aborts the script with a
# message naming what could not be found, instead of the generic
# "Can't call method ... on an undefined value".

use strict;
use warnings;

use LWP::Simple;
use XML::RSS;
use HTML::TreeBuilder 3.0;
use Date::Manip;

my $SITE_ROOT     = "http://www.theonion.com";
my $FRONTPAGE_URL = "$SITE_ROOT/content/index/";

# fetch_tree($url)
#   Downloads $url and parses the body into an HTML::TreeBuilder tree.
#   Dies if the download fails (LWP::Simple::get returns undef then).
#   The caller owns the returned tree and should ->delete() it when done.
sub fetch_tree {
    my ($url) = @_;

    my $html = get($url);
    die "Could not fetch $url\n" unless defined $html;

    return HTML::TreeBuilder->new_from_content($html);
}

# find_first($node, $what, @criteria)
#   Returns the first element under $node matching the look_down()
#   @criteria.  Dies with a message mentioning $what if nothing matches.
sub find_first {
    my ($node, $what, @criteria) = @_;

    # Scalar context: look_down() yields the first match or undef.
    my $found = $node->look_down(@criteria);
    die "Could not find $what\n" unless defined $found;

    return $found;
}

my $rss = XML::RSS->new(version => '2.0');

$rss->channel(
    title       => "Infographic: The Onion",
    language    => "en",
    description => "",
    webmaster   => "",
    ttl         => "300",
);

# NOTE(review): the "day" key is passed twice.  If XML::RSS stores these
# arguments as a hash, only the last value ("Saturday") survives --
# confirm against the XML::RSS documentation before relying on both days.
$rss->skipDays(
    day => "Sunday",
    # day => "Monday",
    # day => "Tuesday",
    # day => "Wednesday",
    # day => "Thursday",
    # day => "Friday",
    day => "Saturday",
);

# --- Step 1: find the infographic link on the front page ---------------
my $frontpage = fetch_tree($FRONTPAGE_URL);

my $teaser = find_first($frontpage, "the infographic <div> on the front page",
    "_tag", "div", "id", "infographic");
my $link = find_first($teaser, "the infographic image link",
    "_tag", "a", "class", "image");

my $href = $link->attr_get_i("href");
die "Infographic link has no href attribute\n" unless defined $href;

# The href is appended to the site root as-is, i.e. it is assumed to be
# site-relative ("/content/...") -- TODO confirm against the live page.
my $url = $SITE_ROOT . $href;

$frontpage->delete();

# --- Step 2: read metadata from the infographic page -------------------
my $infographic = fetch_tree($url);

my $raw_date = find_first($infographic, "the date <meta> tag",
    "_tag", "meta", "name", "date")->attr_get_i("content");
die "The date <meta> tag has no content attribute\n"
    unless defined $raw_date;

# "%g" is the format string handed to Date::Manip for the pubDate value.
my $date = UnixDate(ParseDate($raw_date), "%g");
die "Could not parse date '$raw_date'\n"
    unless defined $date && length $date;

my $category = find_first($infographic, "the category <meta> tag",
    "_tag", "meta", "name", "category")->attr_get_i("content");
$category = "" unless defined $category;

# --- Step 3: restyle the story and capture it as HTML ------------------
my $element = find_first($infographic, "the story <div>",
    "_tag", "div", "class", "story text_and_image");
$element->attr("style", "width:500px");

# Float the image to the right when there is one; a story without an
# image is still worth publishing.
if (my $img = $element->look_down("_tag", "img")) {
    $img->attr("align", "right");
}

# The script as found also looped over the <p> tags inside the "bulleted"
# <div> with an empty body, and ran a substitution that replaced "&" with
# "&".  Neither had any effect, so both are omitted here.

my $title = find_first($element, "the story title",
    "_tag", "h2", "class", "title")->as_text();

my $description = $element->as_HTML();

$infographic->delete();

# --- Step 4: emit the feed ---------------------------------------------
$rss->add_item(
    title       => $title,
    description => "<![CDATA[$description]]>",
    pubDate     => $date,
    category    => $category,
    author      => "",
);

print $rss->as_string;
You need to login to post a comment.
