Published in: Perl
Scrapes the recent pictures page of Engrish.com (http://www.engrish.com/recent.php) into an RSS 2.0 feed.
Expand |
Embed | Plain Text
#!/usr/bin/perl -w use strict; use LWP::Simple; use XML::RSS; use HTML::TreeBuilder 3.0; use Date::Manip; my $rss = XML::RSS->new(version => '2.0'); $rss->channel( title => "Engrish", language => "en", description => "", webmaster => "", ttl => "300" ); $rss->skipDays( # day => "Sunday", # day => "Monday", # day => "Tuesday", # day => "Wednesday", # day => "Thursday", # day => "Friday", # day => "Saturday" ); # Some kind of loop here to grab feeds. my $recent = HTML::TreeBuilder->new_from_content( get("http://www.engrish.com/recent.php")); my @pictures; foreach my $tdtag ($recent->look_down("_tag", "td", "bgcolor", "\#CCCCCC", "width", "120", "height", "120")) { } foreach my $picture (@pictures) { (my $date) = ($picture =~ m/date=(.*?)$/); $date = UnixDate(ParseDate($date), "%g"); my $item = HTML::TreeBuilder->new_from_content(get("http://www.engrish.com/$picture")); my $image = $item->look_down("_tag", "img", "src", qr/image\/engrish\//); $image->attr("src", "http://www.engrish.com/" . $image->attr_get_i("src")); $image = $image->as_HTML(); my $caption = $item->look_down("_tag", "em")->as_text(); my $title = $item->look_down("_tag", "title")->as_text(); my $quip = $item->look_down("_tag", "font", "face", "Times New Roman, Times, serif")->as_text(); ($quip) = ($quip =~ m/(.*?)Photo/); $rss->add_item( title => "$title", description => "<