Posted By

on 09/06/08


Tagged

rss perl scraping engrish


Versions (?)

Engrish Scrape to RSS


 / Published in: Perl
 

URL: http://www.engrish.com/

Scrapes the most recent pictures from Engrish.com and prints them as an RSS 2.0 feed.

  #!/usr/bin/perl -w
  #
  # Scrape the "recent" page of engrish.com and print an RSS 2.0 feed of the
  # pictures linked from it to STDOUT.
  #
  # For every picture link found on recent.php the picture page is fetched and
  # its image, caption, title and quip are turned into one feed item. Pages
  # that cannot be fetched, or that contain no picture, are skipped with a
  # warning instead of aborting the whole feed.

  use strict;
  use warnings;
  use LWP::Simple;
  use XML::RSS;
  use HTML::TreeBuilder 3.0;
  use Date::Manip;

  # Every relative link and image path on the site is resolved against this.
  my $base_url = "http://www.engrish.com/";

  my $rss = XML::RSS->new(version => '2.0');

  $rss->channel(
      title       => "Engrish",
      link        => $base_url,
      language    => "en",
      description => "",
      webmaster   => "",
      ttl         => "300",
  );

  # Uncomment the days on which aggregators should not poll the feed.
  $rss->skipDays(
      # day => "Sunday",
      # day => "Monday",
      # day => "Tuesday",
      # day => "Wednesday",
      # day => "Thursday",
      # day => "Friday",
      # day => "Saturday"
  );

  # Return the text of the first element below $tree that matches @criteria
  # (HTML::Element look_down criteria), or '' when no element matches.
  sub text_of {
      my ($tree, @criteria) = @_;
      my $element = $tree->look_down(@criteria);
      return defined $element ? $element->as_text() : '';
  }

  # Collect the picture links from the "recent" page.
  my $recent_url  = $base_url . "recent.php";
  my $recent_html = get($recent_url);
  die "Could not fetch $recent_url\n" unless defined $recent_html;

  my $recent = HTML::TreeBuilder->new_from_content($recent_html);
  my @pictures;
  foreach my $tdtag ($recent->look_down("_tag", "td", "bgcolor", "\#CCCCCC", "width", "120", "height", "120")) {
      my $anchor = $tdtag->look_down("_tag", "a");
      next unless defined $anchor;

      # Scalar assignment: in list context attr_get_i may return several values.
      my $href = $anchor->attr_get_i("href");
      push @pictures, $href if defined $href;
  }
  $recent->delete;    # release the parse tree

  PICTURE:
  foreach my $picture (@pictures) {
      my $picture_url = $base_url . $picture;

      # The link carries the publication date as its trailing "date=" parameter.
      my ($raw_date) = ($picture =~ m/date=(.*?)$/);
      my $date = defined $raw_date ? UnixDate(ParseDate($raw_date), "%g") : undef;

      my $page = get($picture_url);
      unless (defined $page) {
          warn "Could not fetch $picture_url, skipping\n";
          next PICTURE;
      }

      my $item  = HTML::TreeBuilder->new_from_content($page);
      my $image = $item->look_down("_tag", "img", "src", qr/image\/engrish\//);
      unless (defined $image) {
          warn "No picture found on $picture_url, skipping\n";
          $item->delete;
          next PICTURE;
      }

      # The page uses a site-relative image path; make it absolute for readers.
      $image->attr("src", $base_url . $image->attr_get_i("src"));
      my $image_html = $image->as_HTML();

      my $caption   = text_of($item, "_tag", "em");
      my $title     = text_of($item, "_tag", "title");
      my $quip_text = text_of($item, "_tag", "font", "face", "Times New Roman, Times, serif");
      $item->delete;    # release the parse tree

      # Keep only the part before "Photo"; fall back to the whole text when
      # that marker is absent.
      my ($quip) = ($quip_text =~ m/(.*?)Photo/);
      $quip = $quip_text unless defined $quip;

      my %fields = (
          title       => $title,
          # The CDATA section must be closed, otherwise XML::RSS entity-encodes
          # the markup and the literal "<![CDATA[" ends up in the feed text.
          description => "<![CDATA[<p align=\"center\">$quip<br />$image_html<br />$caption</p>]]>",
          link        => $picture_url,
          category    => "humor",
          author      => "",
      );
      # Only emit pubDate when the link contained a date that could be parsed.
      $fields{pubDate} = $date if defined $date;

      $rss->add_item(%fields);
  }

  print $rss->as_string();

Report this snippet  

You need to login to post a comment.