Posted By

on 09/06/08


Tagged

rss perl scraping achewood webcomics


Versions (?)

Achewood Scrape to RSS


 / Published in: Perl
 

URL: http://www.achewood.com/

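Generates an RSS 2.0 feed from Achewood: one item for the latest comic strip (image plus its title text) and one item for the status note on the site's front page.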

  #!/usr/bin/perl
  #
  # Achewood Scrape to RSS
  #
  # Fetches the mobile Achewood page and the Achewood front page, and prints
  # an RSS 2.0 document to STDOUT containing two items:
  #   1. the current comic (image with its title text), and
  #   2. the status note shown on the front page.
  #
  # Dies with a descriptive message if the comic page cannot be fetched or
  # no longer has the expected elements.  A missing status note only
  # produces a warning, so the comic item is still published.

  use strict;
  use warnings;
  use LWP::Simple;
  use XML::RSS;
  use HTML::TreeBuilder 3.0;
  use Date::Manip;

  my $COMIC_URL  = "http://m.assetbar.com/achewood/";
  my $IMAGE_HOST = "http://m.assetbar.com";
  my $SITE_URL   = "http://www.achewood.com/";

  # Download $url and return it parsed as an HTML::TreeBuilder tree.
  # Dies if the download fails (LWP::Simple::get returns undef in that case).
  sub fetch_tree {
      my ($url) = @_;

      my $html = get($url);
      die "Could not fetch $url\n" unless defined $html;

      return HTML::TreeBuilder->new_from_content($html);
  }

  # Return $element unchanged, or die naming $what when it is undefined.
  # Used after tree lookups so that a layout change gives a readable error
  # instead of "Can't call method ... on an undefined value".
  sub require_element {
      my ($element, $what) = @_;

      die "Could not find $what; the page layout may have changed\n"
          unless defined $element;

      return $element;
  }

  # Escape $text for use inside a double-quoted HTML attribute.
  # Escaping ">" also guarantees the value can never contain "]]>", which
  # would otherwise end the CDATA section the attribute is embedded in.
  sub escape_attr {
      my ($text) = @_;

      $text = '' unless defined $text;
      $text =~ s/&/&amp;/g;
      $text =~ s/"/&quot;/g;
      $text =~ s/</&lt;/g;
      $text =~ s/>/&gt;/g;

      return $text;
  }

  # Turn the "month/day" found in $status into a "month/day/year" string.
  # The page gives no year, so the year of $now (epoch seconds, defaults to
  # the current time) is used, stepping back one year when that month/day
  # would otherwise lie in the future.  Returns undef (empty list in list
  # context) when $status contains no "digits/digits" pair.
  sub status_date_string {
      my ($status, $now) = @_;

      $now = time unless defined $now;
      return unless defined $status && $status =~ m{(\d+)/(\d+)};
      my ($month, $day) = ($1, $2);

      my ($cur_day, $cur_month, $cur_year) = (localtime $now)[3, 4, 5];
      $cur_month += 1;       # localtime months are 0-based
      $cur_year  += 1900;    # localtime years are offsets from 1900

      my $in_future = $month > $cur_month
          || ($month == $cur_month && $day > $cur_day);
      my $year = $in_future ? $cur_year - 1 : $cur_year;

      return "$month/$day/$year";
  }

  my $rss = XML::RSS->new(version => '2.0');

  # The channel description is four paragraphs separated by blank lines.
  my $channel_description = join "\n\n",
      q{"Achewood," like wormwood, was used by antebellum slaves in the production of "achewater," a long-since outmoded and outlawed Southern beverage.},
      q{Drinkers of achewater experienced hallucinations and euphoria, but the after-effects of the liquor produced a deep and lasting melancholy (hence its name).},
      q{Modern science has confirmed that achewood oil, the active ingredient in achewater, is a powerful depressant which causes irreversible neurological damage.},
      q{Achewater is generally thought to have inspired many Southern folk songs and fables, such as "The Story of Poor John Ritch," "Sullivan's Bear and Dried Bird" and "I'm Following a Little Round Lord."};

  $rss->channel(
      title       => "Achewood",
      link        => $SITE_URL,
      language    => "en",
      description => $channel_description,
      # RSS 2.0 names this element webMaster (camel case); the lower-case
      # spelling is the RSS 0.91 key -- confirm against the XML::RSS docs.
      webMaster   => 'chris@achewood.com',
      ttl         => "300"
  );

  # NOTE(review): the "day" key is repeated.  If XML::RSS stores these
  # arguments in a hash, only the last value ("Saturday") survives.  Newer
  # XML::RSS releases may accept an array reference for several days --
  # verify against the installed version before changing this call.
  $rss->skipDays(
      day => "Sunday",
      # day => "Monday",
      # day => "Tuesday",
      # day => "Wednesday",
      # day => "Thursday",
      # day => "Friday",
      day => "Saturday"
  );

  # --- Item 1: the current comic, scraped from the mobile page -------------

  my $tree = fetch_tree($COMIC_URL);

  # Lookups are assigned to scalars so that look_down/find return a single
  # element (or undef) rather than a list of every match.
  my $date_element = $tree->look_down("_tag", "span", "class", "date");
  require_element($date_element, "the comic date");

  # Parse once, then format twice (pubDate and the dated permalink).
  my $comic_date = ParseDate($date_element->as_text());
  die "Could not parse the comic date\n" unless $comic_date;
  my $date       = UnixDate($comic_date, "%g");
  my $link_stamp = UnixDate($comic_date, "%m%d%Y");

  my $title_element = $tree->find("title");
  require_element($title_element, "the page title");
  my $title = $title_element->as_text();

  # The comic is taken to be the first <img> carrying both src and title.
  my $image_element =
      $tree->look_down("_tag", "img", "src", qr/(.*)/, "title", qr/(.*)/);
  require_element($image_element, "the comic image");

  my $image_src  = $image_element->attr_get_i("src");
  my $image_text = $image_element->attr_get_i("title");
  my $image      = escape_attr($IMAGE_HOST . $image_src);
  my $alttext    = escape_attr($image_text);

  $tree->delete();

  $rss->add_item(
      title       => $title,
      description => "<![CDATA[<img src=\"$image\" title=\"$alttext\" />]]>",
      link        => "http://www.achewood.com/index.php?date=" . $link_stamp,
      pubDate     => $date
  );

  # --- Item 2: the status note on the front page ---------------------------

  $tree = fetch_tree($SITE_URL);

  my @cells = $tree->look_down(
      "_tag", "td", "bgcolor", "#ffffff", "align", "left",
      "cellpadding", "5", "colspan", "2"
  );

  # The status note is expected in the third matching cell, with its
  # heading in a <b> element.
  my $status_cell    = $cells[2];
  my $status_heading = defined $status_cell ? $status_cell->find("b") : undef;

  if (defined $status_heading) {
      my $status = $status_heading->as_text();

      # Remove the heading from the tree so the description is only the
      # text that follows it.
      $status_heading->delete();
      my $description = $status_cell->as_text();

      my %status_item = (
          title       => $status,
          description => $description,
          link        => $SITE_URL
      );

      # Only publish a pubDate when the heading carried a usable date.
      my $status_date = status_date_string($status);
      if (defined $status_date) {
          my $pub_date = UnixDate(ParseDate($status_date), "%g");
          $status_item{pubDate} = $pub_date
              if defined $pub_date && length $pub_date;
      }

      $rss->add_item(%status_item);
  }
  else {
      warn "Could not find the status note on $SITE_URL; skipping that item\n";
  }

  $tree->delete();

  print $rss->as_string();

Report this snippet  

You need to login to post a comment.