Posted By

on 09/06/08


Tagged

rss scraping mother3translation


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

techdetours


Mother 3 Translation Blog RSS Tidy Up


 / Published in: Perl
 

URL: http://mother3.fobby.net/

This one I like. It scrapes the blog and fixes its RSS feed to include images, video, mini updates and whatnot. Images are busted and I'm too lazy to fix them; that is left as an exercise for the reader.

  #!/usr/bin/perl -w

  # Mother 3 Translation Blog RSS tidy-up.
  #
  # Downloads the blog's FeedBurner feed, fetches every linked post in
  # parallel, and replaces each item's <content:encoded> body with the post
  # HTML scraped from the page (plus any highlighted "Mato" comments).
  # The rewritten feed is printed to STDOUT; progress goes to STDERR.
  #
  # Items whose page cannot be fetched, matched or scraped keep their
  # original description instead of being emptied or aborting the run.

  use strict;
  use warnings;
  use LWP::Simple;
  use HTML::TreeBuilder;
  use LWP::Parallel::UserAgent;
  use HTTP::Request;
  use XML::TreeBuilder;
  use Encode;

  my $feed_url = "http://feeds.feedburner.com/Mother3FanTranslation?format=xml";

  # The feed text comes out of the XML parser as character strings, so give
  # STDOUT an explicit encoding rather than relying on Perl's default.
  binmode(STDOUT, ":encoding(UTF-8)");

  my $ua = LWP::Parallel::UserAgent->new();
  print STDERR "Grabbing feed XML...\n";
  my $xml = get($feed_url);
  defined $xml
      or die "Could not download the feed from $feed_url\n";

  my $atomfeed = XML::TreeBuilder->new();
  $atomfeed->parse($xml);

  # Maps each post URL to the feed <item> element that links to it.
  my %entries;

  print STDERR "Scraping links from XML...\n";
  foreach my $item ($atomfeed->look_down("_tag", "item")) {
      my $link_node = $item->look_down("_tag", "link");
      unless ($link_node) {
          warn "Feed item without a <link>; leaving it untouched.\n";
          next;
      }

      my $url = $link_node->as_text();
      $entries{$url} = $item;

      print STDERR "Registering $url...\n";
      # register() returns a response object only when it could NOT queue
      # the request, so a true value here means failure.
      if (my $failure = $ua->register(HTTP::Request->new(GET => $url))) {
          warn "\tCould not register $url: " . $failure->status_line() . "\n";
      }
  }

  print STDERR "Downloading HTML...";
  my $pages = $ua->wait();
  print STDERR "done!\n";

  foreach my $entry (values %{ $pages || {} }) {
      my $response = $entry->response();
      next unless $response;

      my $base = $response->base();
      unless (defined $base) {
          warn "Response without a base URL; skipping.\n";
          next;
      }
      my $url = $base->as_string();

      print STDERR "Processing $url...\n";

      # The base URL differs from the feed link after a redirect, in which
      # case there is no item to update.
      my $item = $entries{$url};
      unless ($item) {
          warn "\tNo feed item matches $url (redirected?); skipping.\n";
          next;
      }

      unless ($response->is_success()) {
          warn "\tDownload failed (" . $response->status_line()
              . "); keeping original description.\n";
          next;
      }

      my $description = $item->look_down("_tag", "content:encoded");
      unless ($description) {
          warn "\tItem has no <content:encoded> element; skipping.\n";
          next;
      }

      print STDERR "\tGrabbing relavent HTML via regular expression...\n";
      # Decode once and use the decoded text for both the regex and the tree.
      my $page = decode_utf8($response->content());
      # The post body sits between the first "meta" paragraph and the
      # "Posted ..." meta paragraph that closes the entry.
      my ($fragment) =
          ($page =~ m{<p class="meta">.*?</p>(.*?)<p class="meta">Posted}s);
      unless (defined $fragment) {
          warn "\tPost body not found in page; keeping original description.\n";
          next;
      }

      my $page_tree = HTML::TreeBuilder->new_from_content($page);

      my $comments = "<br /><br />Mato Comments:<br />";

      print STDERR "\tFinding Mato comments...\n";
      # Mato's comments are the <div>s whose inline style mentions DCB6B6.
      foreach my $comment_div ($page_tree->look_down("_tag", "div", "style", qr/DCB6B6/)) {
          print STDERR "\t\tFound a comment!\n";
          foreach my $paragraph ($comment_div->look_down("_tag", "p")) {
              $comments .= "<blockquote>" . $paragraph->as_HTML() . "</blockquote>";
          }
      }
      # HTML::TreeBuilder trees must be deleted explicitly to free them on
      # older HTML::Tree releases.
      $page_tree->delete();

      my $blogtree = HTML::TreeBuilder->new_from_content($fragment . $comments);

      print STDERR "\tCleaning up HTML for Liferea...\n";
      foreach my $hrdiv ($blogtree->look_down("_tag", "div", "class", "hr")) {
          $hrdiv->delete();
      }

      # Only drop the old description once the replacement is ready.
      print STDERR "\tDeleting description content...\n";
      $description->delete_content();

      print STDERR "\tPushing content to $url entry...\n";
      $description->push_content($blogtree->as_HTML());
      $blogtree->delete();
  }

  print $atomfeed->as_XML();
  $atomfeed->delete();

Report this snippet  

You need to login to post a comment.