Posted By

on 09/06/08


Tagged

rss perl scraping apod astronomypictureoftheday nasa


Versions (?)

APOD Scrape to RSS


 / Published in: Perl
 

URL: http://antwrp.gsfc.nasa.gov/apod/

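Scrapes the Astronomy Picture of the Day archive index, downloads the first 20 entries listed there with a parallel user agent, and packages them into a (messy) RSS 2.0 feed printed to standard output.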

  #!/usr/bin/perl -w

  # Scrape the Astronomy Picture of the Day (APOD) archive and print an
  # RSS 2.0 feed to standard output.
  #
  # Steps:
  #   1. Fetch the archive index and extract date, link and title per entry.
  #   2. Register the first $max_items entry pages with the parallel user
  #      agent and wait for all of them.
  #   3. For every page that was fetched successfully, use its second and
  #      third <p> elements as the item description and add an RSS item.
  #
  # Pages that cannot be fetched or do not have the expected structure are
  # skipped with a warning instead of aborting the whole feed.

  use strict;
  use warnings;
  use LWP::Simple;
  use LWP::Parallel::UserAgent;
  use HTTP::Request;
  use XML::RSS;
  use HTML::TreeBuilder 3.0;
  use Date::Manip;

  # Upper bound on the number of archive entries placed in the feed.
  my $max_items = 20;

  my $rss = XML::RSS->new(version => '2.0');
  my $url = "http://antwrp.gsfc.nasa.gov/apod/";
  my $pua = LWP::Parallel::UserAgent->new();

  # NOTE(review): RSS 2.0 spells the element "webMaster"; the lowercase key
  # and empty value are kept exactly as in the original -- confirm intent.
  $rss->channel(
      title       => "Astronomy Picture of the Day",
      link        => $url,
      language    => "en",
      description => "Discover the cosmos! Each day a different image or photograph of our fascinating universe is featured, along with a brief explanation written by a professional astronomer.",
      webmaster   => "",
      ttl         => "480"
  );

  # Every day is commented out, so this call passes no arguments.
  $rss->skipDays(
      # day => "Sunday",
      # day => "Monday",
      # day => "Tuesday",
      # day => "Wednesday",
      # day => "Thursday",
      # day => "Friday",
      # day => "Saturday"
  );

  # get() returns undef on failure; stop here rather than parse nothing.
  my $archive = get("http://antwrp.gsfc.nasa.gov/apod/archivepix.html");
  defined $archive
      or die "Could not fetch the APOD archive index\n";

  # Absolute entry URLs in the order they appear in the archive index, plus
  # the date text and title of each entry keyed by that URL.
  my @links;
  my %dates;
  my %titles;

  # Each entry looks like:  <date>: <a href="<page>"><title></a>
  # Any amount of whitespace is accepted between the colon and the anchor.
  while ($archive =~ m{(.*?):\s+<a href="(.*?)">(.*?)</a>}g) {
      my ($date, $href, $title) = ($1, $2, $3);
      my $link = $url . $href;

      $dates{$link}  = $date;
      $titles{$link} = $title;
      push(@links, $link);
  }

  @links
      or die "No entries found in the APOD archive index\n";

  # Never index past the end of @links when the archive is short.
  my $last_index = $#links < $max_items - 1 ? $#links : $max_items - 1;
  my @wanted     = @links[0 .. $last_index];

  for my $link (@wanted) {
      # register() returns an error response when the request is rejected.
      if (my $error = $pua->register(HTTP::Request->new(GET => $link))) {
          warn "Could not register $link: " . $error->message() . "\n";
      }
  }

  my $entries = $pua->wait() || {};

  # Index the successful responses by URL so the items can be emitted in
  # archive order; the hash returned by wait() has no meaningful order.
  my %response_for;
  for my $entry (values %$entries) {
      my $response = $entry->response();
      next unless $response && $response->is_success();

      $response_for{ $response->base()->as_string() } = $response;
  }

  for my $podurl (@wanted) {
      my $response = $response_for{$podurl};
      unless ($response) {
          warn "No usable response for $podurl; skipping\n";
          next;
      }

      my $article = HTML::TreeBuilder->new_from_content(
          $response->content()
      );

      # The description is the second and third <p> element of the page.
      my @chunks = $article->look_down("_tag", "p");
      my $description;
      if (@chunks >= 3) {
          $description = $chunks[1]->as_HTML() . $chunks[2]->as_HTML();
      }

      # Release the parse tree explicitly; older HTML::TreeBuilder releases
      # do not free it when the variable goes out of scope.
      $article->delete();

      unless (defined $description) {
          warn "Unexpected page structure for $podurl; skipping\n";
          next;
      }

      # "%g" is Date::Manip's RFC 822 style format; UnixDate yields undef
      # when the archive date text cannot be parsed.
      my $date = UnixDate(ParseDate($dates{$podurl}), "%g");

      $rss->add_item(
          title       => $titles{$podurl},
          description => $description,
          link        => $podurl,
          (defined $date ? (pubDate => $date) : ()),
          category    => "science",
          author      => "Robert Nemiroff, Jerry Bonnell",
      );
  }

  print $rss->as_string();

Report this snippet  

You need to login to post a comment.