Posted By

on 09/06/08


Tagged

rss html perl scraping newyorktimes nyt


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

techdetours


New York Times Scrape to RSS


 / Published in: Perl
 

URL: http://topics.nytimes.com/top/reference/timestopics/index.html

Crawls all over the NYT RSS feed to glue entire articles together for your enjoyment. Requires registration.

Pick a feed URL from http://topics.nytimes.com/top/reference/timestopics/index.html and have at it.

perl nyt.pl --user=aksdhf --pass=aksdfhasouidf --url=http://asiudhfasdfj

  1. #!/usr/bin/perl -w
  2.  
  3. use strict;
  4. use LWP::Simple;
  5. use HTML::TreeBuilder;
  6. use LWP::Parallel::UserAgent;
  7. use WWW::Mechanize;
  8. use XML::TreeBuilder;
  9. use Getopt::Long;
  10. use HTTP::Cookies;
  11. use Encode;
  12.  
  # Command-line options; all three are required.
  #   --user  account name submitted in the login form
  #   --pass  password submitted in the login form
  #   --url   address of the RSS feed to download
  my $username;
  my $password;
  my $feedurl;

  my $usage = "Usage: $0 --user=NAME --pass=PASSWORD --url=FEED_URL\n";

  # GetOptions() returns false when the command line could not be parsed.
  GetOptions(
      "user=s" => \$username,
      "pass=s" => \$password,
      "url=s"  => \$feedurl,
  ) or die($usage);

  # Fail early instead of submitting undef credentials or fetching an undef URL.
  foreach my $required ($username, $password, $feedurl) {
      die($usage) unless defined($required) && length($required);
  }
  21.  
  # Log in through the login form. The cookie jar created here is handed to
  # the browser object so the session cookies end up in $cookiejar.
  print STDERR "Getting login page...\n";

  my $cookiejar = HTTP::Cookies->new();
  my $mech      = WWW::Mechanize->new();

  $mech->cookie_jar($cookiejar);
  $mech->agent_alias('Linux Mozilla');

  $mech->get("http://www.nytimes.com/auth/login");

  # Field names as they appear in the form named "login".
  my %credentials = (
      USERID   => $username,
      PASSWORD => $password,
  );

  my $loginresponse = $mech->submit_form(
      form_name => 'login',
      fields    => \%credentials,
  );

  $loginresponse->is_success()
      or die("Error logging in!\n");

  print STDERR "Logged in successfully!\n";
  47.  
  # Parallel user agent used to download the articles. It shares the login
  # cookies, and redirect(1) asks it to follow redirects.
  my $pua = LWP::Parallel::UserAgent->new();
  $pua->cookie_jar($cookiejar);
  $pua->redirect(1);


  print STDERR "Getting XML...\n";

  # LWP::Simple::get() returns undef on failure; stop here rather than handing
  # undef to the XML parser.
  my $xml = get($feedurl);
  die("Error downloading feed!\n") unless defined($xml);

  my $feed = XML::TreeBuilder->new();
  $feed->parse($xml);

  # Maps each article URL (query string removed) to its feed <item> element.
  my %entries;
  58.  
  print STDERR "Grabbing links...\n";

  # Queue one GET per feed item and remember which item each URL belongs to.
  foreach my $item ($feed->look_down("_tag", "item")) {
      # An item without a <link> cannot be fetched; skip it instead of
      # calling a method on undef.
      my $linkelem = $item->look_down("_tag", "link");
      unless ($linkelem) {
          print STDERR "Item without a link, skipping...\n";
          next;
      }

      my $link = $linkelem->as_text();
      $link =~ s/^\s+|\s+$//g;    # surrounding whitespace is not part of the URL
      $link =~ s/\?.*//;          # drop the query string
      next unless length($link);

      print STDERR "Registering $link...\n";

      my $request = HTTP::Request->new(GET => $link);

      # register() hands back an error response when it refuses the request
      # and a false value otherwise.
      if (my $failure = $pua->register($request)) {
          print STDERR "Could not register $link: ", $failure->message(), "\n";
          next;
      }

      $entries{$link} = $item;
  }
  76.  
  # Resolve an href taken from a downloaded page: URLs that already carry an
  # http/https scheme are returned unchanged, anything else gets the site
  # host prepended.
  sub _nyt_url {
      my ($href) = @_;
      return $href if $href =~ m{^https?://}i;
      return "http://www.nytimes.com" . $href;
  }

  # List the URLs a response answers for: its base URL first, then the
  # request URL of the response itself and of every earlier response
  # reachable through previous().
  sub _response_urls {
      my ($response) = @_;
      my @urls;

      my $base = $response->base();
      push(@urls, $base->as_string()) if $base;

      for (my $hop = $response; $hop; $hop = $hop->previous()) {
          my $request = $hop->request() or next;
          my $uri = $request->uri();
          push(@urls, "$uri") if defined($uri);
      }

      return @urls;
  }

  # Remove the clutter divs from one parsed page and append the HTML of its
  # "image" divs to $description. The images are collected before the
  # "articleInline" div is removed, so images inside it are still picked up.
  sub _strip_clutter_and_add_images {
      my ($tree, $description) = @_;

      foreach my $class ("enlargeThis", "nextArticleLink") {
          foreach my $clutter ($tree->look_down("_tag", "div", "class", $class)) {
              $clutter->delete();
          }
      }

      foreach my $image ($tree->look_down("_tag", "div", "class", "image")) {
          $description->push_content($image->as_HTML());
      }

      if (my $inline = $tree->look_down("_tag", "div", "id", "articleInline")) {
          $inline->delete();
      }

      return;
  }

  # Append the HTML of every "articlebody" div in $tree to $description.
  # When $strip_marker is true, "(Page N of M)" markers are removed first.
  sub _add_article_body {
      my ($tree, $description, $strip_marker) = @_;

      foreach my $body ($tree->look_down("_tag", "div", "id", "articlebody")) {
          my $content = $body->as_HTML();
          $content =~ s/\(Page \d+ of \d+\)//g if $strip_marker;
          $description->push_content($content);
      }

      return;
  }

  print STDERR "Downloading HTML...\n";
  my $html = $pua->wait();

  # Replace each feed item's <description> with the full article text,
  # including any continuation pages.
  ARTICLE:
  foreach my $entry (values(%{ $html || {} })) {
      my $response = $entry->response();

      # A redirect can make the final URL differ from the registered one, so
      # every URL in the response chain is a candidate key for %entries.
      my @candidates = _response_urls($response);
      s/\?.*// foreach @candidates;

      my $url = @candidates ? $candidates[0] : "(unknown URL)";
      print STDERR "Processing $url...\n";

      unless ($response->is_success()) {
          print STDERR "Download failed (", $response->status_line(), "), skipping...\n";
          next ARTICLE;
      }

      my ($item) = grep { defined($_) } map { $entries{$_} } @candidates;
      my $description = $item ? $item->look_down("_tag", "description") : undef;
      unless ($description) {
          print STDERR "No feed item with a description for $url, skipping...\n";
          next ARTICLE;
      }

      my $articlehtml = HTML::TreeBuilder->new_from_content(
          decode_utf8($response->content())
      );

      if (my $redirelem = $articlehtml->look_down("_tag", "meta", "http-equiv", qr/^refresh$/i)) {
          print STDERR "Interstitial ad detected, skipping...\n";
          my $target = $redirelem->attr_get_i("content");

          # The capture must not be lazy: a lazy capture at the very end of a
          # pattern always matches the empty string.
          my ($newurl) = defined($target)
              ? ($target =~ m/url=\s*['"]?([^'">\s]+)/i)
              : ();

          if (defined($newurl)) {
              $newurl = _nyt_url($newurl);
              print STDERR "Redirect URL is $newurl...\n";

              # Fetch the page behind the interstitial. The eval keeps a
              # failed request from ending the whole run if get() dies.
              my $retry = eval { $mech->get($newurl) };
              if ($retry && $retry->is_success()) {
                  $articlehtml->delete();
                  $articlehtml = HTML::TreeBuilder->new_from_content(
                      decode_utf8($retry->content())
                  );
              }
              else {
                  print STDERR "Could not follow redirect, using the page as downloaded...\n";
              }
          }
      }

      # Let's clean this up for Liferea, shall we?
      $description->delete_content();
      _strip_clutter_and_add_images($articlehtml, $description);

      my @pages;
      my %registered;
      my $pageua = LWP::Parallel::UserAgent->new();
      $pageua->cookie_jar($cookiejar);

      if (my $pageelem = $articlehtml->look_down("_tag", "div", "id", "pageLinks")) {
          print STDERR "Multiple pages detected...\n";

          foreach my $anchor ($pageelem->look_down("_tag", "a", "title", qr/^Page/)) {
              my $href = $anchor->attr_get_i("href");
              next unless defined($href);

              my $pageurl = _nyt_url($href);
              next if $registered{$pageurl}++;    # same page linked twice

              print STDERR "Registering $pageurl...\n";

              # register() hands back an error response when it refuses the
              # request and a false value otherwise.
              if (my $failure = $pageua->register(HTTP::Request->new(GET => $pageurl))) {
                  print STDERR "Could not register $pageurl: ", $failure->message(), "\n";
                  next;
              }

              push(@pages, $pageurl);
          }

          print STDERR "Downloading pages...\n";

          $pageelem->delete();
      }

      _add_article_body($articlehtml, $description, 0);

      # Everything needed was copied out as strings; release the tree.
      $articlehtml->delete();

      next ARTICLE unless @pages;

      my $htmlpages = $pageua->wait();
      next ARTICLE unless $htmlpages;

      print STDERR "Sorting pages...\n";

      # Index the downloaded pages by URL so they can be appended in the
      # order the links appeared in, not in hash order.
      my %pages;
      foreach my $pageentry (values(%$htmlpages)) {
          my $pageresponse = $pageentry->response();
          foreach my $pagekey (_response_urls($pageresponse)) {
              $pages{$pagekey} ||= $pageresponse;
          }
      }

      foreach my $pageurl (@pages) {
          print STDERR "Processing $pageurl...\n";

          my $pageresponse = $pages{$pageurl};
          unless ($pageresponse && $pageresponse->is_success()) {
              print STDERR "Page was not downloaded, skipping...\n";
              next;
          }

          my $pagehtml = HTML::TreeBuilder->new_from_content(
              decode_utf8($pageresponse->content())
          );

          # Let's clean this up for Liferea, shall we?
          _strip_clutter_and_add_images($pagehtml, $description);

          if (my $pagelinks = $pagehtml->look_down("_tag", "div", "id", "pageLinks")) {
              $pagelinks->delete();
          }

          _add_article_body($pagehtml, $description, 1);
          $pagehtml->delete();
      }
  }
  201.  
  # The article HTML added to the feed went through decode_utf8(), so the
  # tree can hold character strings; encode them explicitly on output
  # instead of relying on perl's default handling of wide characters.
  binmode(STDOUT, ":encoding(UTF-8)");
  print $feed->as_XML();

Report this snippet  

You need to login to post a comment.