/ Published in: Perl
URL: http://topics.nytimes.com/top/reference/timestopics/index.html
Crawls a New York Times RSS feed and glues each item's entire article (all pages) into the feed for your enjoyment. Requires an nytimes.com registration.
Pick a feed URL from http://topics.nytimes.com/top/reference/timestopics/index.html and have at it.
Usage example: perl nyt.pl --user=aksdhf --pass=aksdfhasouidf --url=http://asiudhfasdfj
Expand |
Embed | Plain Text
#!/usr/bin/perl
#
# nyt.pl - rebuild a New York Times RSS feed so that every item's
# <description> carries the full article (all pages) instead of a teaser.
#
# Usage:
#   perl nyt.pl --user=USERNAME --pass=PASSWORD --url=FEED_URL
#
# The script logs in to nytimes.com, downloads the feed, fetches every linked
# article (and its continuation pages) in parallel, strips page furniture that
# reads badly in a feed reader, and prints the rewritten feed XML on STDOUT.
# Diagnostics go to STDERR so STDOUT stays a clean feed.

use strict;
use warnings;

use LWP::Simple;
use HTML::TreeBuilder;
use LWP::Parallel::UserAgent;    # also loads HTTP::Request via LWP::UserAgent
use WWW::Mechanize;
use XML::TreeBuilder;            # also loads XML::Element
use Getopt::Long;
use HTTP::Cookies;
use Encode;

my $NYT_BASE  = 'http://www.nytimes.com';
my $LOGIN_URL = "$NYT_BASE/auth/login";
my $USAGE     = "Usage: perl $0 --user=USERNAME --pass=PASSWORD --url=FEED_URL\n";

my $username;
my $password;
my $feedurl;
GetOptions(
    "user=s" => \$username,
    "pass=s" => \$password,
    "url=s"  => \$feedurl,
) or die $USAGE;
die $USAGE
    unless defined $username && defined $password && defined $feedurl;

# --- Log in; the resulting session cookies are shared with every fetcher. ---
my $cookiejar = HTTP::Cookies->new();
my $mech      = WWW::Mechanize->new();
$mech->agent_alias('Linux Mozilla');
$mech->cookie_jar($cookiejar);
$mech->get($LOGIN_URL);
my $loginresponse = $mech->submit_form(
    form_name => 'login',
    fields    => {
        USERID   => $username,
        PASSWORD => $password,
    },
);
unless ($loginresponse && $loginresponse->is_success()) {
    my $why = $loginresponse ? ": " . $loginresponse->status_line() : "";
    die "Login to nytimes.com failed$why\n";
}

# --- Fetch and parse the feed. ---
my $xml = get($feedurl);
die "Could not fetch feed $feedurl\n" unless defined $xml;
my $feed = XML::TreeBuilder->new();
$feed->parse($xml);

# --- Queue one parallel request per feed item. ---
my $pua = LWP::Parallel::UserAgent->new();
$pua->cookie_jar($cookiejar);
$pua->redirect(1);

my %entries;    # article URL (query string removed) => feed <item> element
foreach my $item ($feed->look_down("_tag", "item")) {
    my $linkelem = $item->look_down("_tag", "link") or next;
    my $link = $linkelem->as_text();
    $link =~ s/^\s+|\s+$//g;
    $link = strip_query($link);
    next unless length $link;

    # register() returns an error response only when queuing failed.
    if (my $failure = $pua->register(HTTP::Request->new(GET => $link))) {
        warn "Could not queue $link: " . $failure->status_line() . "\n";
        next;
    }
    $entries{$link} = $item;
}

# --- Process every article that came back. ---
my $html = $pua->wait();
foreach my $entry (values %{ $html || {} }) {
    my $response = $entry->response();
    unless ($response->is_success()) {
        warn "Article fetch failed: " . $response->status_line() . "\n";
        next;
    }

    my ($url, $item) = item_for_response($response, \%entries);
    unless ($item) {
        warn "No feed item matches " . $response->base()->as_string() . "\n";
        next;
    }

    my $articlehtml
        = HTML::TreeBuilder->new_from_content(response_text($response));

    # Some articles answer with a <meta http-equiv="refresh"> stub; follow it
    # once (with the logged-in browser) so we parse the real article.
    if (my $target = refresh_target($articlehtml)) {
        $articlehtml->delete();
        my $followed = eval { $mech->get($target) };
        unless ($followed && $followed->is_success()) {
            warn "Could not follow refresh from $url to $target\n";
            next;
        }
        $articlehtml
            = HTML::TreeBuilder->new_from_content(response_text($followed));
    }

    # Collect continuation-page URLs before the page-link box is scrubbed.
    my @pages = page_urls($articlehtml);

    # Let's clean this up for Liferea, shall we?
    my @fragments = article_fragments($articlehtml, 0);
    $articlehtml->delete();

    if (@pages) {
        my $pageua = LWP::Parallel::UserAgent->new();
        $pageua->cookie_jar($cookiejar);
        foreach my $pageurl (@pages) {
            if (my $failure
                = $pageua->register(HTTP::Request->new(GET => $pageurl)))
            {
                warn "Could not queue $pageurl: "
                    . $failure->status_line() . "\n";
            }
        }

        # Index the responses by URL so they can be appended in page order,
        # not in the arbitrary order the parallel fetches completed.
        my %pages;
        my $htmlpages = $pageua->wait();
        foreach my $pageentry (values %{ $htmlpages || {} }) {
            my $pageresponse = $pageentry->response();
            next unless $pageresponse->is_success();
            $pages{ $pageresponse->base()->as_string() } = $pageresponse;
            if (my $pagerequest = $pageresponse->request()) {
                $pages{ $pagerequest->uri()->as_string() } = $pageresponse;
            }
        }

        foreach my $pageurl (@pages) {
            my $pageresponse = $pages{$pageurl};
            unless ($pageresponse) {
                warn "Missing continuation page $pageurl\n";
                next;
            }
            my $pagehtml = HTML::TreeBuilder->new_from_content(
                response_text($pageresponse));
            push @fragments, article_fragments($pagehtml, 1);
            $pagehtml->delete();
        }
    }

    # Only overwrite the feed's own teaser when we actually got article text.
    unless (@fragments) {
        warn "No article text found for $url; keeping original description\n";
        next;
    }
    my $description = $item->look_down("_tag", "description");
    unless ($description) {
        $description = XML::Element->new('description');
        $item->push_content($description);
    }
    $description->delete_content();
    $description->push_content(@fragments);
}

# --- Emit the rewritten feed. ---
binmode STDOUT, ':encoding(UTF-8)';
print qq{<?xml version="1.0" encoding="UTF-8"?>\n};
print $feed->as_XML();
$feed->delete();

# Return $url with any query string removed.
sub strip_query {
    my ($url) = @_;
    $url =~ s/\?.*//s;
    return $url;
}

# Turn an href found on an nytimes.com page into an absolute URL.
sub absolute_nyt_url {
    my ($href) = @_;
    return $href        if $href =~ m{^https?://}i;
    return "http:$href" if $href =~ m{^//};
    $href = "/$href" unless $href =~ m{^/};
    return $NYT_BASE . $href;
}

# Return the response body as characters, honouring the declared charset and
# falling back to UTF-8 when it cannot be decoded.
sub response_text {
    my ($response) = @_;
    my $text = $response->decoded_content();
    return defined $text ? $text : decode_utf8($response->content());
}

# Find the feed item a response belongs to. Redirects may change the final
# URL, so every hop's base and request URL is tried (query string removed).
# Returns (key, item), or an empty list when nothing matches.
sub item_for_response {
    my ($response, $item_for) = @_;
    my $hop = $response;
    while ($hop) {
        my @candidates = ($hop->base()->as_string());
        if (my $request = $hop->request()) {
            push @candidates, $request->uri()->as_string();
        }
        foreach my $candidate (@candidates) {
            my $key = strip_query($candidate);
            return ($key, $item_for->{$key}) if exists $item_for->{$key};
        }
        $hop = $hop->previous();
    }
    return;
}

# If the page is a meta-refresh stub, return the absolute URL it points to;
# otherwise return nothing.
sub refresh_target {
    my ($tree) = @_;
    my $redirelem
        = $tree->look_down("_tag", "meta", "http-equiv", qr/^refresh$/i)
        or return;
    my $content = $redirelem->attr("content");
    return unless defined $content;
    my ($newurl) = ($content =~ m/url\s*=\s*['"]?([^'"\s]+)/i);
    return unless defined $newurl && length $newurl;
    return absolute_nyt_url($newurl);
}

# Return the continuation-page URLs listed in the article's page-link box,
# in document order and without duplicates.
sub page_urls {
    my ($tree) = @_;
    my $pageelem = $tree->look_down("_tag", "div", "id", "pageLinks")
        or return;
    my @urls;
    my %seen;
    foreach my $anchor ($pageelem->look_down("_tag", "a", "title", qr/^Page/)) {
        my $href = $anchor->attr("href");
        next unless defined $href && length $href;
        my $pageurl = absolute_nyt_url($href);
        push @urls, $pageurl unless $seen{$pageurl}++;
    }
    return @urls;
}

# Delete every <div> whose attribute $attr equals $value.
sub delete_all_divs {
    my ($tree, $attr, $value) = @_;
    # Re-query after each deletion so nested matches that vanished together
    # with their parent are never touched again.
    while (my $element = $tree->look_down("_tag", "div", $attr, $value)) {
        $element->delete();
    }
    return;
}

# Scrub one article page and return its HTML fragments: image boxes first,
# then the article body. Statement order matters: images are captured before
# the inline box is removed. For continuation pages ($is_continuation true)
# the "(Page N of M)" marker is removed from the body text.
sub article_fragments {
    my ($tree, $is_continuation) = @_;

    delete_all_divs($tree, "class", "enlargeThis");
    delete_all_divs($tree, "class", "nextArticleLink");

    my @fragments
        = map { $_->as_HTML() } $tree->look_down("_tag", "div", "class", "image");

    delete_all_divs($tree, "id", "articleInline");
    delete_all_divs($tree, "id", "pageLinks");

    foreach my $body ($tree->look_down("_tag", "div", "id", "articlebody")) {
        my $content = $body->as_HTML();
        $content =~ s/\(Page \d+ of \d+\)//g if $is_continuation;
        push @fragments, $content;
    }
    return @fragments;
}
You need to login to post a comment.
