/ Published in: Perl
URL: http://girlwithaonetrackmind.blogspot.com/
I didn't bother fixing it up, but it works well enough to scrape Girl With a One Track Mind.
Expand |
Embed | Plain Text
#!/usr/bin/perl
#
# Turn the summary-only Atom feed of a Blogger blog into a full-text feed.
#
# Steps:
#   1. Download and parse the Atom feed at $feedurl.
#   2. For every <entry>, queue a parallel GET for the post's own page.
#   3. For every page that comes back, cut out the <div class="blogPost">
#      element (minus the byline and any <script> elements) and store its
#      HTML as the content of the entry's <summary>, marked type="html".
#   4. Print the rewritten feed on STDOUT as UTF-8.
#
# Entries whose page cannot be fetched, matched, or parsed are left
# untouched (a warning goes to STDERR) instead of aborting the whole run.

use strict;
use warnings;

use LWP::Simple;
use LWP::Parallel::UserAgent;
use XML::TreeBuilder;
use HTML::TreeBuilder 3.0;
use Encode;

my $feedurl = "http://girlwithaonetrackmind.blogspot.com/feeds/posts/default";

# LWP::Simple::get returns undef on any failure; parsing undef would only
# produce a confusing error further down.
my $feed = get($feedurl);
die "Could not fetch feed $feedurl\n" unless defined $feed;

my $xml = XML::TreeBuilder->new();
$xml->parse($feed);

# Maps the URL of each requested post page to its <entry> element.
my %posts;
my $pua = LWP::Parallel::UserAgent->new();

foreach my $entry ($xml->look_down("_tag", "entry")) {
    # An Atom entry may carry several <link> elements; rel="alternate" is
    # the permalink. Fall back to the first <link> to keep working on
    # feeds that do not set rel.
    my $link = $entry->look_down("_tag", "link", "rel", "alternate")
        || $entry->look_down("_tag", "link");
    next unless $link;

    my $entryurl = $link->attr_get_i("href");
    next unless defined $entryurl && length $entryurl;

    my $httprequest = HTTP::Request->new(GET => $entryurl);

    # Key on the URI exactly as the request object renders it, so that the
    # lookup against the response's request URI below matches.
    $posts{ $httprequest->uri->as_string } = $entry;

    # register() returns an error response when the request is rejected
    # and a false value on success.
    if (my $failure = $pua->register($httprequest)) {
        warn "Could not register $entryurl: " . $failure->status_line . "\n";
    }
}

# wait() blocks until all registered requests are done and returns a hash
# reference of entry objects, each wrapping one HTTP::Response.
my $fetched = $pua->wait();

foreach my $result (values %{ $fetched || {} }) {
    my $response = $result->response();

    unless ($response->is_success) {
        warn "Skipping failed fetch: " . $response->status_line . "\n";
        next;
    }

    my $entry = _entry_for_response($response, \%posts);
    unless ($entry) {
        warn "No feed entry matches " . $response->base()->as_string() . "\n";
        next;
    }

    my $summary = $entry->look_down("_tag", "summary");
    next unless $summary;

    # HTML::TreeBuilder expects characters, not raw bytes; fall back to the
    # raw content only when decoding is not possible.
    my $html = $response->decoded_content();
    $html = $response->content() unless defined $html;

    my $blogpost = HTML::TreeBuilder->new_from_content($html);

    my $byline = $blogpost->look_down("_tag", "div", "class", "byline");
    $byline->delete() if $byline;

    foreach my $element ($blogpost->look_down("_tag", "script")) {
        $element->delete();
    }

    my $body = $blogpost->look_down("_tag", "div", "class", "blogPost");
    if ($body) {
        # Only drop the original summary once a replacement is in hand.
        $summary->delete_content();
        $summary->push_content($body->as_HTML());
        $summary->attr("type", "html");
    }
    else {
        warn "No blogPost div found in " . $response->base()->as_string() . "\n";
    }

    # Release the page's parse tree once its HTML has been copied out.
    $blogpost->delete();
}

# NOTE(review): the original never emitted the rewritten feed although it
# loads Encode; printing it here is the presumed intent - confirm.
print encode("utf-8", $xml->as_XML());

# Find the feed entry a response belongs to.
#
# Tries the response's base URL first, then walks back through any redirect
# chain (via ->previous) so a post that answered with a redirect is still
# matched to the URL that was originally requested.
#
# Params:  $response - HTTP::Response for a fetched post page
#          $posts    - hash ref mapping requested URL => <entry> element
# Returns: the matching entry element, or nothing when no URL matches.
sub _entry_for_response {
    my ($response, $posts) = @_;

    my $base = $response->base()->as_string();
    return $posts->{$base} if exists $posts->{$base};

    for (my $hop = $response; $hop; $hop = $hop->previous) {
        my $request = $hop->request or next;
        my $url = $request->uri->as_string;
        return $posts->{$url} if exists $posts->{$url};
    }

    return;
}
You need to login to post a comment.
