Revision: 59333
at September 1, 2012 07:15 by scrapy
Initial Code
# This is a spider that can crawl RSS feeds in a version-independent manner. It uses Mark Pilgrim's excellent feedparser utility to parse RSS feeds. You can read about the nightmares of RSS incompatibility [here](http://diveintomark.org/archives/2004/02/04/incompatible-rss) and download feedparser, which strives to resolve them, from [here](http://feedparser.org/docs/)
# The script processes only certain elements in each feed (title, link and summary).
# The items may be saved in an item pipeline, which I leave to you (a minimal sketch appears after the item definitions below).
#
# Please let me know about any discrepancies you may find in the technical and functional aspects of this script.
#
# -Sid
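# For reference, the feedparser calls this spider relies on look roughly like
# this (a sketch, not part of the original snippet; the URL is just an
# illustration):
#   d = feedparser.parse('http://example.com/feed.rss')
#   d.feed.title, d.feed.link        # feed-level metadata
#   d.entries[0].summary             # per-entry fields
#   d.entries[0].published_parsed    # already a time.struct_time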
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
from scrapy.item import Item, Field
import feedparser
import re
import urlparse
class RSSSpider(BaseSpider):
    name = "rssspider"
    allowed_domains = ["news.google.com"]
    start_urls = [
        "http://news.google.com/"
    ]
    _date_pattern = re.compile(
        r'(\d{,2})/(\d{,2})/(\d{4}) (\d{,2}):(\d{2}):(\d{2})')
    _http_pattern = re.compile(r'^http://')
    _gathered_fields = ('published_parsed', 'title', 'link', 'summary')
    def parse(self, response):
        # receive parsed URLs here...
        hxs = HtmlXPathSelector(response)
        base_url = response.url
        res = urlparse.urlparse(base_url)
        self.allowed_domains = [res.netloc]
        print '**********BASE URL********', base_url
        links = hxs.select('//a/@href').extract()
        self.num_links = len(links)
        self.num_links_proc = 0
        print 'Number of links to be processed: %s' % self.num_links
        for url in links:
            # TODO: inform mongo about progress
            if self._http_pattern.match(url):
                # an absolute URL
                if url.find(self.allowed_domains[0]) != -1:
                    try:
                        # The callback must be a separate function; otherwise every
                        # link on the fetched page would be crawled recursively too.
                        yield Request(url, callback=self.first_level_links)
                    except Exception:
                        pass
                else:
                    # an absolute URL on a different domain, so don't crawl it
                    pass
            else:
                # a relative URL: join it with the base URL and fetch the page
                yield Request(urlparse.urljoin(base_url, url),
                              callback=self.first_level_links)
    # This callback processes the first-level links
    def first_level_links(self, response):
        print '****First Level links:', response.url
        r = self.detect_feed(response)
        if r:
            yield r
    # detect an RSS feed and return an RssFeedItem object
    def detect_feed(self, response):
        """Detects a feed in the response and returns an Item for it."""
        xxs = XmlXPathSelector(response)
        # TODO: tweak feedparser to reuse the headers/body from this response
        # instead of downloading the feed page a second time
        if any(xxs.select("/%s" % feed_type) for feed_type in ['rss', 'feed', 'xml', 'rdf']):
            try:
                rss_feed = feedparser.parse(response.url)
                return self.extract_feed(rss_feed)
            except Exception:
                raise Exception('Exception while parsing/extracting the feed')
        return None
    def extract_feed(self, parsed_feed):
        """
        Takes a feed from feedparser and returns the constructed items.
        """
        if hasattr(parsed_feed.feed, 'link') and (hasattr(parsed_feed.feed, 'title')
                or hasattr(parsed_feed.feed, 'description')) and parsed_feed.entries:
            r = RssFeedItem()
            if 'title' in parsed_feed.feed:
                r['title'] = parsed_feed.feed.title
            if 'subtitle' in parsed_feed.feed:
                r['summary'] = parsed_feed.feed.subtitle
            if 'link' in parsed_feed.feed:
                r['link'] = parsed_feed.feed.link
            # entries gathered as lists of single-key dicts; each list is one entry
            entry_lists = [[
                {i: entry[i]} for i in entry if i in self._gathered_fields
            ] for entry in parsed_feed.entries
                if hasattr(entry, 'title') and hasattr(entry, 'link') and hasattr(entry, 'summary')]
            for entry_list in entry_lists:
                entry_item = RssEntryItem()
                if 'entries' not in r:
                    r['entries'] = list()
                for entry_dict in entry_list:
                    if 'published_parsed' in entry_dict:
                        # feedparser already returns published_parsed as a
                        # time.struct_time, so store it as-is rather than
                        # re-parsing a date string
                        entry_item['published'] = entry_dict['published_parsed']
                    else:
                        entry_item.update(entry_dict)
                r['entries'].append(entry_item)
            if 'entries' in r and r['entries']:
                return r
        # if there are no entries return None
        return None
    def date_handler(self, date_string):
        """Parse a UTC date string in MM/DD/YYYY HH:MM:SS format.

        (Helper for feeds whose dates arrive as raw strings; feedparser's
        published_parsed fields are already parsed, so it is unused above.)
        """
        month, day, year, hour, minute, second = \
            self._date_pattern.search(date_string).groups()
        return (int(year), int(month), int(day),
                int(hour), int(minute), int(second), 0, 0, 0)
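    # For example (hypothetical input):
    #   self.date_handler('02/20/2011 10:30:00')
    #   -> (2011, 2, 20, 10, 30, 0, 0, 0, 0)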
class MalformedURLException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
class RssFeedItem(Item):
    title = Field()    # the title of the feed
    link = Field()     # the URL of the web site (not the feed itself)
    summary = Field()  # short description of the feed
    entries = Field()  # will contain the RssEntryItems


class RssEntryItem(RssFeedItem):
    published = Field()
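# Since persistence is left to the reader, here is a minimal, hypothetical
# item pipeline sketch (the class name and the ITEM_PIPELINES wiring are my
# assumptions, not part of the original snippet):
class RssFeedPipeline(object):
    def process_item(self, item, spider):
        # RssEntryItem subclasses RssFeedItem, but only RssFeedItem objects
        # are yielded by the spider, each carrying its entries
        if isinstance(item, RssFeedItem):
            print 'Feed: %s (%d entries)' % (item.get('title'),
                                             len(item.get('entries', [])))
        return item
# Enable it by adding the pipeline's dotted path to ITEM_PIPELINES in settings.py.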
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: itissid
# date : Feb 20, 2011
Initial Title
Scrapy snippet to gather RSS feeds on a page (using feedparser)
Initial Language
Python