Posted By

scrapy on 09/01/12


Tagged

scrapy


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

sgjc65


Scrapy snippet to gather RSS feeds on a page(using feedparser)


 / Published in: Python
 

  1. # This is a spider that can crawl RSS feeds in a version-independent manner. It uses Mark Pilgrim's excellent feedparser utility to parse RSS feeds. You can read about the nightmares of RSS incompatibility [here](http://diveintomark.org/archives/2004/02/04/incompatible-rss) and download feedparser, which strives to resolve them, from [here](http://feedparser.org/docs/).
  2. # The script processes only certain elements in the feeds (title, link and summary).
  3. # The items may be saved in the Item pipeline which I leave to you.
  4. #
  5. # Please let me know about any discrepancies you may find in the technical and functional aspects of this script.
  6. #
  7. # -Sid
  8.  
import re
import urlparse

import feedparser
from scrapy.http import Request
from scrapy.item import Field, Item
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.spider import BaseSpider
  17.  
  18.  
  19.  
  20. class RSSSpider(BaseSpider):
  21. name = "rssspider"
  22. allowed_domain = ["news.google.com"]
  23. start_urls = [
  24. "http://news.google.com/"
  25.  
  26. ]
  27. _date_pattern = re.compile( \
  28. r'(\d{,2})/(\d{,2})/(\d{4}) (\d{,2}):(\d{2}):(\d{2})');
  29. _http_pattern = re.compile(r'^http:\/\/');
  30. _gathered_fields = ('published_parsed' ,'title' , 'link' ,'summary');
  31.  
  32.  
  33. def parse(self, response):
  34. #recieve Parsed urls here...
  35. hxs = HtmlXPathSelector(response)
  36. base_url = response.url;
  37. res = urlparse.urlparse(base_url);
  38. self.allowed_domain = [res.netloc];
  39.  
  40.  
  41. print ('**********BASE URL********',base_url);
  42. links = hxs.select('//a/@href').extract();
  43. self.num_links = len(links);
  44. self.num_links_proc = 0;
  45. print 'Number of links TBP %s'%(self.num_links);
  46. for url in links:
  47. #TODO: Inform mongo about progress
  48. if(self._http_pattern.match(url)):
  49. # this is an absolute URL
  50. if url.find(self.allowed_domain[0])!=-1 :
  51. try:
  52. #callback should be in a separate function. Otherwise all links in this will be crawled too as this function is recursive.
  53. yield Request(url, callback=self.first_level_links);
  54. except:
  55. pass;
  56. else:
  57. # this was an absolute URL but the domain was not the same, so dont crawl
  58. pass
  59.  
  60. else:
  61. #relative URL we should try to append the domain and fetch the page
  62. yield Request(urlparse.urljoin(base_url, url), callback=self.first_level_links);
  63. # This page will process the first level links
  64. def first_level_links(self, response):
  65. print('****First Level links:',response.url);
  66. r = self.detect_feed(response);
  67. if r:
  68. yield r;
  69. pass
  70. # detect an RSS Feed and return a RssFeedItem Object
  71. def detect_feed(self, response):
  72. """Just detects the feed in the links and returns an Item"""
  73. xxs = XmlXPathSelector(response);
  74. '''Need to tweak the feedparser lib to just use the headers from response instead of
  75. d/l the feed page again, rather than d/l it again
  76. '''
  77.  
  78. if any(xxs.select("/%s" % feed_type) for feed_type in ['rss', 'feed', 'xml', 'rdf']):
  79. try:
  80. rssFeed = feedparser.parse(response.url);
  81. return self.extract_feed(rssFeed)
  82. except:
  83. raise Exception('Exception while parsing/extracting the feed')
  84.  
  85. return None
  86.  
  87. def extract_feed(self, parsed_feed):
  88. """
  89. Takes a feed from the feedparser and returns the constructed items
  90. """
  91.  
  92. if hasattr(parsed_feed.feed, 'link') and (hasattr(parsed_feed.feed,'title')
  93. or hasattr(parsed_feed.feed,'description')) and parsed_feed.entries:
  94. r = RssFeedItem();
  95. if 'title' in parsed_feed.feed:
  96. r['title'] = parsed_feed.feed.title;
  97. if 'subtitle' in parsed_feed.feed:
  98. r['summary'] = parsed_feed.feed.subtitle
  99. if 'link' in parsed_feed.feed:
  100. r['link'] = parsed_feed.feed.link
  101.  
  102. # entries gathered as list(s) of key value pairs. Each list is an entry item
  103. entry_lists= [[
  104. {i: entry[i]} for i in entry if i in self._gathered_fields
  105. ]for entry in parsed_feed.entries if hasattr(entry,'title') and hasattr(entry,'link') and hasattr(entry,'summary')
  106. ]
  107.  
  108. for entry_list in entry_lists:
  109. entry_item = RssEntryItem();
  110.  
  111. for entry_dict in entry_list:
  112. if r.has_key('entries') == False:
  113. r['entries'] = list();
  114.  
  115. if 'published_parsed' in entry_dict:
  116. entry_item.update({ 'published':date_handler(entry_dict('published_parsed'))});
  117. else:
  118. entry_item.update(entry_dict);
  119. r['entries'].append(entry_item);
  120. if r['entries']:
  121. return r;
  122. # if there are no entries return null
  123. return None;
  124. def dateHandler(self, dateString):
  125. """parse a UTC date in MM/DD/YYYY HH:MM:SS format"""
  126. month, day, year, hour, minute, second = \
  127. self._date_pattern.search(dateString).groups()
  128. return (int(year), int(month), int(day), \
  129. int(hour), int(minute), int(second), 0, 0, 0);
  130.  
  131.  
  132.  
  133. class MalformedURLException(Exception):
  134. def __init__(self, value):
  135. self.value = value
  136. def __str__(self):
  137. return repr(self.value)
  138.  
  139. class RssFeedItem(Item):
  140. title = Field()# the Title of the feed
  141. link = Field()# the URL to the web site(not the feed)
  142. summary = Field();# short description of feed
  143. entries = Field();# will contain the RSSEntrItems
  144.  
  145. class RssEntryItem(RssFeedItem):
  146. published = Field()
  147.  
  148. # Snippet imported from snippets.scrapy.org (which no longer works)
  149. # author: itissid
  150. # date : Feb 20, 2011
  151.  

Report this snippet  

You need to login to post a comment.