/ Published in: Python
A simple spider using the scrapy module to get the text, title, url, author, and date of some poems. Although this is written with poems in mind, with some minor customization, it can be applied to a wider variety of scraping projects where the data desired is not dynamically generated.
Expand |
Embed | Plain Text
# Standard Python library imports # 3rd party imports from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector # My imports from poetry_analysis.items import PoetryAnalysisItem HTML_FILE_NAME = r'.+\.html' class PoetryParser(object): """ Provides common parsing method for poems formatted this one specific way. """ date_pattern = r'(\d{2} \w{3,9} \d{4})' def parse_poem(self, response): hxs = HtmlXPathSelector(response) item = PoetryAnalysisItem() # All poetry text is in pre tags text = hxs.select('//pre/text()').extract() item['text'] = ''.join(text) item['url'] = response.url # head/title contains title - a poem by author title_text = hxs.select('//head/title/text()').extract()[0] item['title'], item['author'] = title_text.split(' - ') item['author'] = item['author'].replace('a poem by', '') for key in ['title', 'author']: item[key] = item[key].strip() item['date'] = hxs.select("//p[@class='small']/text()").re(date_pattern) return item class PoetrySpider(CrawlSpider, PoetryParser): name = 'example.com_poetry' allowed_domains = ['www.example.com'] root_path = 'someuser/poetry/' start_urls = ['http://www.example.com/someuser/poetry/recent/', 'http://www.example.com/someuser/poetry/less_recent/'] rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]), callback='parse_poem'), Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]), callback='parse_poem')]
You need to login to post a comment.
