Revision: 59331
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 1, 2012 07:15 by scrapy
Initial Code
# Parses data from eurovision.tv website. # # Item is: # # from scrapy.item import Item, Field # # class EurovItem(Item): # # define the fields for your item here like: # # name = Field() # title = Field() # year = Field() # stage = Field() # # location = Field() # venue = Field() # details = Field() # # participants = Field() # scores = Field() import re from scrapy.selector import HtmlXPathSelector from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.contrib.spiders import CrawlSpider, Rule from eurov.items import EurovItem class ScoreboardsSpider(CrawlSpider): name = 'scoreboards' allowed_domains = ['eurovision.tv'] start_urls = ['http://www.eurovision.tv/page/history/year'] rules = ( Rule(SgmlLinkExtractor(allow=r'.*/page/history/by-year/contest\?event=\d+$'), callback='parse_page', follow=True), ) def parse_page(self, response): hxs = HtmlXPathSelector(response) title = hxs.select('//div[@class="grid-column-block content-block is-not-tabbed cb-block cb-EventInfo cb-EventInfo-default"]/h2/text()')[0].extract() evtitle_m = re.match(r'^Eurovision Song Contest (\d{4})(.*)$', title) i = EurovItem() i['title'] = title i['year'] = int(evtitle_m.group(1)) i['stage'] = evtitle_m.group(2) participants = [] p_rows = hxs.select("//table[@class='sortable participants no-arrow decorated']/tbody/tr") for p_row in p_rows: cols = p_row.select(".//td") country_number = int(cols[0].select("text()")[0].extract()) country_name = cols[1].select(".//a/text()")[0].extract() country_broadcaster = cols[1].select(".//div/text()")[0].extract() country_performer = cols[2].select("text()").extract()[0].strip() country_song = cols[3].select("text()").extract()[0].strip() country_points = int(cols[4].select("text()").extract()[0]) country_place = int(cols[5].select("text()").extract()[0]) participants.append( { 'number': country_number, 'country': country_name, 'broadcaster': country_broadcaster, 'performer': country_performer, 'song': country_song, 'points': country_points, 'place': country_place, }) i['participants'] = participants score_strings = hxs.select("//*[contains(@class,'cb-EventInfo-scoreboard')]//tbody/tr/td/@title[contains(.,'goes to')]").extract() scores = [] for score_string in score_strings: s = {} m = re.match("^(\d+)pt from (.+) goes to (.+)$", score_string) if m: s['country_from'] = m.group(2) s['country_to'] = m.group(3) s['score'] = int(m.group(1)) scores.append(s) i['scores'] = scores details_rows = hxs.select('//table[@class="details"]/tr') details = dict( (row.select('th/text()').extract()[0].strip(), row.select('td/text()').extract()[0].strip()) for row in details_rows) i['location'] = details.pop('Location', None) i['venue'] = details.pop('Venue', None) i['details'] = details return i # Snippet imported from snippets.scrapy.org (which no longer works) # author: xepka # date : Apr 22, 2011
Initial URL
Initial Description
Initial Title
Eurovision.tv year archive parser
Initial Tags
Initial Language
Python