Return to Snippet

Revision: 59331
at September 1, 2012 07:15 by scrapy


Initial Code
# Parses data from eurovision.tv website.
# 
# Item is:
# 
#     from scrapy.item import Item, Field
#     
#     class EurovItem(Item):
#         # define the fields for your item here like:
#         # name = Field()
#         title = Field()
#         year = Field()
#         stage = Field()
#     
#         location = Field()
#         venue = Field()
#         details = Field()
#     
#         participants = Field()
#         scores = Field()

import re

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from eurov.items import EurovItem

class ScoreboardsSpider(CrawlSpider):
    name = 'scoreboards'
    allowed_domains = ['eurovision.tv']
    start_urls = ['http://www.eurovision.tv/page/history/year']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'.*/page/history/by-year/contest\?event=\d+$'),
            callback='parse_page',
            follow=True),
    )

    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)

        title = hxs.select('//div[@class="grid-column-block content-block is-not-tabbed cb-block cb-EventInfo cb-EventInfo-default"]/h2/text()')[0].extract()
        
        evtitle_m = re.match(r'^Eurovision Song Contest (\d{4})(.*)$', title)
        
        i = EurovItem()
        i['title'] = title
        i['year'] = int(evtitle_m.group(1))
        i['stage'] = evtitle_m.group(2)        
        
        participants = []
        p_rows = hxs.select("//table[@class='sortable participants no-arrow decorated']/tbody/tr")
        for p_row in p_rows:
            cols = p_row.select(".//td")
            country_number = int(cols[0].select("text()")[0].extract())
            country_name = cols[1].select(".//a/text()")[0].extract()
            country_broadcaster = cols[1].select(".//div/text()")[0].extract()
            country_performer = cols[2].select("text()").extract()[0].strip()
            country_song = cols[3].select("text()").extract()[0].strip()
            country_points = int(cols[4].select("text()").extract()[0])
            country_place = int(cols[5].select("text()").extract()[0])

            participants.append(
            {
                'number': country_number,
                'country': country_name,
                'broadcaster': country_broadcaster,
                'performer': country_performer,
                'song': country_song,
                'points': country_points,
                'place': country_place,
            })
        
        i['participants'] = participants

        score_strings = hxs.select("//*[contains(@class,'cb-EventInfo-scoreboard')]//tbody/tr/td/@title[contains(.,'goes to')]").extract()
        scores = []
        for score_string in score_strings:
            s = {}
            m = re.match("^(\d+)pt from (.+) goes to (.+)$", score_string)
            if m:
                s['country_from'] = m.group(2)
                s['country_to'] = m.group(3)
                s['score'] = int(m.group(1))
                
                scores.append(s)
                
        i['scores'] = scores
        
        
        details_rows = hxs.select('//table[@class="details"]/tr')
        details = dict(
            (row.select('th/text()').extract()[0].strip(),
            row.select('td/text()').extract()[0].strip())
            for row in details_rows)

        i['location'] = details.pop('Location', None)
        i['venue'] = details.pop('Venue', None)
        
        i['details'] = details

        return i

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: xepka
# date  : Apr 22, 2011

Initial URL


Initial Description


Initial Title
Eurovision.tv year archive parser

Initial Tags


Initial Language
Python