Posted By

scrapy on 09/01/12


Tagged

parser spider scrapy crawlspider eurovision


Versions (?)

Eurovision.tv year archive parser


 / Published in: Python
 

  1. # Parses data from eurovision.tv website.
  2. #
  3. # Item is:
  4. #
  5. # from scrapy.item import Item, Field
  6. #
  7. # class EurovItem(Item):
  8. # # define the fields for your item here like:
  9. # # name = Field()
  10. # title = Field()
  11. # year = Field()
  12. # stage = Field()
  13. #
  14. # location = Field()
  15. # venue = Field()
  16. # details = Field()
  17. #
  18. # participants = Field()
  19. # scores = Field()
  20.  
  21. import re
  22.  
  23. from scrapy.selector import HtmlXPathSelector
  24. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  25. from scrapy.contrib.spiders import CrawlSpider, Rule
  26. from eurov.items import EurovItem
  27.  
  28. class ScoreboardsSpider(CrawlSpider):
  29. name = 'scoreboards'
  30. allowed_domains = ['eurovision.tv']
  31. start_urls = ['http://www.eurovision.tv/page/history/year']
  32.  
  33. rules = (
  34. Rule(SgmlLinkExtractor(allow=r'.*/page/history/by-year/contest\?event=\d+$'),
  35. callback='parse_page',
  36. follow=True),
  37. )
  38.  
  39. def parse_page(self, response):
  40. hxs = HtmlXPathSelector(response)
  41.  
  42. title = hxs.select('//div[@class="grid-column-block content-block is-not-tabbed cb-block cb-EventInfo cb-EventInfo-default"]/h2/text()')[0].extract()
  43.  
  44. evtitle_m = re.match(r'^Eurovision Song Contest (\d{4})(.*)$', title)
  45.  
  46. i = EurovItem()
  47. i['title'] = title
  48. i['year'] = int(evtitle_m.group(1))
  49. i['stage'] = evtitle_m.group(2)
  50.  
  51. participants = []
  52. p_rows = hxs.select("//table[@class='sortable participants no-arrow decorated']/tbody/tr")
  53. for p_row in p_rows:
  54. cols = p_row.select(".//td")
  55. country_number = int(cols[0].select("text()")[0].extract())
  56. country_name = cols[1].select(".//a/text()")[0].extract()
  57. country_broadcaster = cols[1].select(".//div/text()")[0].extract()
  58. country_performer = cols[2].select("text()").extract()[0].strip()
  59. country_song = cols[3].select("text()").extract()[0].strip()
  60. country_points = int(cols[4].select("text()").extract()[0])
  61. country_place = int(cols[5].select("text()").extract()[0])
  62.  
  63. participants.append(
  64. {
  65. 'number': country_number,
  66. 'country': country_name,
  67. 'broadcaster': country_broadcaster,
  68. 'performer': country_performer,
  69. 'song': country_song,
  70. 'points': country_points,
  71. 'place': country_place,
  72. })
  73.  
  74. i['participants'] = participants
  75.  
  76. score_strings = hxs.select("//*[contains(@class,'cb-EventInfo-scoreboard')]//tbody/tr/td/@title[contains(.,'goes to')]").extract()
  77. scores = []
  78. for score_string in score_strings:
  79. s = {}
  80. m = re.match("^(\d+)pt from (.+) goes to (.+)$", score_string)
  81. if m:
  82. s['country_from'] = m.group(2)
  83. s['country_to'] = m.group(3)
  84. s['score'] = int(m.group(1))
  85.  
  86. scores.append(s)
  87.  
  88. i['scores'] = scores
  89.  
  90.  
  91. details_rows = hxs.select('//table[@class="details"]/tr')
  92. details = dict(
  93. (row.select('th/text()').extract()[0].strip(),
  94. row.select('td/text()').extract()[0].strip())
  95. for row in details_rows)
  96.  
  97. i['location'] = details.pop('Location', None)
  98. i['venue'] = details.pop('Venue', None)
  99.  
  100. i['details'] = details
  101.  
  102. return i
  103.  
  104. # Snippet imported from snippets.scrapy.org (which no longer works)
  105. # author: xepka
  106. # date : Apr 22, 2011
  107.  

Report this snippet  

You need to login to post a comment.