Posted By

scrapy on 09/01/12


Tagged

spider scrapy soccer


Versions (?)

Who likes this?

2 people have marked this snippet as a favorite

icecreamboyy
usdtorero


soccerway.com soccer matches data extraction


 / Published in: Python
 

  1. # This spider extracts the info (teams, date-time, score) related to soccer matches from soccerway.com. You can define the cups you want to scrape; the example extracts info for Brazil, but you can add as many tuples as you want to the cups list variable. Each tuple should have 2 members: the name of the cup and its URL path on soccerway.com.
  2.  
  3. """ soccerway.com spider to scrape soccer matches info """
  4.  
  5. from datetime import datetime
  6.  
  7. from scrapy.http import Request
  8. from scrapy.selector import HtmlXPathSelector
  9. from scrapy.contrib.spiders import CrawlSpider
  10. from scrapy.item import Item, Field
  11. from scrapy.contrib.loader import XPathItemLoader
  12. from scrapy.contrib.loader.processor import TakeFirst, MapCompose
  13.  
  14.  
class Match(Item):
    """Item holding the scraped data for one soccer match."""
    cup = Field()     # cup label, taken from the spider's `cups` configuration
    team1 = Field()   # name of the team listed first in the fixture row (td[3])
    team2 = Field()   # name of the team listed second in the fixture row (td[5])
    goals1 = Field()  # goals scored by team1, stored as a string (see parse_matches)
    goals2 = Field()  # goals scored by team2, stored as a string
    date = Field()    # match date formatted as 'YYYY-MM-DD'
    time = Field()    # kick-off time formatted as 'HH:MM' (absent if postponed)
  24.  
  25.  
  26. class MatchLoader(XPathItemLoader):
  27. """ Loader to make the exctraction easier """
  28. default_item_class = Match
  29. default_output_processor = TakeFirst()
  30. team1_in = MapCompose(lambda x:x.strip())
  31. team2_in = MapCompose(lambda x:x.strip())
  32.  
  33.  
class SoccerwaySpider(CrawlSpider):
    """Spider scraping soccer match data from soccerway.com.

    Add (cup-name, url-path) tuples to ``cups`` to scrape more competitions.
    """
    name = 'soccerway'
    allowed_domains = ['soccerway.com']
    # Each entry: (label stored in Match.cup, URL path under soccerway.com).
    cups = [
        ('brazil-2010',
         'national/brazil/serie-a/2010/regular-season/matches/'),
    ]

    def start_requests(self):
        """Yield one request per configured cup, passing the cup label via meta."""
        for cup, url in self.cups:
            yield Request('http://www.soccerway.com/%s' % url,
                          self.parse_matches, meta={'cup': cup})

    def parse_matches(self, response):
        """Parse the matches in a fixture listing page.

        Yields one Match item per fixture row.
        """
        cup = response.request.meta['cup']

        # `dat` deliberately persists across loop iterations: a row carrying a
        # date applies that date to the following rows until a new one appears.
        xs, dt, dat = HtmlXPathSelector(response), None, None

        # Iterate fixture rows, skipping aggregate-score rows ("aggr" class).
        for tr in xs.select('//table[starts-with(@class,"matches")]'
                            '/tbody/tr[not(contains(@class,"aggr"))]'):
            mi = MatchLoader(selector=tr, response=response)
            mi.add_value('cup', cup)
            mi.add_xpath('team1', 'td[3]/a/text()')
            mi.add_xpath('team2', 'td[5]/a/text()')

            # Match status / score cell: collect non-blank text fragments,
            # then keep the score-like one (e.g. "1 - 0", "PSTP", or "-").
            # NOTE(review): raises IndexError if the cell is empty — presumably
            # never happens on real fixture pages; confirm.
            sct = [x.strip() for x in tr.select('td[4]//text()').extract() \
                   if x.strip()]
            sct = sct[1] if len(sct) > 1 else sct[0]

            # Extract timestamp info: td[1] presumably holds a weekday
            # abbreviation and td[2] a dd/mm/yy date — combined they match
            # the '%a %d/%m/%y' format. TODO confirm against the live markup.
            day = tr.select('td[1]/span/text()').extract()
            if day:
                dat = datetime.strptime('%s %s' % (day[0], tr.select(
                    'td[2]/span/text()').extract()[0]), '%a %d/%m/%y')
            if dat:
                mi.add_value('date', dat.strftime('%Y-%m-%d'))

            # If not postponed (PSTP) or unplayed ("-"), take the kick-off
            # time from the data-value attribute — presumably a Unix epoch
            # timestamp in seconds; TODO confirm.
            if sct not in ('PSTP', '-'):
                dt = datetime.fromtimestamp(float(
                    tr.select('td[1]/span/@data-value').extract()[0]))
            else:
                dt = None
            if dt:
                # Timestamp-derived date is more precise than the header date.
                mi.replace_value('date', dt.strftime('%Y-%m-%d'))
                mi.add_value('time', dt.strftime('%H:%M'))

            # If played, scrape the result (score text like "2 - 1").
            if '-' in sct:
                goals = [s.strip() for s in sct.split('-')] # 1) strip
                goals = [int(s) for s in goals if s] # 2) convert to int
                if len(goals) == 2:
                    # Stored as strings, matching the other string-valued fields.
                    mi.add_value('goals1', str(goals[0]))
                    mi.add_value('goals2', str(goals[1]))

            yield mi.load_item()
  93.  
# Module-level instance — presumably the entry point expected by the legacy
# scrapy spider-discovery mechanism of the era; TODO confirm (modern scrapy
# discovers spider classes by name instead).
SPIDER = SoccerwaySpider()

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: anibal
# date : Aug 10, 2010
  99.  

Report this snippet  

You need to login to post a comment.