Posted By: asimmittal on 04/28/13

Tagged: python scraping sports NBA




Scrape list of all NBA players


Published in: Python
 

This is a Python script that scrapes the names and profile links of historical players from NBA.com's historical player search, and saves them to a JSON file (players.json).
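As a sketch of the shape of the output, here is one hypothetical entry in players.json. The field names match the code below; the key, player name, and link values are made up for illustration (the real key is whatever follows the '=' in a player's page URL):

    # Hypothetical example of one entry in players.json (illustration only)
    example = {
        "12345": {
            "index": 0,              # running counter assigned during the scrape
            "name": "Some Player",   # name with non-ASCII characters stripped
            "link": "http://nba.com/some/player/page?id=12345",
            "active": False          # True if the player's row contained 'ACTIVE'
        }
    }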

import urllib2, time, traceback, json

# Remove non-ASCII characters from a string 's'
def removeNonAscii(s): return "".join(i for i in s if ord(i) < 128)


if __name__ == '__main__':

    baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
    letter = "(%l)"
    offset = "(%o)"

    startTime = time.asctime()

    # ASCII values for the letters A to Z
    ordAllLetters = range(ord('A'), ord('Z') + 1)
    resultsPerPage = 50
    countPlayers = 0
    countPagesScraped = 0
    dictPlayerPageUrls = {}

    # Keep scraping while there are letters left in the list
    while len(ordAllLetters) > 0:

        # Grab the first letter from the list and try scraping the pages for that
        # letter. Every letter has multiple pages, and each page contains up to
        # 50 athlete names.
        each = ordAllLetters[0]
        curLetter = chr(each)
        curUrl = baseUrl.replace(letter, curLetter).replace(offset, '')

        # For this letter, find out how many pages exist. We can do this by scraping
        # the "rnav" div at the bottom of the page: the number of <a> tags in that
        # div tells us how many pages this letter has, at 50 results per page.
        try:
            html = urllib2.urlopen(curUrl, timeout=5).read()
            resultsCountDivTagStart = '<div class="rnav">'
            resultsCountDivTagStop = "</div>"

            indexResultCountStart = html.find(resultsCountDivTagStart)
            indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
            resultsArea = html[indexResultCountStart:indexResultCountStop]
            linkSearchTag = "</a>"
            lstResultsLink = resultsArea.split(linkSearchTag)
            del lstResultsLink[0]

            # The number of items in lstResultsLink is the number of offset pages
            # this letter has, at 50 results per page. The offset values run
            # 0, 50, 100... and so on, so build a list of the exact offset values
            # to substitute into baseUrl to get each page for this letter.
            listPageOffsets = [i * resultsPerPage for i in range(0, len(lstResultsLink))]
            if listPageOffsets == []: listPageOffsets = [0]

            print '\nLetter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

            # listPageOffsets now contains [0, 50, 100, 150...]: every offset for
            # this letter. Construct the URL for each offset, download that page,
            # and grab the links to every player's individual page.
            for eachOffsetPage in listPageOffsets:

                # We don't want to overload the NBA.com web server with multiple
                # calls in a short span of time; if we do, it locks us out (calls
                # time out). So slow things down to one call every half second.
                time.sleep(0.5)

                # For this page offset, construct the URL and GET it
                strOffset = str(eachOffsetPage)
                curUrl = baseUrl.replace(letter, curLetter).replace(offset, strOffset)
                countPagesScraped += 1

                try:
                    # Fetch the page and grab the results table, which contains
                    # the list of players
                    htmlPage = urllib2.urlopen(curUrl).read()
                    indexStart = htmlPage.find('<div id="tableContainer">')
                    indexStop = htmlPage.find("<script>", indexStart)
                    resultsBody = htmlPage[indexStart:indexStop]
                    lstResults = resultsBody.split('<tr class="resultsTable" >')

                    del lstResults[0]

                    print '--------> offset: ', eachOffsetPage

                    # Each element in lstResults is now the HTML surrounding one
                    # player, so process each element to extract the player's data
                    for eachPlayerHtml in lstResults:

                        indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
                        indexEnd = eachPlayerHtml.find("</a>")
                        lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
                        link = "http://nba.com/" + lstPlayerInfo[0]
                        name = lstPlayerInfo[1]
                        key = link[link.find('=') + 1:]
                        active = 'ACTIVE' in eachPlayerHtml

                        # We now have a player's details (name, page URL, and whether
                        # they are still active), so save them in a dictionary for later
                        dictPlayerPageUrls[key] = {
                            'index': countPlayers,
                            'name': removeNonAscii(name),
                            'link': link,
                            'active': active
                        }

                        countPlayers += 1
                        print '------------------------->', countPlayers, ":", name

                except:
                    traceback.print_exc()
                    print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

            # All the pages for this letter have been scraped, so remove it
            # from the list and move on to the next letter
            del ordAllLetters[0]

        except:
            # Something went wrong while scraping this letter's pages, so the
            # letter has not been removed from 'ordAllLetters'; the loop will
            # keep retrying it until it succeeds
            print '*********> Exception on letter: ', curLetter

    print 'Started at: ', startTime
    print 'Ended at: ', time.asctime()
    print 'Total number of Players: ', countPlayers
    print 'Total pages scraped:', countPagesScraped

    # At this point, all the player URLs in the nba.com historical stats have been
    # scraped and stored in the dictionary. Save it to a JSON file for later.
    try:
        strJSON = json.dumps(dictPlayerPageUrls)
        f = open('players.json', 'w')
        f.write(strJSON)
        f.close()
    except UnicodeDecodeError:
        print '\n\n\nError in JSON string with some non utf-8 characters'
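Once players.json has been written, it can be read back with the standard json module. A minimal sketch of loading the file produced above and listing the players marked as active:

    import json

    # Load the scraped player index from the file written by the scraper
    with open('players.json') as f:
        players = json.load(f)

    # Print the name and page link of every still-active player
    for key, info in players.items():
        if info['active']:
            print info['name'], '->', info['link']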
