Posted By: asimmittal on 04/28/13

Tagged: python scraping sports NBA




Scrape list of all NBA players


Published in: Python
 

This is a Python script that scrapes the names and profile links of historical players from NBA.com's historical player search, and saves them to a JSON file (players.json).
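As a sketch of the shape of the output, here is one hypothetical entry in players.json. The field names match the code below; the key, player name, and link values are made up for illustration (the real key is whatever follows the '=' in a player's page URL):

    # Hypothetical example of one entry in players.json (illustration only)
    example = {
        "12345": {
            "index": 0,              # running counter assigned during the scrape
            "name": "Some Player",   # name with non-ASCII characters stripped
            "link": "http://nba.com/some/player/page?id=12345",
            "active": False          # True if the player's row contained 'ACTIVE'
        }
    }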

import urllib2, time, traceback, json

# Remove non-ASCII characters from a string 's'
def removeNonAscii(s): return "".join(i for i in s if ord(i) < 128)


if __name__ == '__main__':

    baseUrl = "http://www.nba.com/historical/search/index.jsp?fl=(%l)&pager.offset=(%o)#results"
    letter = "(%l)"
    offset = "(%o)"

    startTime = time.asctime()

    # ASCII values for the letters A to Z
    ordAllLetters = range(ord('A'), ord('Z') + 1)
    resultsPerPage = 50
    countPlayers = 0
    countPagesScraped = 0
    dictPlayerPageUrls = {}

    # Keep scraping while there are letters left in the list
    while len(ordAllLetters) > 0:

        # Grab the first letter from the list and try scraping the pages for that
        # letter. Every letter has multiple pages, and each page contains up to
        # 50 athlete names.
        each = ordAllLetters[0]
        curLetter = chr(each)
        curUrl = baseUrl.replace(letter, curLetter).replace(offset, '')

        # For this letter, find out how many pages exist. We can do this by scraping
        # the "rnav" div at the bottom of the page: the number of <a> tags in that
        # div tells us how many pages this letter has, at 50 results per page.
        try:
            html = urllib2.urlopen(curUrl, timeout=5).read()
            resultsCountDivTagStart = '<div class="rnav">'
            resultsCountDivTagStop = "</div>"

            indexResultCountStart = html.find(resultsCountDivTagStart)
            indexResultCountStop = html.find(resultsCountDivTagStop, indexResultCountStart)
            resultsArea = html[indexResultCountStart:indexResultCountStop]
            linkSearchTag = "</a>"
            lstResultsLink = resultsArea.split(linkSearchTag)
            del lstResultsLink[0]

            # The number of items in lstResultsLink is the number of offset pages
            # this letter has, at 50 results per page. The offset values run
            # 0, 50, 100... and so on, so build a list of the exact offset values
            # to substitute into baseUrl to get each page for this letter.
            listPageOffsets = [i * resultsPerPage for i in range(0, len(lstResultsLink))]
            if listPageOffsets == []: listPageOffsets = [0]

            print '\nLetter:', curLetter, ' pages:', len(lstResultsLink), ' offset values:', listPageOffsets

            # listPageOffsets now contains [0, 50, 100, 150...]: every offset for
            # this letter. Construct the URL for each offset, download that page,
            # and grab the links to every player's individual page.
            for eachOffsetPage in listPageOffsets:

                # We don't want to overload the NBA.com web server with multiple
                # calls in a short span of time; if we do, it locks us out (calls
                # time out). So slow things down to one call every half second.
                time.sleep(0.5)

                # For this page offset, construct the URL and GET it
                strOffset = str(eachOffsetPage)
                curUrl = baseUrl.replace(letter, curLetter).replace(offset, strOffset)
                countPagesScraped += 1

                try:
                    # Fetch the page and grab the results table, which contains
                    # the list of players
                    htmlPage = urllib2.urlopen(curUrl).read()
                    indexStart = htmlPage.find('<div id="tableContainer">')
                    indexStop = htmlPage.find("<script>", indexStart)
                    resultsBody = htmlPage[indexStart:indexStop]
                    lstResults = resultsBody.split('<tr class="resultsTable" >')

                    del lstResults[0]

                    print '--------> offset: ', eachOffsetPage

                    # Each element in lstResults is now the HTML surrounding one
                    # player, so process each element to extract the player's data
                    for eachPlayerHtml in lstResults:

                        indexStart = eachPlayerHtml.find("<a href='") + len("<a href='")
                        indexEnd = eachPlayerHtml.find("</a>")
                        lstPlayerInfo = eachPlayerHtml[indexStart:indexEnd].split("'>")
                        link = "http://nba.com/" + lstPlayerInfo[0]
                        name = lstPlayerInfo[1]
                        key = link[link.find('=') + 1:]
                        active = 'ACTIVE' in eachPlayerHtml

                        # We now have a player's details (name, page URL, and whether
                        # they are still active), so save them in a dictionary for later
                        dictPlayerPageUrls[key] = {
                            'index': countPlayers,
                            'name': removeNonAscii(name),
                            'link': link,
                            'active': active
                        }

                        countPlayers += 1
                        print '------------------------->', countPlayers, ":", name

                except:
                    traceback.print_exc()
                    print 'Exception in player page. Letter: ' + curLetter + ' Offset: ' + strOffset

            # All the pages for this letter have been scraped, so remove it
            # from the list and move on to the next letter
            del ordAllLetters[0]

        except:
            # Something went wrong while scraping this letter's pages, so the
            # letter has not been removed from 'ordAllLetters'; the loop will
            # keep retrying it until it succeeds
            print '*********> Exception on letter: ', curLetter

    print 'Started at: ', startTime
    print 'Ended at: ', time.asctime()
    print 'Total number of Players: ', countPlayers
    print 'Total pages scraped:', countPagesScraped

    # At this point, all the player URLs in the nba.com historical stats have been
    # scraped and stored in the dictionary. Save it to a JSON file for later.
    try:
        strJSON = json.dumps(dictPlayerPageUrls)
        f = open('players.json', 'w')
        f.write(strJSON)
        f.close()
    except UnicodeDecodeError:
        print '\n\n\nError in JSON string with some non utf-8 characters'
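Once players.json has been written, it can be read back with the standard json module. A minimal sketch of loading the file produced above and listing the players marked as active:

    import json

    # Load the scraped player index from the file written by the scraper
    with open('players.json') as f:
        players = json.load(f)

    # Print the name and page link of every still-active player
    for key, info in players.items():
        if info['active']:
            print info['name'], '->', info['link']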
