We Recommend

Learning Python
The authors of Learning Python show you enough essentials of the Python scripting language to enable you to begin solving problems right away, then reveal more powerful aspects of the language one at a time. This approach is sure to appeal to programmers and system administrators who have urgent problems and a preference for learning by semi-guided experimentation.


Posted By

mandric on 05/01/08


Tagged

location data python twitter


Versions (?)


Scrape twitterlocal and save to Django model


Published in: Python 


This script scrapes the twitterlocal stats page and stores the results in a Django model; I'm going to use the data to play around a bit with pygooglechart.

  1. #! /usr/bin/env python
  2. # -*- coding: iso-8859-1 -*-
  3. # vi:ts=4:et
  4. import os,re,sys
  5. from BeautifulSoup import BeautifulSoup
  6. from urllib2 import urlopen, HTTPError, URLError
  7. import datetime
  8. import optparse
  9.  
  10.  
# Path of the single file fetch_html() uses to cache the downloaded page.
CACHE_FILE = '/tmp/twitterlocal_cache'
# Cache lifetime, in minutes, that the script entry point passes to
# fetch_html().
CACHE_TIME = 10  # minutes
  13.  
  14. def fetch_html(url, cache_minutes=60*2):
  15. """
  16. Return HTML string from a url, caching for two hours by default.
  17. """
  18. if not url.startswith('http://'):
  19. url = 'http://'+url
  20.  
  21. try:
  22. last_mod = datetime.datetime.fromtimestamp(
  23. os.stat(CACHE_FILE).st_mtime)
  24. except OSError:
  25. # probably couldn't find the file
  26. last_mod = datetime.datetime(1900,1,1)
  27. html = urlopen(url).read()
  28. f = file(CACHE_FILE,'w')
  29. f.write(html)
  30. f.close()
  31.  
  32. delta = datetime.timedelta(minutes=cache_minutes)
  33. if last_mod < (datetime.datetime.now() - delta):
  34. # grab url and save to cache
  35. html = urlopen(url).read()
  36. f = file(CACHE_FILE,'w')
  37. f.write(html)
  38. f.close()
  39. else :
  40. # read cache file
  41. f= file(CACHE_FILE)
  42. html = f.read()
  43. f.close()
  44.  
  45. return html
  46.  
  47.  
  48. if __name__ == '__main__':
  49. parser = optparse.OptionParser()
  50. parser.add_option('--settings')
  51. options, args = parser.parse_args()
  52. # call this script like
  53. # twitterlocal/bin/scrape.py --settings myproject.settings
  54. if options.settings:
  55. os.environ["DJANGO_SETTINGS_MODULE"] = options.settings
  56.  
  57. sys.path.append(os.getcwd())
  58. try:
  59. from myproject.twitterlocal.models import TweetsStat
  60. except ImportError:
  61. print 'define your settings path, like --settings \'myproject.settings\''
  62. exit()
  63.  
  64. html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
  65. soup = BeautifulSoup(html)
  66.  
  67. # Parse dates
  68. dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]
  69.  
  70. # List comprehension fun!
  71. m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
  72. end_time = datetime.datetime(y+2000,m,d,mi,s)
  73.  
  74. m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
  75. start_time = datetime.datetime(y+2000,m,d,mi,s)
  76.  
  77. for row in soup('tbody')[0].findAll('tr') :
  78. d = row.findAll('td')
  79. location = d[0].contents[0]
  80. tweets = d[1].contents[0]
  81. try:
  82. t = TweetsStat.objects.get(
  83. location=location,
  84. start_time=start_time,
  85. end_time=end_time,)
  86. except TweetsStat.DoesNotExist:
  87. t = TweetsStat(
  88. location=location,
  89. tweets=tweets,
  90. start_time=start_time,
  91. end_time=end_time,)
  92.  
  93. t.save()

Report this snippet 

You need to login to post a comment.