Scrape twitterlocal and save to Django model


/ Published in: Python
Save to your folder(s)

I'm going to use twitterlocal to play around a bit with pygooglechart.


Copy this code and paste it in your HTML
  1. #! /usr/bin/env python
  2. # -*- coding: iso-8859-1 -*-
  3. # vi:ts=4:et
  4. import os,re,sys
  5. from BeautifulSoup import BeautifulSoup
  6. from urllib2 import urlopen, HTTPError, URLError
  7. import datetime
  8. import optparse
  9.  
  10.  
  11. CACHE_FILE = '/tmp/twitterlocal_cache'
  12. CACHE_TIME = 10 #minutes
  13.  
  14. def fetch_html(url, cache_minutes=60*2):
  15. """
  16. Return HTML string from a url, caching for two hours by default.
  17. """
  18. if not url.startswith('http://'):
  19. url = 'http://'+url
  20.  
  21. try:
  22. last_mod = datetime.datetime.fromtimestamp(
  23. os.stat(CACHE_FILE).st_mtime)
  24. except OSError:
  25. # probably couldn't find the file
  26. last_mod = datetime.datetime(1900,1,1)
  27. html = urlopen(url).read()
  28. f = file(CACHE_FILE,'w')
  29. f.write(html)
  30. f.close()
  31.  
  32. delta = datetime.timedelta(minutes=cache_minutes)
  33. if last_mod < (datetime.datetime.now() - delta):
  34. # grab url and save to cache
  35. html = urlopen(url).read()
  36. f = file(CACHE_FILE,'w')
  37. f.write(html)
  38. f.close()
  39. else :
  40. # read cache file
  41. f= file(CACHE_FILE)
  42. html = f.read()
  43. f.close()
  44.  
  45. return html
  46.  
  47.  
  48. if __name__ == '__main__':
  49. parser = optparse.OptionParser()
  50. parser.add_option('--settings')
  51. options, args = parser.parse_args()
  52. # call this script like
  53. # twitterlocal/bin/scrape.py --settings myproject.settings
  54. if options.settings:
  55. os.environ["DJANGO_SETTINGS_MODULE"] = options.settings
  56.  
  57. sys.path.append(os.getcwd())
  58. try:
  59. from myproject.twitterlocal.models import TweetsStat
  60. except ImportError:
  61. print 'define your settings path, like --settings \'myproject.settings\''
  62. exit()
  63.  
  64. html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
  65. soup = BeautifulSoup(html)
  66.  
  67. # Parse dates
  68. dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]
  69.  
  70. # List comprehension fun!
  71. m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
  72. end_time = datetime.datetime(y+2000,m,d,mi,s)
  73.  
  74. m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
  75. start_time = datetime.datetime(y+2000,m,d,mi,s)
  76.  
  77. for row in soup('tbody')[0].findAll('tr') :
  78. d = row.findAll('td')
  79. location = d[0].contents[0]
  80. tweets = d[1].contents[0]
  81. try:
  82. t = TweetsStat.objects.get(
  83. location=location,
  84. start_time=start_time,
  85. end_time=end_time,)
  86. except TweetsStat.DoesNotExist:
  87. t = TweetsStat(
  88. location=location,
  89. tweets=tweets,
  90. start_time=start_time,
  91. end_time=end_time,)
  92.  
  93. t.save()

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.