/ Published in: Python
Going to use twitterlocal to play around a bit with pygooglechart.
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
"""Scrape the twitterlocal.net stats page into the project's TweetsStat model.

Written for Python 2 (urllib2, BeautifulSoup 3).  Call this script like

    twitterlocal/bin/scrape.py --settings myproject.settings
"""

import os
import re
import sys
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen, HTTPError, URLError
import datetime
import optparse

CACHE_FILE = '/tmp/twitterlocal_cache'
CACHE_TIME = 10  # minutes


def _cache_mtime():
    """Return the cache file's modification time, or None if it can't be read."""
    try:
        return datetime.datetime.fromtimestamp(os.stat(CACHE_FILE).st_mtime)
    except OSError:
        # probably couldn't find the file
        return None


def _write_cache(html):
    """Overwrite the cache file with *html*, closing it even on error."""
    f = open(CACHE_FILE, 'w')
    try:
        f.write(html)
    finally:
        f.close()


def _read_cache():
    """Return the full contents of the cache file."""
    f = open(CACHE_FILE)
    try:
        return f.read()
    finally:
        f.close()


def fetch_html(url, cache_minutes=60*2):
    """
    Return HTML string from a url, caching for two hours by default.

    url           -- address to fetch; 'http://' is prepended when no
                     http:// or https:// scheme is present.
    cache_minutes -- maximum age of the cache file before it is refreshed.

    The cache is the single file CACHE_FILE, shared by every url passed in,
    so a fresh cache written for one url is returned for any other.
    Errors raised by urlopen() or by the file operations propagate to the
    caller.
    """
    if not (url.startswith('http://') or url.startswith('https://')):
        url = 'http://' + url

    last_mod = _cache_mtime()
    oldest_allowed = (datetime.datetime.now()
                      - datetime.timedelta(minutes=cache_minutes))

    # A missing cache counts as stale, so the page is downloaded exactly once
    # (previously the missing-cache path fetched and wrote the page twice).
    if last_mod is None or last_mod < oldest_allowed:
        # grab url and save to cache
        html = urlopen(url).read()
        _write_cache(html)
        return html

    # cache is still fresh
    return _read_cache()


def _parse_timestamp(date_part, time_part):
    """Turn a '/'-separated date and a ':'-separated time into a datetime.

    Together the two strings must split into exactly five integers, used in
    the order month, day, two-digit year (2000 is added), hour, minute.
    """
    month, day, year, hour, minute = [
        int(x) for x in date_part.split('/') + time_part.split(':')]
    return datetime.datetime(year + 2000, month, day, hour, minute)


def main():
    """Parse options, fetch the stats page and store any new TweetsStat rows."""
    parser = optparse.OptionParser()
    parser.add_option('--settings')
    options, _args = parser.parse_args()

    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings
    # Make the project importable when the script is run from its root.
    sys.path.append(os.getcwd())
    try:
        from myproject.twitterlocal.models import TweetsStat
    except ImportError:
        print('define your settings path, like --settings \'myproject.settings\'')
        # sys.exit() instead of exit(): the latter is a helper injected by
        # the site module and is not guaranteed to exist in scripts.
        sys.exit()

    html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
    soup = BeautifulSoup(html)

    # Parse dates: the third child of the first <h3> is split on spaces.
    # Tokens 0/1 are used as the start date/time and tokens 3/4 as the end
    # date/time; token 2 is skipped (presumably a separator word -- verify
    # against the live page).
    dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]
    end_time = _parse_timestamp(dates[3], dates[4])
    start_time = _parse_timestamp(dates[0], dates[1])

    # One table row per location: first cell is the location, second the count.
    for row in soup('tbody')[0].findAll('tr'):
        cells = row.findAll('td')
        location = cells[0].contents[0]
        tweets = cells[1].contents[0]
        try:
            t = TweetsStat.objects.get(
                location=location,
                start_time=start_time,
                end_time=end_time,)
        except TweetsStat.DoesNotExist:
            # Only rows not yet stored for this period are created.
            t = TweetsStat(
                location=location,
                tweets=tweets,
                start_time=start_time,
                end_time=end_time,)
            t.save()


if __name__ == '__main__':
    main()