Revision: 6153
Updated Code
at May 2, 2008 12:21 by mandric
Updated Code
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
import os,re,sys
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen, HTTPError, URLError
import datetime
import optparse
CACHE_FILE = '/tmp/twitterlocal_cache'
CACHE_TIME = 10 #minutes
def fetch_html(url, cache_minutes=60*2):
    """
    Return the HTML at *url* as a string, caching the response in
    CACHE_FILE so the network is hit at most once per *cache_minutes*.

    url           -- address to fetch; 'http://' is prepended if missing.
    cache_minutes -- maximum cache age before re-fetching (default 2 hours).
    """
    if not url.startswith('http://'):
        url = 'http://' + url
    try:
        last_mod = datetime.datetime.fromtimestamp(
            os.stat(CACHE_FILE).st_mtime)
    except OSError:
        # Cache file probably doesn't exist yet: pretend it is ancient
        # so the staleness test below forces a real fetch.
        last_mod = datetime.datetime(1900, 1, 1)
    # BUG FIX: the original fetched the URL and rewrote the cache file
    # unconditionally here, before the staleness check, so the cache
    # never prevented a network hit.  That fetch is removed.
    delta = datetime.timedelta(minutes=cache_minutes)
    if last_mod < (datetime.datetime.now() - delta):
        # Cache is stale (or missing): fetch and refresh the cache.
        html = urlopen(url).read()
        f = open(CACHE_FILE, 'w')
        try:
            f.write(html)
        finally:
            f.close()
    else:
        # Cache is fresh: serve from disk.
        f = open(CACHE_FILE)
        try:
            html = f.read()
        finally:
            f.close()
    return html
if __name__ == '__main__':
parser = optparse.OptionParser()
parser.add_option('--settings')
options, args = parser.parse_args()
# call this script like
# twitterlocal/bin/scrape.py --settings myproject.settings
if options.settings:
os.environ["DJANGO_SETTINGS_MODULE"] = options.settings
sys.path.append(os.getcwd())
try:
from myproject.twitterlocal.models import TweetsStat
except ImportError:
print 'define your settings path, like --settings \'myproject.settings\''
exit()
html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
soup = BeautifulSoup(html)
# Parse dates
dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]
# List comprehension fun!
m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
end_time = datetime.datetime(y+2000,m,d,mi,s)
m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
start_time = datetime.datetime(y+2000,m,d,mi,s)
for row in soup('tbody')[0].findAll('tr') :
d = row.findAll('td')
location = d[0].contents[0]
tweets = d[1].contents[0]
try:
t = TweetsStat.objects.get(
location=location,
start_time=start_time,
end_time=end_time,)
except TweetsStat.DoesNotExist:
t = TweetsStat(
location=location,
tweets=tweets,
start_time=start_time,
end_time=end_time,)
t.save()
Revision: 6152
Updated Code
at May 1, 2008 17:22 by mandric
Updated Code
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
import os,re,sys
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen, HTTPError, URLError
import datetime
import optparse
CACHE_FILE = '/tmp/twitterlocal_cache'
CACHE_TIME = 10 #minutes
def fetch_html(url, cache_minutes=60*2):
    """
    Return the HTML at *url* as a string, caching the response in
    CACHE_FILE so the network is hit at most once per *cache_minutes*.

    url           -- address to fetch; 'http://' is prepended if missing.
    cache_minutes -- maximum cache age before re-fetching (default 2 hours).
    """
    if not url.startswith('http://'):
        url = 'http://' + url
    try:
        last_mod = datetime.datetime.fromtimestamp(
            os.stat(CACHE_FILE).st_mtime)
    except OSError:
        # Cache file probably doesn't exist yet: pretend it is ancient
        # so the staleness test below forces a real fetch.
        last_mod = datetime.datetime(1900, 1, 1)
    # BUG FIX: the original fetched the URL and rewrote the cache file
    # unconditionally here, before the staleness check, so the cache
    # never prevented a network hit.  That fetch is removed.
    delta = datetime.timedelta(minutes=cache_minutes)
    if last_mod < (datetime.datetime.now() - delta):
        # Cache is stale (or missing): fetch and refresh the cache.
        html = urlopen(url).read()
        f = open(CACHE_FILE, 'w')
        try:
            f.write(html)
        finally:
            f.close()
    else:
        # Cache is fresh: serve from disk.
        f = open(CACHE_FILE)
        try:
            html = f.read()
        finally:
            f.close()
    return html
if __name__ == '__main__':
parser = optparse.OptionParser()
parser.add_option('--settings')
options, args = parser.parse_args()
# call this script like
# twitterlocal/bin/scrape.py --settings myproject.settings
if options.settings:
os.environ["DJANGO_SETTINGS_MODULE"] = options.settings
sys.path.append(os.getcwd())
try:
from myproject.twitterlocal.models import TweetsStat
except ImportError:
print 'define your settings path, like --settings \'myproject.settings\''
exit()
html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
soup = BeautifulSoup(html)
# Parse dates
dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]
# List comprehension fun!
m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
end_time = datetime.datetime(y+2000,m,d,mi,s)
m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
start_time = datetime.datetime(y,m,d,mi,s)
for row in soup('tbody')[0].findAll('tr') :
d = row.findAll('td')
location = d[0].contents[0]
tweets = d[1].contents[0]
try:
t = TweetsStat.objects.get(
location=location,
start_time=start_time,
end_time=end_time,)
except TweetsStat.DoesNotExist:
t = TweetsStat(
location=location,
tweets=tweets,
start_time=start_time,
end_time=end_time,)
t.save()
Revision: 6151
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at May 1, 2008 17:20 by mandric
Initial Code
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
import os,re,sys
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen, HTTPError, URLError
import datetime
import optparse
CACHE_FILE = '/tmp/twitterlocal_cache'
CACHE_TIME = 30 #minutes
def fetch_html(url, cache_minutes=60*2):
    """
    Return the HTML at *url* as a string, caching the response in
    CACHE_FILE so the network is hit at most once per *cache_minutes*.

    url           -- address to fetch; 'http://' is prepended if missing.
    cache_minutes -- maximum cache age before re-fetching (default 2 hours).
    """
    if not url.startswith('http://'):
        url = 'http://' + url
    try:
        last_mod = datetime.datetime.fromtimestamp(
            os.stat(CACHE_FILE).st_mtime)
    except OSError:
        # Cache file probably doesn't exist yet: pretend it is ancient
        # so the staleness test below forces a real fetch.
        last_mod = datetime.datetime(1900, 1, 1)
    # BUG FIX: the original fetched the URL and rewrote the cache file
    # unconditionally here, before the staleness check, so the cache
    # never prevented a network hit.  That fetch is removed.
    delta = datetime.timedelta(minutes=cache_minutes)
    if last_mod < (datetime.datetime.now() - delta):
        # Cache is stale (or missing): fetch and refresh the cache.
        html = urlopen(url).read()
        f = open(CACHE_FILE, 'w')
        try:
            f.write(html)
        finally:
            f.close()
    else:
        # Cache is fresh: serve from disk.
        f = open(CACHE_FILE)
        try:
            html = f.read()
        finally:
            f.close()
    return html
if __name__ == '__main__':
parser = optparse.OptionParser()
parser.add_option('--settings')
options, args = parser.parse_args()
# call this script like
# twitterlocal/bin/scrape.py --settings myproject.settings
if options.settings:
os.environ["DJANGO_SETTINGS_MODULE"] = options.settings
sys.path.append(os.getcwd())
try:
from myproject.twitterlocal.models import TweetsStat
except ImportError:
print 'define your settings path, like --settings \'myproject.settings\''
exit()
html = fetch_html('http://twitterlocal.net/stats', CACHE_TIME)
soup = BeautifulSoup(html)
# Parse dates
dates = [d for d in soup('h3')[0].contents[2].split(' ') if d]
# List comprehension fun!
m,d,y,mi,s = [int(x) for x in dates[3].split('/') + dates[4].split(':')]
end_time = datetime.datetime(y+2000,m,d,mi,s)
m,d,y,mi,s = [int(x) for x in dates[0].split('/') + dates[1].split(':')]
start_time = datetime.datetime(y,m,d,mi,s)
for row in soup('tbody')[0].findAll('tr') :
d = row.findAll('td')
location = d[0].contents[0]
tweets = d[1].contents[0]
try:
t = TweetsStat.objects.get(
location=location,
start_time=start_time,
end_time=end_time,)
except TweetsStat.DoesNotExist:
t = TweetsStat(
location=location,
tweets=tweets,
start_time=start_time,
end_time=end_time,)
t.save()
Initial URL
Initial Description
Going to use twitterlocal data to experiment a bit with pygooglechart.
Initial Title
Scrape twitterlocal and save to Django model
Initial Tags
data, python, twitter
Initial Language
Python