/ Published in: Python
e.g.
<pre>
$ twlog.py user
twitter account name: user
Now logging start, take several times. r u OK? [y/N]: y
DEBUG: make dir: twitter-log-user
DEBUG: get log: page=1
DEBUG: new tweet: 21, accum. tweet: 21
DEBUG: remain hits: 149
DEBUG: write file: 20090828-105421-page0001.xml
DEBUG: wait 3sec.
...
DEBUG: get log: page=7
DEBUG: new tweet: 0, accum. tweet: 115
DEBUG: remain hits: 143
OK!
</pre>
* cf. [Stylesheet for twitter-log file](http://snipplr.com/view/18927/stylesheet-for-twitterlog-file/)
* cf. [Merge twitter-logged files into one xml-file](http://snipplr.com/view/18928/merge-twitterlogged-files-into-one-xmlfile/)
<pre>
$ twlog.py user
twitter account name: user
Now logging start, take several times. r u OK? [y/N]: y
DEBUG: make dir: twitter-log-user
DEBUG: get log: page=1
DEBUG: new tweet: 21, accum. tweet: 21
DEBUG: remain hits: 149
DEBUG: write file: 20090828-105421-page0001.xml
DEBUG: wait 3sec.
...
DEBUG: get log: page=7
DEBUG: new tweet: 0, accum. tweet: 115
DEBUG: remain hits: 143
OK!
</pre>
* cf. [Stylesheet for twitter-log file](http://snipplr.com/view/18927/stylesheet-for-twitterlog-file/)
* cf. [Merge twitter-logged files into one xml-file](http://snipplr.com/view/18928/merge-twitterlogged-files-into-one-xmlfile/)
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
# vim: ts=2 sw=2 expandtab
"""Archive a Twitter user's timeline, one XML file per page.

Usage: twlog.py <twitter account name> [page offset]

Pages are fetched one at a time starting at the page offset (default 1).
Each page that contains ids not seen earlier in the run is written to
``twitter-log-<user>/<YYYYmmdd-HHMMSS>-page<NNNN>.xml`` after every
``created_at`` element has been given an ``epoch`` attribute.  The run stops
when a page yields no new ids or when the rate-limit endpoint reports that no
hits remain.
"""
import sys
from os import mkdir
from os.path import exists
import time  # rebound by the from-import below; kept from the original import list
from contextlib import closing
try:
  from urllib2 import urlopen  # Python 2
except ImportError:
  from urllib.request import urlopen  # Python 3
from xml.dom import Node
from xml.dom.minidom import parseString
from time import strptime, strftime, localtime, time, sleep
from calendar import timegm

try:
  _readLine = raw_input  # Python 2
except NameError:
  _readLine = input  # Python 3

# Switch for debug() output.  It needs its own name: the original flag was
# called ``debug`` and was overwritten by the function of the same name, which
# made the ``if debug:`` check always true.
DEBUG = True

RATE_LIMIT_URL = 'http://twitter.com/account/rate_limit_status.xml'
TIMELINE_URL = 'http://twitter.com/statuses/user_timeline/'
SLEEP_TIME = 3  # seconds to wait between two page requests


def _fetch(url):
  """Return the raw response body of *url*, closing the connection afterwards."""
  with closing(urlopen(url)) as response:
    return response.read()


def _rateLimitField(tagName):
  """Return the text of the first *tagName* element of the rate-limit status."""
  dom = parseString(_fetch(RATE_LIMIT_URL))
  return dom.getElementsByTagName(tagName).item(0).firstChild.data


def getRemain():
  """Return the ``remaining-hits`` value of the rate-limit status as an int."""
  return int(_rateLimitField('remaining-hits'))


def getResetTime():
  """Return the ``reset-time`` value of the rate-limit status as text."""
  return _rateLimitField('reset-time')


def getIds(dom):
  """Return the integer content of every ``id`` element in *dom*, in order.

  NOTE(review): the lookup is document-wide, so ``id`` elements nested inside
  other records are collected as well -- presumably that is why the sample
  session reports 21 "new tweets" for a single page; confirm against the feed.
  """
  return [int(idNode.childNodes[0].data)
          for idNode in dom.getElementsByTagName('id')]


def getPage(user, page):
  """Return the raw XML of timeline page number *page* for *user*."""
  return _fetch(TIMELINE_URL + user + '.xml?page=' + str(page))


def getEpoch(str):
  """Convert a ``created_at`` timestamp to seconds since the Unix epoch.

  The parameter name shadows the builtin only to keep the original signature.
  The expected format is ``"%a %b %d %H:%M:%S +0000 %Y"``; the literal
  ``+0000`` offset means the value is interpreted as UTC.

  Raises:
    ValueError: if the text does not match the expected format.
  """
  return timegm(strptime(str, "%a %b %d %H:%M:%S +0000 %Y"))


def addEpochAttr(dom):
  """Add an ``epoch`` attribute to every ``created_at`` element of *dom*.

  The attribute holds the decimal text of getEpoch() applied to the element's
  first child node.  *dom* is modified in place.
  """
  for node in dom.getElementsByTagName('created_at'):
    node.setAttribute('epoch', '%d' % getEpoch(node.firstChild.data))


def debug(str):
  """Print *str* prefixed with ``DEBUG: `` when the DEBUG flag is set."""
  if DEBUG:
    print('DEBUG: ' + str)


def _reportNoMoreRequests():
  """Tell the user the rate limit is exhausted and when it resets."""
  print('Error: no more request.')
  print('Next reset time(UTC): ' + getResetTime())


def _writePage(logDir, page, dom):
  """Serialise *dom* as UTF-8 into a timestamped file and return its name."""
  path = strftime("%Y%m%d-%H%M%S", localtime(time())) + '-page' + '%04d' % page + '.xml'
  # toxml("utf-8") yields encoded bytes, hence binary mode; the context
  # manager guarantees the file is flushed and closed.
  with open(logDir + '/' + path, 'wb') as f:
    f.write(dom.toxml("utf-8"))
  return path


def main(argv=None):
  """Run the logger; *argv* defaults to ``sys.argv``."""
  if argv is None:
    argv = sys.argv

  if len(argv) < 2:
    print('Usage: ' + argv[0] + ' <twitter account name> [page offset]')
    return

  user = argv[1]
  print('twitter account name: ' + user)
  logDir = 'twitter-log-' + user

  # CONFIRM
  answer = _readLine('Now logging start, take several times. r u OK? [y/N]: ')
  if answer.lower() not in ('y', 'yes'):
    print('Aborted.')
    return

  # START LOGGING
  remain = getRemain()
  if remain <= 0:
    _reportNoMoreRequests()
    return

  if not exists(logDir):
    mkdir(logDir)
    debug('make dir: ' + logDir)

  idSet = set()
  page = int(argv[2]) if len(argv) > 2 else 1

  while True:
    dom = parseString(getPage(user, page))
    addEpochAttr(dom)
    debug('get log: page=' + str(page))

    newIdSet = set(getIds(dom)) - idSet
    idSet |= newIdSet
    debug('new tweet: ' + str(len(newIdSet)) + ', accum. tweet: ' + str(len(idSet)))

    remain = getRemain()
    debug('remain hits: ' + str(remain))

    if not newIdSet:
      break  # nothing new on this page: the end of the timeline was reached

    # Save the page before looking at the rate limit, so that a page fetched
    # with the very last remaining hit is not thrown away.
    debug('write file: ' + _writePage(logDir, page, dom))

    if remain <= 0:
      break

    debug('wait ' + str(SLEEP_TIME) + 'sec.')
    sleep(SLEEP_TIME)
    page += 1

  if remain <= 0:
    _reportNoMoreRequests()
  else:
    print('OK!')


if __name__ == '__main__':
  main()