Back up my twitter-log


/ Published in: Python
Save to your folder(s)

e.g.
<pre>
$ twlog.py user
twitter account name: user
Now logging start, take several times. r u OK? [y/N]: y
DEBUG: make dir: twitter-log-user
DEBUG: get log: page=1
DEBUG: new tweet: 21, accum. tweet: 21
DEBUG: remain hits: 149
DEBUG: write file: 20090828-105421-page0001.xml
DEBUG: wait 3sec.
...
DEBUG: get log: page=7
DEBUG: new tweet: 0, accum. tweet: 115
DEBUG: remain hits: 143
OK!
</pre>

* cf. [Stylesheet for twitter-log file](http://snipplr.com/view/18927/stylesheet-for-twitterlog-file/)

* cf. [Merge twitter-logged files into one xml-file](http://snipplr.com/view/18928/merge-twitterlogged-files-into-one-xmlfile/)


Copy this code and paste it in your HTML
  1. # vim: ts=2 sw=2 expandtab
  2. import sys
  3. from os import mkdir
  4. from os.path import exists
  5. import time
  6. from urllib2 import urlopen
  7. from xml.dom import Node
  8. from xml.dom.minidom import parseString
  9. from time import strptime, strftime, localtime, time, sleep
  10. from calendar import timegm
  11.  
  12. debug=True
  13.  
  14. def getRemain():
  15. url='http://twitter.com/account/rate_limit_status.xml'
  16. raw=urlopen(url).read()
  17. dom=parseString(raw)
  18. tag=dom.getElementsByTagName('remaining-hits').item(0)
  19. remain=int(tag.firstChild.data)
  20. return remain
  21.  
  22. def getResetTime():
  23. url='http://twitter.com/account/rate_limit_status.xml'
  24. raw=urlopen(url).read()
  25. dom=parseString(raw)
  26. tag=dom.getElementsByTagName('reset-time').item(0)
  27. resetTime=tag.firstChild.data
  28. return resetTime
  29.  
  30. def getIds(dom):
  31. ids=[]
  32. for idNode in dom.getElementsByTagName('id'):
  33. ids.append(int(idNode.childNodes[0].data))
  34. return ids
  35.  
  36. def getPage(user, page):
  37. url='http://twitter.com/statuses/user_timeline/'
  38. rawData=urlopen(url+user+'.xml?page='+str(page)).read()
  39. return rawData
  40.  
  41. def getEpoch(str):
  42. dd = strptime(str, "%a %b %d %H:%M:%S +0000 %Y")
  43. return timegm(dd)
  44.  
  45. def addEpochAttr(dom):
  46. ds=dom.getElementsByTagName('created_at')
  47. for d in ds:
  48. dtext=d.firstChild.data
  49. depoch=getEpoch(dtext)
  50. d.setAttribute('epoch', str(depoch))
  51.  
  52. def debug(str):
  53. if debug: print 'DEBUG: '+str
  54.  
  55. ### MAIN ####################################################
  56. if(len(sys.argv)<2):
  57. print 'Usage: '+sys.argv[0]+' <twitter account name> [page offset]'
  58. else:
  59. user=sys.argv[1]
  60. print 'twitter account name: '+user
  61. dir='twitter-log-'+user
  62. sleepTime=3
  63.  
  64. # CONFIRM
  65. s=raw_input('Now logging start, take several times. r u OK? [y/N]: ')
  66.  
  67. if s.lower()=='y' or s.lower()=='yes':
  68. # START LOGGING
  69. remain=getRemain()
  70. if(remain>0):
  71. if(not exists(dir)): mkdir(dir) # make dir
  72. debug('make dir: '+dir)
  73. idSet=set() # init
  74. if(len(sys.argv)>2): page=int(sys.argv[2])
  75. else: page=1
  76. pageStr='%04d'%page
  77. rawData=getPage(user, page) # get log
  78. dom=parseString(rawData)
  79. addEpochAttr(dom)
  80. debug('get log: page='+str(page))
  81. newIdSet=set(getIds(dom))-idSet
  82. idSet=idSet|newIdSet
  83. debug('new tweet: '+str(len(newIdSet))+', accum. tweet: '+str(len(idSet)))
  84. remain=getRemain()
  85. debug('remain hits: '+str(remain))
  86.  
  87. while(len(newIdSet)>0 and remain>0):
  88. rawData=dom.toxml("utf-8")
  89. path=strftime("%Y%m%d-%H%M%S", localtime(time()))+'-page'+pageStr+'.xml'
  90. f=open(dir+'/'+path, 'w')
  91. f.write(rawData)
  92. debug('write file: '+path)
  93.  
  94. debug('wait '+str(sleepTime)+'sec.')
  95. sleep(sleepTime); # wait
  96.  
  97. page+=1
  98. pageStr='%04d'%page
  99. rawData=getPage(user, page) # get log
  100. dom=parseString(rawData)
  101. addEpochAttr(dom)
  102. debug('get log: page='+str(page))
  103. newIdSet=set(getIds(dom))-idSet
  104. idSet=idSet|newIdSet
  105. debug('new tweet: '+str(len(newIdSet))+', accum. tweet: '+str(len(idSet)))
  106. remain=getRemain()
  107. debug('remain hits: '+str(remain))
  108.  
  109. if(remain==0):
  110. print 'Error: no more request.'
  111. print 'Next reset time(UTC): '+getResetTime()
  112. else: print 'OK!'
  113.  
  114. else:
  115. print 'Error: no more request.'
  116. print 'Next reset time(UTC): '+getResetTime()
  117. else: print 'Aborted.'

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.