Posted By

mustam on 08/27/09


Tagged

twitter logging


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

gpupo


Backup my twitter-log


 / Published in: Python
 

e.g.

$ twlog.py user
twitter account name: user
Now logging start, take several times. r u OK? [y/N]: y
DEBUG: make dir: twitter-log-user
DEBUG: get log: page=1
DEBUG: new tweet: 21, accum. tweet: 21
DEBUG: remain hits: 149
DEBUG: write file: 20090828-105421-page0001.xml
DEBUG: wait 3sec.
...
DEBUG: get log: page=7
DEBUG: new tweet: 0, accum. tweet: 115
DEBUG: remain hits: 143
OK!

  1. # vim: ts=2 sw=2 expandtab
  2. import sys
  3. from os import mkdir
  4. from os.path import exists
  5. import time
  6. from urllib2 import urlopen
  7. from xml.dom import Node
  8. from xml.dom.minidom import parseString
  9. from time import strptime, strftime, localtime, time, sleep
  10. from calendar import timegm
  11.  
# NOTE(review): `def debug(str)` further down this file rebinds the name
# `debug`, so by the time any logging happens this flag has been replaced by
# the function object and its value is never consulted.
debug=True
  13.  
  14. def getRemain():
  15. url='http://twitter.com/account/rate_limit_status.xml'
  16. raw=urlopen(url).read()
  17. dom=parseString(raw)
  18. tag=dom.getElementsByTagName('remaining-hits').item(0)
  19. remain=int(tag.firstChild.data)
  20. return remain
  21.  
  22. def getResetTime():
  23. url='http://twitter.com/account/rate_limit_status.xml'
  24. raw=urlopen(url).read()
  25. dom=parseString(raw)
  26. tag=dom.getElementsByTagName('reset-time').item(0)
  27. resetTime=tag.firstChild.data
  28. return resetTime
  29.  
  30. def getIds(dom):
  31. ids=[]
  32. for idNode in dom.getElementsByTagName('id'):
  33. ids.append(int(idNode.childNodes[0].data))
  34. return ids
  35.  
  36. def getPage(user, page):
  37. url='http://twitter.com/statuses/user_timeline/'
  38. rawData=urlopen(url+user+'.xml?page='+str(page)).read()
  39. return rawData
  40.  
  41. def getEpoch(str):
  42. dd = strptime(str, "%a %b %d %H:%M:%S +0000 %Y")
  43. return timegm(dd)
  44.  
  45. def addEpochAttr(dom):
  46. ds=dom.getElementsByTagName('created_at')
  47. for d in ds:
  48. dtext=d.firstChild.data
  49. depoch=getEpoch(dtext)
  50. d.setAttribute('epoch', str(depoch))
  51.  
  52. def debug(str):
  53. if debug: print 'DEBUG: '+str
  54.  
  55. ### MAIN ####################################################
  56. if(len(sys.argv)<2):
  57. print 'Usage: '+sys.argv[0]+' <twitter account name> [page offset]'
  58. else:
  59. user=sys.argv[1]
  60. print 'twitter account name: '+user
  61. dir='twitter-log-'+user
  62. sleepTime=3
  63.  
  64. # CONFIRM
  65. s=raw_input('Now logging start, take several times. r u OK? [y/N]: ')
  66.  
  67. if s.lower()=='y' or s.lower()=='yes':
  68. # START LOGGING
  69. remain=getRemain()
  70. if(remain>0):
  71. if(not exists(dir)): mkdir(dir) # make dir
  72. debug('make dir: '+dir)
  73. idSet=set() # init
  74. if(len(sys.argv)>2): page=int(sys.argv[2])
  75. else: page=1
  76. pageStr='%04d'%page
  77. rawData=getPage(user, page) # get log
  78. dom=parseString(rawData)
  79. addEpochAttr(dom)
  80. debug('get log: page='+str(page))
  81. newIdSet=set(getIds(dom))-idSet
  82. idSet=idSet|newIdSet
  83. debug('new tweet: '+str(len(newIdSet))+', accum. tweet: '+str(len(idSet)))
  84. remain=getRemain()
  85. debug('remain hits: '+str(remain))
  86.  
  87. while(len(newIdSet)>0 and remain>0):
  88. rawData=dom.toxml("utf-8")
  89. path=strftime("%Y%m%d-%H%M%S", localtime(time()))+'-page'+pageStr+'.xml'
  90. f=open(dir+'/'+path, 'w')
  91. f.write(rawData)
  92. debug('write file: '+path)
  93.  
  94. debug('wait '+str(sleepTime)+'sec.')
  95. sleep(sleepTime); # wait
  96.  
  97. page+=1
  98. pageStr='%04d'%page
  99. rawData=getPage(user, page) # get log
  100. dom=parseString(rawData)
  101. addEpochAttr(dom)
  102. debug('get log: page='+str(page))
  103. newIdSet=set(getIds(dom))-idSet
  104. idSet=idSet|newIdSet
  105. debug('new tweet: '+str(len(newIdSet))+', accum. tweet: '+str(len(idSet)))
  106. remain=getRemain()
  107. debug('remain hits: '+str(remain))
  108.  
  109. if(remain==0):
  110. print 'Error: no more request.'
  111. print 'Next reset time(UTC): '+getResetTime()
  112. else: print 'OK!'
  113.  
  114. else:
  115. print 'Error: no more request.'
  116. print 'Next reset time(UTC): '+getResetTime()
  117. else: print 'Aborted.'

Report this snippet  

You need to login to post a comment.