Posted By

mustam on 08/27/09


Tagged

twitter logging


Versions (?)


Advertising

Website Promotion DIRECTORY is a crucial factor for all websites that need to gain better organic search engine rankings and increase website traffic.
Submitting your website to our SEO-friendly, high-traffic Business Directory for review as part of your Web Promotion strategy is an excellent way to gain a valuable backlink and increase your website's visibility online.

Submit Site


Who likes this?

1 person has marked this snippet as a favorite

gpupo


Backup my twitter-log


Published in: Python 






e.g.

$ twlog.py user
twitter account name: user
Now logging start, take several times. r u OK? [y/N]: y
DEBUG: make dir: twitter-log-user
DEBUG: get log: page=1
DEBUG: new tweet: 21, accum. tweet: 21
DEBUG: remain hits: 149
DEBUG: write file: 20090828-105421-page0001.xml
DEBUG: wait 3sec.
...
DEBUG: get log: page=7
DEBUG: new tweet: 0, accum. tweet: 115
DEBUG: remain hits: 143
OK!
Expand | Embed | Plain Text
  1. # vim: ts=2 sw=2 expandtab
  2. import sys
  3. from os import mkdir
  4. from os.path import exists
  5. import time
  6. from urllib2 import urlopen
  7. from xml.dom import Node
  8. from xml.dom.minidom import parseString
  9. from time import strptime, strftime, localtime, time, sleep
  10. from calendar import timegm
  11.  
  12. debug=True
  13.  
  14. def getRemain():
  15. url='http://twitter.com/account/rate_limit_status.xml'
  16. raw=urlopen(url).read()
  17. dom=parseString(raw)
  18. tag=dom.getElementsByTagName('remaining-hits').item(0)
  19. remain=int(tag.firstChild.data)
  20. return remain
  21.  
  22. def getResetTime():
  23. url='http://twitter.com/account/rate_limit_status.xml'
  24. raw=urlopen(url).read()
  25. dom=parseString(raw)
  26. tag=dom.getElementsByTagName('reset-time').item(0)
  27. resetTime=tag.firstChild.data
  28. return resetTime
  29.  
  30. def getIds(dom):
  31. ids=[]
  32. for idNode in dom.getElementsByTagName('id'):
  33. ids.append(int(idNode.childNodes[0].data))
  34. return ids
  35.  
  36. def getPage(user, page):
  37. url='http://twitter.com/statuses/user_timeline/'
  38. rawData=urlopen(url+user+'.xml?page='+str(page)).read()
  39. return rawData
  40.  
  41. def getEpoch(str):
  42. dd = strptime(str, "%a %b %d %H:%M:%S +0000 %Y")
  43. return timegm(dd)
  44.  
  45. def addEpochAttr(dom):
  46. ds=dom.getElementsByTagName('created_at')
  47. for d in ds:
  48. dtext=d.firstChild.data
  49. depoch=getEpoch(dtext)
  50. d.setAttribute('epoch', str(depoch))
  51.  
  52. def debug(str):
  53. if debug: print 'DEBUG: '+str
  54.  
  55. ### MAIN ####################################################
  56. if(len(sys.argv)<2):
  57. print 'Usage: '+sys.argv[0]+' <twitter account name> [page offset]'
  58. else:
  59. user=sys.argv[1]
  60. print 'twitter account name: '+user
  61. dir='twitter-log-'+user
  62. sleepTime=3
  63.  
  64. # CONFIRM
  65. s=raw_input('Now logging start, take several times. r u OK? [y/N]: ')
  66.  
  67. if s.lower()=='y' or s.lower()=='yes':
  68. # START LOGGING
  69. remain=getRemain()
  70. if(remain>0):
  71. if(not exists(dir)): mkdir(dir) # make dir
  72. debug('make dir: '+dir)
  73. idSet=set() # init
  74. if(len(sys.argv)>2): page=int(sys.argv[2])
  75. else: page=1
  76. pageStr='%04d'%page
  77. rawData=getPage(user, page) # get log
  78. dom=parseString(rawData)
  79. addEpochAttr(dom)
  80. debug('get log: page='+str(page))
  81. newIdSet=set(getIds(dom))-idSet
  82. idSet=idSet|newIdSet
  83. debug('new tweet: '+str(len(newIdSet))+', accum. tweet: '+str(len(idSet)))
  84. remain=getRemain()
  85. debug('remain hits: '+str(remain))
  86.  
  87. while(len(newIdSet)>0 and remain>0):
  88. rawData=dom.toxml("utf-8")
  89. path=strftime("%Y%m%d-%H%M%S", localtime(time()))+'-page'+pageStr+'.xml'
  90. f=open(dir+'/'+path, 'w')
  91. f.write(rawData)
  92. debug('write file: '+path)
  93.  
  94. debug('wait '+str(sleepTime)+'sec.')
  95. sleep(sleepTime); # wait
  96.  
  97. page+=1
  98. pageStr='%04d'%page
  99. rawData=getPage(user, page) # get log
  100. dom=parseString(rawData)
  101. addEpochAttr(dom)
  102. debug('get log: page='+str(page))
  103. newIdSet=set(getIds(dom))-idSet
  104. idSet=idSet|newIdSet
  105. debug('new tweet: '+str(len(newIdSet))+', accum. tweet: '+str(len(idSet)))
  106. remain=getRemain()
  107. debug('remain hits: '+str(remain))
  108.  
  109. if(remain==0):
  110. print 'Error: no more request.'
  111. print 'Next reset time(UTC): '+getResetTime()
  112. else: print 'OK!'
  113.  
  114. else:
  115. print 'Error: no more request.'
  116. print 'Next reset time(UTC): '+getResetTime()
  117. else: print 'Aborted.'

Report this snippet 

You need to login to post a comment.