Merge Twitter log files into a single XML file


/ Published in: Python
Save to your folder(s)

Example usage:
<pre><code>
$ twlogmerge.py twitter-log-user-dir
read file: twitter-log-user-dir/page001.xml
get status: id: 123456789
append status: accum.:1234
...
append status: accum.:3456
write file: twitter-log-user-dir-marged-20090828-130146.xml
</code></pre>

* cf. [Backup my twitter-log](http://snipplr.com/view/18925/backup-my-twitterlog/)

* cf. [Stylesheet for twitter-log file](http://snipplr.com/view/18927/stylesheet-for-twitterlog-file/)


Copy this code and paste it in your HTML
  1. import os
  2. import os.path
  3. import sys
  4. from codecs import open
  5. from time import strftime, localtime, time
  6. from xml.dom.minidom import parse, parseString
  7.  
  8. initXmlStr='''\
  9. <?xml version="1.0" encoding="UTF-8"?>
  10. <?xml-stylesheet type="text/xsl" href="viewstyle.xsl"?>
  11. <statuses></statuses>'''
  12.  
  13. if len(sys.argv)<2:
  14. print 'Usage: '+sys.argv[0]+' <log-dir>'
  15. elif not os.path.exists(sys.argv[1]):
  16. print 'No such file or directory'
  17. else:
  18. dir=sys.argv[1]
  19. print 'dir: '+dir
  20.  
  21. # init
  22. dom=parseString(initXmlStr)
  23. statuses=dom.getElementsByTagName('statuses').item(0)
  24. ids=set();
  25. # read log-files
  26. for path in os.listdir(dir):
  27. path=dir+'/'+path
  28. print 'read file: '+path
  29. localDom=parse(path)
  30. localStatuses=localDom.getElementsByTagName('status')
  31. # append statuses
  32. for status in localStatuses:
  33. idNode=status.getElementsByTagName('id').item(0)
  34. id=int(idNode.firstChild.data)
  35. print 'get status: id: '+str(id)
  36. if id not in ids:
  37. statuses.appendChild(status)
  38. ids.add(id)
  39. print 'append status: accum.:'+str(len(ids))
  40. else: print 'not append status'
  41. # write marged-file
  42. path=dir+'-marged-'+strftime("%Y%m%d-%H%M%S", localtime(time()))+'.xml'
  43. f=open(path,'w','utf-8')
  44. dom.writexml(f)
  45. print 'write file: '+path

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.