Posted By

mustam on 08/28/09


Tagged

twitter logging


Versions (?)

Merge Twitter log files into a single XML file


 / Published in: Python
 

e.g.


$ twlogmerge.py twitter-log-user-dir
read file: twitter-log-user-dir/page001.xml
get status: id: 123456789
append status: accum.:1234
...
append status: accum.:3456
write file: twitter-log-user-dir-marged-20090828-130146.xml

  1. import os
  2. import os.path
  3. import sys
  4. from codecs import open
  5. from time import strftime, localtime, time
  6. from xml.dom.minidom import parse, parseString
  7.  
  8. initXmlStr='''\
  9. <?xml version="1.0" encoding="UTF-8"?>
  10. <?xml-stylesheet type="text/xsl" href="viewstyle.xsl"?>
  11. <statuses></statuses>'''
  12.  
  13. if len(sys.argv)<2:
  14. print 'Usage: '+sys.argv[0]+' <log-dir>'
  15. elif not os.path.exists(sys.argv[1]):
  16. print 'No such file or directory'
  17. else:
  18. dir=sys.argv[1]
  19. print 'dir: '+dir
  20.  
  21. # init
  22. dom=parseString(initXmlStr)
  23. statuses=dom.getElementsByTagName('statuses').item(0)
  24. ids=set();
  25. # read log-files
  26. for path in os.listdir(dir):
  27. path=dir+'/'+path
  28. print 'read file: '+path
  29. localDom=parse(path)
  30. localStatuses=localDom.getElementsByTagName('status')
  31. # append statuses
  32. for status in localStatuses:
  33. idNode=status.getElementsByTagName('id').item(0)
  34. id=int(idNode.firstChild.data)
  35. print 'get status: id: '+str(id)
  36. if id not in ids:
  37. statuses.appendChild(status)
  38. ids.add(id)
  39. print 'append status: accum.:'+str(len(ids))
  40. else: print 'not append status'
  41. # write marged-file
  42. path=dir+'-marged-'+strftime("%Y%m%d-%H%M%S", localtime(time()))+'.xml'
  43. f=open(path,'w','utf-8')
  44. dom.writexml(f)
  45. print 'write file: '+path

Report this snippet  

You need to login to post a comment.