Script to print crawl tree of a spider run


/ Published in: Python
Save to your folder(s)



Copy this code and paste it in your HTML
  1. # This is a script to print the crawl tree of a spider run.
  2. #
  3. # Usage example:
  4. #
  5. # $ python ctree.py myspider.log
  6. # None
  7. # http://www.example.com/start_page1
  8. # http://www.example.com/second_page
  9. # http://www.example.com/another_page
  10. # None
  11. # http://www.example.com/start_page2
  12. # http://www.example.com/yet_another_page
  13.  
  14. #!/usr/bin/env python
  15.  
  16. import fileinput, re
  17. from collections import defaultdict
  18.  
  19. def print_urls(allurls, referer, indent=0):
  20. urls = allurls[referer]
  21. for url in urls:
  22. print ' '*indent + referer
  23. if url in allurls:
  24. print_urls(allurls, url, indent+2)
  25.  
  26. def main():
  27. log_re = re.compile(r'<GET (.*?)> \(referer: (.*?)\)')
  28. allurls = defaultdict(list)
  29. for l in fileinput.input():
  30. m = log_re.search(l)
  31. if m:
  32. url, ref = m.groups()
  33. allurls[ref] += [url]
  34. print_urls(allurls, 'None')
  35.  
  36. main()
  37.  
  38. # Snippet imported from snippets.scrapy.org (which no longer works)
  39. # author: pablo
  40. # date : Sep 15, 2011
  41.  

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.