Posted By

scrapy on 09/01/12


Tagged

html script log scrapy


Versions (?)

Convert and filter a Scrapy log file into an HTML file


 / Published in: Python
 

  1. # This script converts a Scrapy log file into an HTML file and generates a crawl tree. Errors are tagged red, 404 responses yellow, and offsite requests green.
  2. #
  3. # usage example:
  4. #
  5. # $logview.py logfile
  6. #
  7. # output is t.html
  8.  
  9. import fileinput, re, os
  10. from collections import defaultdict
  11.  
  12. header ='''<!DOCTYPE html public "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  13. <html>
  14. <head>
  15. <style type="text/css">
  16. .father{
  17. background-color:#E7EFFF;
  18. cursor:pointer;
  19. margin:1px;
  20. }
  21. .add{
  22. background-color:#A5CBF7;
  23. text-align:center;
  24. padding-left:0.5em;
  25. padding-right:0.5em;
  26. float:left;
  27. cursor:pointer;
  28. }
  29. .child{
  30. margin-left:20px;
  31. display:none;
  32. }
  33. .family{
  34. margin-left:20px;
  35. display:none;
  36. }
  37. .r404{
  38. background-color:#FFFF99;
  39. }
  40. .roffsite{
  41. background-color:#CCFF99;
  42. }
  43. .rerror{
  44. background-color:pink;
  45. }
  46. </style>
  47. <script language="javascript">
  48. function show(e){
  49. var c = e.parentNode;
  50. c = c.childNodes;
  51. for (var i = 0; i < c.length; i++)
  52. if (c[i].nodeType == 1 && c[i] !== e) c[i].style.display = 'block';
  53. e.firstChild.innerHTML = '-';
  54. }
  55. function hide(e){
  56. var c = e.parentNode;
  57. c = c.childNodes;
  58. for (var i = 0; i < c.length; i++)
  59. if (c[i].nodeType == 1 && c[i] !== e) c[i].style.display = 'none';
  60. e.firstChild.innerHTML = '+';
  61. }
  62. function showhide(){
  63. var c = this.parentNode.childNodes;
  64. for (var i = 0; i < c.length; i++){
  65. if (c[i].nodeType == 1 && c[i] != this){
  66. if (c[i].style.display == 'block') {
  67. hide(this);
  68. break;
  69. }else {
  70. show(this);
  71. break;
  72. }
  73. }
  74. }
  75. }
  76. function isMember(element,classname){
  77. var classes = element.className;
  78. if (!classes) return false;
  79. if (classes == classname) return true;
  80.  
  81. var whitespace = /\s+/;
  82. if (!whitespace.test(classes)) return false;
  83.  
  84. var c = classes.split(whitespace);
  85. for (var i = 0; i < c.length; i++){
  86. if (c[i] == classname) return true;
  87. }
  88. return false;
  89. }
  90. function getElements(classname, tagname, root){
  91. if (!root) root = document;
  92. else if (typeof root == 'string') root = doucment.getElementById(root);
  93. if (!tagname) tagname = '*';
  94. var all = root.getElementsByTagName(tagname);
  95. if (!classname) return all;
  96.  
  97. var elements = [];
  98. for (var i = 0; i < all.length; i++){
  99. var element = all[i];
  100. if (isMember(element, classname))
  101. elements.push(element);
  102. }
  103. return elements;
  104. }
  105. function init(){
  106. var f = document.body.childNodes;
  107. for (var i=0; i < f.length; i++){
  108. if (f[i].nodeType == 1) {
  109. f[i].style.display = 'block';
  110. }
  111. }
  112. var f = getElements('father')
  113. for (var i=0;i<f.length;i++){
  114. if (f[i].nodeType==1) f[i].onclick = showhide;
  115. }
  116. }
  117. function openall(){
  118. var t = getElements('father');
  119. for (var i = 0; i < t.length; i++){
  120. if (t[i].nodeType == 1) show(t[i]);
  121. }
  122. }
  123. function closeall(){
  124. var t = getElements('father');
  125. for (var i = 0; i < t.length; i++){
  126. if (t[i].nodeType == 1) hide(t[i]);
  127. }
  128. }
  129. window.onload = function(){
  130. init();
  131. };
  132. </script>
  133. </head>
  134. <body>
  135. <div class="family">
  136. <button onclick="openall();">expand all</button><button onclick="closeall();">close all</button><div>red:error green:offsite blue:can expand yellow:not find</div>
  137. </div>'''
  138. footer = '''</body></html>'''
  139. content = ''
  140. file = open('t.html','w')
  141.  
  142. class Crawl:
  143. def __init__(self,referer = 'None',url = None, status = None):
  144. self.referer = referer
  145. self.url = url
  146. self.status = status
  147. def error(self,error):
  148. self.error = error
  149.  
  150. def print_urls(crawllist, referer):
  151. global file, header, content, footer
  152. crawls = crawllist[referer]
  153. for crawl in crawls:
  154. if crawl.url in crawllist:
  155. content = content + '<div class="family"><div class="father"><div class="add">+</div><a href="'+str(crawl.url)+'">' + str(crawl.url) + '</a></div>'
  156. print_urls(crawllist, crawl.url)
  157. content = content +'</div>'
  158. else:
  159. if crawl.status == None:
  160. c = ''
  161. else:
  162. c = 'r'+crawl.status
  163. if crawl.error != None:
  164. content = content +'<div class="child '+c+'"><a href="'+str(crawl.url)+'">'+str(crawl.url)+'</a><div class="errordetail">'+crawl.error+'</div></div>'
  165. else:
  166. print crawl.url
  167. content = content +'<div class="child '+c+'"><a href="'+str(crawl.url)+'">'+str(crawl.url)+'</a></div>'
  168.  
  169. def main():
  170. global file, header, content, footer
  171. crawl_re = re.compile(r'\((.*?)\) <GET (.*?)> \(referer: (.*?)\)')
  172. offsite = re.compile(r'Filtered offsite .* <GET (.*?)>')
  173. process_error = 'Error processing'
  174. spider_error = 'Spider error'
  175. crawl_start_re = re.compile(r'Scrapy .* started')
  176. allurls = defaultdict(list)
  177. currentCrawl = None
  178. error = None
  179. crawllist = []
  180. for l in fileinput.input():
  181. r = crawl_re.search(l)
  182. if r:
  183. if currentCrawl != None:
  184. currentCrawl.error(error)
  185. crawllist.append(currentCrawl)
  186. collect = False
  187. error = None
  188. t = Crawl(r.group(3), r.group(2), r.group(1))
  189. currentCrawl = t
  190. continue
  191. r = offsite.search(l)
  192. if r:
  193. if currentCrawl != None:
  194. currentCrawl.error(error)
  195. crawllist.append(currentCrawl)
  196. collect = False
  197. error = None
  198. print r.groups()
  199. t = Crawl(url = r.group(1), status='offsite')
  200. currentCrawl = t
  201. continue
  202. r = crawl_start_re.search(l)
  203. if r:
  204. if currentCrawl != None:
  205. currentCrawl.error(error)
  206. crawllist.append(currentCrawl)
  207. collect = False
  208. error = None
  209. t = Crawl(status='started')
  210. currentCrawl = t
  211. continue
  212. if process_error in l:
  213. collect = True
  214. currentCrawl.status = 'error'
  215. error = l
  216. continue
  217. if spider_error in l:
  218. collect = True
  219. currentCrawl.status = 'error'
  220. error = l
  221. continue
  222. if collect == True:
  223. error += l
  224. start = None
  225. worklist = []
  226. for i in crawllist:
  227. if i.status == 'started':
  228. #new block
  229. worklist.append(allurls)
  230. allurls = defaultdict(list)
  231. allurls[i.referer] += [i]
  232. worklist.append(allurls)
  233. for i in worklist:
  234. print_urls(i, 'None')
  235. file.writelines(header+content+footer)
  236.  
  237. def _test():
  238. import doctest
  239. doctest.testmod(verbose=True)
  240.  
  241. if __name__=='__main__':
  242. main()
  243.  
  244. # Snippet imported from snippets.scrapy.org (which no longer works)
  245. # author: outofthink
  246. # date : Sep 22, 2011
  247.  

Report this snippet  

You need to login to post a comment.