Revision: 59321
at September 1, 2012 07:15 by scrapy
Initial Code
# This script converts a Scrapy log file into an HTML page showing the crawl
# tree. Errors are tagged red, 404 responses yellow, and offsite requests green.
#
# usage example:
#
#   $ python logview.py logfile
#
# output is written to t.html
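#
# For reference, the regexes below expect Scrapy 0.x-style log lines such as
# these (the exact format is an assumption and varies between Scrapy versions):
#
#   2011-09-22 10:10:10+0200 [mybot] DEBUG: Crawled (200) <GET http://example.com/> (referer: None)
#   2011-09-22 10:10:11+0200 [mybot] DEBUG: Filtered offsite request to 'other.com': <GET http://other.com/page>
#   2011-09-22 10:10:12+0200 [mybot] INFO: Scrapy 0.12 started (bot: mybot)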
import fileinput, re, os
from collections import defaultdict
header = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<style type="text/css">
.father{
background-color:#E7EFFF;
cursor:pointer;
margin:1px;
}
.add{
background-color:#A5CBF7;
text-align:center;
padding-left:0.5em;
padding-right:0.5em;
float:left;
cursor:pointer;
}
.child{
margin-left:20px;
display:none;
}
.family{
margin-left:20px;
display:none;
}
.r404{
background-color:#FFFF99;
}
.roffsite{
background-color:#CCFF99;
}
.rerror{
background-color:pink;
}
</style>
<script type="text/javascript">
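// show(e): reveal every element sibling of e inside its .family container
// and flip the leading "+" marker on e to "-".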
function show(e){
var c = e.parentNode;
c = c.childNodes;
for (var i = 0; i < c.length; i++)
if (c[i].nodeType == 1 && c[i] !== e) c[i].style.display = 'block';
e.firstChild.innerHTML = '-';
}
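// hide(e): collapse every element sibling of e and flip the marker back to "+".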
function hide(e){
var c = e.parentNode;
c = c.childNodes;
for (var i = 0; i < c.length; i++)
if (c[i].nodeType == 1 && c[i] !== e) c[i].style.display = 'none';
e.firstChild.innerHTML = '+';
}
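// showhide(): click handler for .father rows; inspects the first element
// sibling it finds and toggles the whole group via show() or hide().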
function showhide(){
var c = this.parentNode.childNodes;
for (var i = 0; i < c.length; i++){
if (c[i].nodeType == 1 && c[i] != this){
if (c[i].style.display == 'block') {
hide(this);
break;
}else {
show(this);
break;
}
}
}
}
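// isMember(element, classname): true if classname appears in the element's
// space-separated className; a hand-rolled classList.contains().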
function isMember(element,classname){
var classes = element.className;
if (!classes) return false;
if (classes == classname) return true;
var whitespace = /\s+/;
if (!whitespace.test(classes)) return false;
var c = classes.split(whitespace);
for (var i = 0; i < c.length; i++){
if (c[i] == classname) return true;
}
return false;
}
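// getElements(classname, tagname, root): collect the descendants of root
// (default: the whole document) that match tagname and, if given, classname.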
function getElements(classname, tagname, root){
if (!root) root = document;
else if (typeof root == 'string') root = document.getElementById(root);
if (!tagname) tagname = '*';
var all = root.getElementsByTagName(tagname);
if (!classname) return all;
var elements = [];
for (var i = 0; i < all.length; i++){
var element = all[i];
if (isMember(element, classname))
elements.push(element);
}
return elements;
}
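// init(): un-hide the top-level blocks and attach the showhide toggle to
// every .father row.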
function init(){
var f = document.body.childNodes;
for (var i=0; i < f.length; i++){
if (f[i].nodeType == 1) {
f[i].style.display = 'block';
}
}
var f = getElements('father');
for (var i=0;i<f.length;i++){
if (f[i].nodeType==1) f[i].onclick = showhide;
}
}
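// openall()/closeall(): expand or collapse every .father node at once.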
function openall(){
var t = getElements('father');
for (var i = 0; i < t.length; i++){
if (t[i].nodeType == 1) show(t[i]);
}
}
function closeall(){
var t = getElements('father');
for (var i = 0; i < t.length; i++){
if (t[i].nodeType == 1) hide(t[i]);
}
}
window.onload = function(){
init();
};
</script>
</head>
<body>
<div class="family">
<button onclick="openall();">expand all</button><button onclick="closeall();">close all</button><div>red: error, green: offsite, blue: expandable, yellow: 404 not found</div>
</div>'''
footer = '''</body></html>'''
content = ''
outfile = open('t.html', 'w')  # renamed from `file` to avoid shadowing the builtin
class Crawl:
    def __init__(self, referer='None', url=None, status=None):
        self.referer = referer
        self.url = url
        self.status = status
        self.error = None
    def set_error(self, error):
        # renamed from error() so the stored attribute cannot shadow the method
        self.error = error
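# print_urls: recursively append nested <div> markup to the global `content`
# string for every crawl whose referer is `referer`; URLs that are themselves
# referers become expandable .family nodes, the rest become .child leaves.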
def print_urls(crawllist, referer):
    global content  # the only global this function rebinds
crawls = crawllist[referer]
for crawl in crawls:
if crawl.url in crawllist:
content = content + '<div class="family"><div class="father"><div class="add">+</div><a href="'+str(crawl.url)+'">' + str(crawl.url) + '</a></div>'
print_urls(crawllist, crawl.url)
content = content +'</div>'
else:
            if crawl.status is None:
                c = ''
            else:
                c = 'r' + crawl.status
            if crawl.error is not None:
                content = content + '<div class="child ' + c + '"><a href="' + str(crawl.url) + '">' + str(crawl.url) + '</a><div class="errordetail">' + crawl.error + '</div></div>'
            else:
                print(crawl.url)  # debug trace of leaf URLs
                content = content + '<div class="child ' + c + '"><a href="' + str(crawl.url) + '">' + str(crawl.url) + '</a></div>'
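# main: scan the log file named on the command line, build Crawl records,
# split them into runs at each "Scrapy ... started" banner, and write the
# resulting tree to t.html.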
def main():
    global outfile, header, content, footer
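    # log-line patterns: a crawled page (status/url/referer), an offsite
    # filter notice, and the start-of-run banner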
crawl_re = re.compile(r'\((.*?)\) <GET (.*?)> \(referer: (.*?)\)')
offsite = re.compile(r'Filtered offsite .* <GET (.*?)>')
process_error = 'Error processing'
spider_error = 'Spider error'
crawl_start_re = re.compile(r'Scrapy .* started')
    allurls = defaultdict(list)
    currentCrawl = None
    error = None
    collect = False  # True while accumulating a multi-line error traceback
    crawllist = []
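    # walk the log line by line: each new match closes out the previous
    # Crawl record; after an error line, collect stays True so subsequent
    # traceback lines are appended to the error text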
for l in fileinput.input():
r = crawl_re.search(l)
if r:
            if currentCrawl is not None:
                currentCrawl.set_error(error)
                crawllist.append(currentCrawl)
collect = False
error = None
t = Crawl(r.group(3), r.group(2), r.group(1))
currentCrawl = t
continue
r = offsite.search(l)
if r:
            if currentCrawl is not None:
                currentCrawl.set_error(error)
                crawllist.append(currentCrawl)
collect = False
error = None
            print(r.groups())  # debug trace of offsite filter matches
t = Crawl(url = r.group(1), status='offsite')
currentCrawl = t
continue
r = crawl_start_re.search(l)
if r:
            if currentCrawl is not None:
                currentCrawl.set_error(error)
                crawllist.append(currentCrawl)
collect = False
error = None
t = Crawl(status='started')
currentCrawl = t
continue
        if process_error in l or spider_error in l:
            collect = True
            if currentCrawl is not None:
                currentCrawl.status = 'error'
            error = l
            continue
        if collect:
            error += l
    # flush the final record, which the loop above never gets to append
    if currentCrawl is not None:
        currentCrawl.set_error(error)
        crawllist.append(currentCrawl)
    worklist = []
for i in crawllist:
if i.status == 'started':
            # a fresh "Scrapy ... started" banner opens a new run: close out
            # the current referer map and begin another
worklist.append(allurls)
allurls = defaultdict(list)
allurls[i.referer] += [i]
worklist.append(allurls)
for i in worklist:
print_urls(i, 'None')
    outfile.write(header + content + footer)
    outfile.close()
def _test():
import doctest
doctest.testmod(verbose=True)
if __name__=='__main__':
main()
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: outofthink
# date : Sep 22, 2011
Initial URL
Initial Description
Initial Title
make and filter the log file into an HTML file
Initial Tags
html, script, log
Initial Language
Python