Revision: 59332
at September 1, 2012 07:15 by scrapy

Initial Code
"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting. This version uses
the improved robotsexclusionrulesparse which can handle GYM2008 wildcards and
such.
Also note, this middleware implicitly assumes one spider will crawl one domain
with one robots.txt file.  This may or may not be true for your application.
Using this approach, the robots.txt file is downloaded only once for each spider
type and fewer page requests that violate robots.txt occur
"""
from robotexclusionrulesparser import RobotExclusionRulesParser
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals, log
from scrapy.project import crawler
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.conf import settings


class CreepyRobotsTxt(object):
    # Priority used for robots.txt requests so they are downloaded ahead of
    # regular page requests.
    DOWNLOAD_PRIORITY = 1000

    def __init__(self):
        if not settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self._parsers = {}          # netloc -> parser (None while the download is pending)
        self._spider_netlocs = {}   # spider name -> set of netlocs seen
        self._useragents = {}       # spider name -> USER_AGENT string
        dispatcher.connect(self.spider_opened, signals.spider_opened)

    def process_request(self, request, spider):
        if spider.settings.getbool('ROBOTSTXT_OBEY'):
            useragent = self._useragents[spider.name]
            rp = self.robot_parser(request, spider)
            if rp and not rp.is_allowed(useragent, request.url):
                log.msg("Forbidden by robots.txt: %s" % request,
                        level=log.ERROR, spider=spider)
                raise IgnoreRequest

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc
        if netloc not in self._parsers:
            # First request for this host: schedule a robots.txt download and
            # return None until the parser is available.
            self._parsers[netloc] = None
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
            robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
            dfd = crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots)
            self._spider_netlocs[spider.name].add(netloc)
        return self._parsers[netloc]

    def _parse_robots(self, response):
        rp = RobotExclusionRulesParser()
        rp.parse(response.body)
        self._parsers[urlparse_cached(response).netloc] = rp

    def spider_opened(self, spider):
        if spider.name not in self._spider_netlocs:
            self._spider_netlocs[spider.name] = set()
            self._useragents[spider.name] = spider.settings['USER_AGENT']
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: kurtjx
# date  : Apr 12, 2011
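# Usage sketch (an assumption, not part of the original snippet): save this class
# somewhere on your Python path (the module path 'myproject.middlewares' below is
# a placeholder) and enable it in settings.py, disabling Scrapy's built-in
# robots.txt middleware so the two do not overlap:
#
#     DOWNLOADER_MIDDLEWARES = {
#         'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': None,
#         'myproject.middlewares.CreepyRobotsTxt': 100,
#     }
#     ROBOTSTXT_OBEY = True
#
# The parser dependency can be installed with: pip install robotexclusionrulesparser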
Initial URL
Initial Description
Initial Title
robot exclusion rules parser
Initial Tags
Initial Language
Python