
Revision: 59332
at September 1, 2012 07:15 by scrapy


Initial Code

"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting. This version uses
the improved robotsexclusionrulesparse which can handle GYM2008 wildcards and
such.

Also note, this middleware implicitly assumes one spider will crawl one domain
with one robots.txt file.  This may or may not be true for your application.
Using this approach, the robots.txt file is downloaded only once for each spider
type and fewer page requests that violate robots.txt occur

"""

from robotexclusionrulesparser import RobotExclusionRulesParser

from scrapy.xlib.pydispatch import dispatcher

from scrapy import signals, log
from scrapy.project import crawler
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.conf import settings

class CreepyRobotsTxt(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self):
        if not settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self._parsers = {}
        self._spider_netlocs = {}
        self._useragents = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)

    def process_request(self, request, spider):
        # Drop any request whose URL is disallowed by the target site's
        # robots.txt for this spider's user agent.
        if spider.settings.getbool('ROBOTSTXT_OBEY'):
            useragent = self._useragents[spider.name]
            rp = self.robot_parser(request, spider)
            if rp and not rp.is_allowed(useragent, request.url):
                log.msg("Forbidden by robots.txt: %s" % request,
                        level=log.ERROR, spider=spider)
                raise IgnoreRequest

    def robot_parser(self, request, spider):
        # Return the cached parser for this request's netloc. The first time a
        # netloc is seen, schedule a download of its robots.txt; until that
        # download completes the cached entry is None and requests to that
        # netloc are not filtered.
        url = urlparse_cached(request)
        netloc = url.netloc
        if netloc not in self._parsers:
            self._parsers[netloc] = None
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
            robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
            dfd = crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots)
            self._spider_netlocs[spider.name].add(netloc)
        return self._parsers[netloc]

    def _parse_robots(self, response):
        # Parse the downloaded robots.txt body and cache the parser under the
        # response's netloc.
        rp = RobotExclusionRulesParser()
        rp.parse(response.body)
        self._parsers[urlparse_cached(response).netloc] = rp

    def spider_opened(self, spider):
        # Initialize per-spider state the first time a spider with this name
        # is opened.
        if spider.name not in self._spider_netlocs:
            self._spider_netlocs[spider.name] = set()
            self._useragents[spider.name] = spider.settings['USER_AGENT']
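
# A minimal, standalone sketch (not part of the original snippet) illustrating
# why robotexclusionrulesparser is used here: it understands GYM2008-style
# "*" and "$" wildcards in Disallow rules. The robots.txt content and URLs
# below are made up for illustration, and the block only runs when this file
# is executed directly.
if __name__ == "__main__":
    demo_parser = RobotExclusionRulesParser()
    demo_parser.parse(
        "User-agent: *\n"
        "Disallow: /private*/\n"
        "Disallow: /*.pdf$\n"
    )
    # Should be False: matches the "/private*/" wildcard rule.
    print(demo_parser.is_allowed("mybot", "http://example.com/private-data/page.html"))
    # Should be False: matches the end-anchored "/*.pdf$" rule.
    print(demo_parser.is_allowed("mybot", "http://example.com/docs/report.pdf"))
    # Should be True: no Disallow rule matches this path.
    print(demo_parser.is_allowed("mybot", "http://example.com/docs/report.html"))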

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: kurtjx
# date  : Apr 12, 2011

Initial URL


Initial Description


Initial Title
robot exclusion rules parser

Initial Tags


Initial Language
Python