robot exclusion rules parser


Published in: Python
"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting (an example
settings sketch follows the snippet). This version uses the improved
robotexclusionrulesparser, which can handle GYM2008 wildcards and the like.

Also note that this middleware implicitly assumes one spider will crawl one
domain with one robots.txt file. This may or may not be true for your
application. With this approach, the robots.txt file is downloaded only once
per spider type, and fewer requests that violate robots.txt are issued.
"""

from robotexclusionrulesparser import RobotExclusionRulesParser

from scrapy.xlib.pydispatch import dispatcher

from scrapy import signals, log
from scrapy.project import crawler
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.conf import settings


class CreepyRobotsTxt(object):
    # High priority so robots.txt requests jump ahead of regular page requests.
    DOWNLOAD_PRIORITY = 1000

    def __init__(self):
        if not settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self._parsers = {}          # netloc -> parser (None while still downloading)
        self._spider_netlocs = {}   # spider name -> set of netlocs seen
        self._useragents = {}       # spider name -> USER_AGENT string
        dispatcher.connect(self.spider_opened, signals.spider_opened)

    def process_request(self, request, spider):
        if spider.settings.getbool('ROBOTSTXT_OBEY'):
            useragent = self._useragents[spider.name]
            rp = self.robot_parser(request, spider)
            if rp and not rp.is_allowed(useragent, request.url):
                log.msg("Forbidden by robots.txt: %s" % request,
                        level=log.ERROR, spider=spider)
                raise IgnoreRequest

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc
        if netloc not in self._parsers:
            # Schedule the robots.txt download; until it arrives the parser is
            # None, so requests to this netloc pass through unchecked.
            self._parsers[netloc] = None
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
            robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
            dfd = crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots)
            self._spider_netlocs[spider.name].add(netloc)
        return self._parsers[netloc]

    def _parse_robots(self, response):
        rp = RobotExclusionRulesParser()
        rp.parse(response.body)
        self._parsers[urlparse_cached(response).netloc] = rp

    def spider_opened(self, spider):
        if spider.name not in self._spider_netlocs:
            self._spider_netlocs[spider.name] = set()
        self._useragents[spider.name] = spider.settings['USER_AGENT']


# Snippet imported from snippets.scrapy.org (which no longer works)
# author: kurtjx
# date: Apr 12, 2011
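As a usage note, "enabling this middleware" means adding it to DOWNLOADER_MIDDLEWARES and turning ROBOTSTXT_OBEY on. A minimal settings.py sketch follows; the module path "myproject.middlewares", the priority value, and the exact import path of the stock middleware (which varies by Scrapy version) are assumptions, not part of the original snippet:

# settings.py (sketch only; "myproject.middlewares" is a placeholder module path)
ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's stock robots.txt middleware so the two do not overlap;
    # the exact import path depends on your Scrapy version.
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': None,
    # Enable the snippet's middleware (the priority value 100 is arbitrary).
    'myproject.middlewares.CreepyRobotsTxt': 100,
}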
  81.  
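To illustrate the GYM2008 support the docstring refers to, here is a small standalone sketch of RobotExclusionRulesParser applied to a wildcard rule; the user agent string, URLs, and robots.txt content are made up for illustration:

from robotexclusionrulesparser import RobotExclusionRulesParser

# A fictional robots.txt using GYM2008 extensions: '*' matches any run of
# characters and '$' anchors the end of the path.
robots_txt = """
User-agent: *
Disallow: /private/*.html$
"""

rp = RobotExclusionRulesParser()
rp.parse(robots_txt)

print(rp.is_allowed("MyBot", "http://example.com/private/secret.html"))  # False
print(rp.is_allowed("MyBot", "http://example.com/public/page.html"))     # True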
