robot exclusion rules parser


Published in: Python
"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting (an example
settings sketch follows the snippet). This version uses the improved
robotexclusionrulesparser, which can handle GYM2008 wildcards and the like.

Also note that this middleware implicitly assumes one spider will crawl one
domain with one robots.txt file. This may or may not be true for your
application. With this approach, the robots.txt file is downloaded only once
per spider type, and fewer requests that violate robots.txt are issued.
"""

from robotexclusionrulesparser import RobotExclusionRulesParser

from scrapy.xlib.pydispatch import dispatcher

from scrapy import signals, log
from scrapy.project import crawler
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.conf import settings


class CreepyRobotsTxt(object):
    # High priority so robots.txt requests jump ahead of regular page requests.
    DOWNLOAD_PRIORITY = 1000

    def __init__(self):
        if not settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self._parsers = {}          # netloc -> parser (None while still downloading)
        self._spider_netlocs = {}   # spider name -> set of netlocs seen
        self._useragents = {}       # spider name -> USER_AGENT string
        dispatcher.connect(self.spider_opened, signals.spider_opened)

    def process_request(self, request, spider):
        if spider.settings.getbool('ROBOTSTXT_OBEY'):
            useragent = self._useragents[spider.name]
            rp = self.robot_parser(request, spider)
            if rp and not rp.is_allowed(useragent, request.url):
                log.msg("Forbidden by robots.txt: %s" % request,
                        level=log.ERROR, spider=spider)
                raise IgnoreRequest

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc
        if netloc not in self._parsers:
            # Schedule the robots.txt download; until it arrives the parser is
            # None, so requests to this netloc pass through unchecked.
            self._parsers[netloc] = None
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
            robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
            dfd = crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots)
            self._spider_netlocs[spider.name].add(netloc)
        return self._parsers[netloc]

    def _parse_robots(self, response):
        rp = RobotExclusionRulesParser()
        rp.parse(response.body)
        self._parsers[urlparse_cached(response).netloc] = rp

    def spider_opened(self, spider):
        if spider.name not in self._spider_netlocs:
            self._spider_netlocs[spider.name] = set()
        self._useragents[spider.name] = spider.settings['USER_AGENT']


# Snippet imported from snippets.scrapy.org (which no longer works)
# author: kurtjx
# date: Apr 12, 2011
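As a usage note, "enabling this middleware" means adding it to DOWNLOADER_MIDDLEWARES and turning ROBOTSTXT_OBEY on. A minimal settings.py sketch follows; the module path "myproject.middlewares", the priority value, and the exact import path of the stock middleware (which varies by Scrapy version) are assumptions, not part of the original snippet:

# settings.py (sketch only; "myproject.middlewares" is a placeholder module path)
ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's stock robots.txt middleware so the two do not overlap;
    # the exact import path depends on your Scrapy version.
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': None,
    # Enable the snippet's middleware (the priority value 100 is arbitrary).
    'myproject.middlewares.CreepyRobotsTxt': 100,
}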
  81.  
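To illustrate the GYM2008 support the docstring refers to, here is a small standalone sketch of RobotExclusionRulesParser applied to a wildcard rule; the user agent string, URLs, and robots.txt content are made up for illustration:

from robotexclusionrulesparser import RobotExclusionRulesParser

# A fictional robots.txt using GYM2008 extensions: '*' matches any run of
# characters and '$' anchors the end of the path.
robots_txt = """
User-agent: *
Disallow: /private/*.html$
"""

rp = RobotExclusionRulesParser()
rp.parse(robots_txt)

print(rp.is_allowed("MyBot", "http://example.com/private/secret.html"))  # False
print(rp.is_allowed("MyBot", "http://example.com/public/page.html"))     # True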
