CrawlPathMiddleware: easily control the crawl path of a spider


Published in: Python

# This is a piece of SpiderMiddleware to control the crawl path of a CrawlSpider-like spider. It works by dropping
# some links extracted from certain pages. The idea is to get a bit more fine-grained control than using
# LinkExtractors alone, because with this middleware you can set different rules to be applied on different pages.
#
# You do this with a series of regular expressions which define the path that the spider is allowed to take.
# Specifically, you put a list of tuples in your settings.py file of the form:
# (fromUrlPattern, [allowedToURLs], [denyToURLs]).
#
# The allow / deny mechanism works similarly to the one Scrapy uses in LinkExtractors. Each tuple says something along
# the lines of:
#
#     If a link was gathered on a page matching the regex in fromUrlPattern,
#     keep it as long as the link's target URL matches at least one pattern in [allowedToURLs],
#     unless the link's target URL also matches at least one pattern in [denyToURLs].
#
# Each 'from' page is handled by the first tuple whose fromUrlPattern matches its URL.
# If no tuple matches the URL of the 'from' page, CrawlPathMiddleware ignores that page and doesn't drop any of
# its extracted links.
#
# If you leave [allowedToURLs] as either '' or [], it allows all URLs. This is the same as passing [r'.*'] and is
# useful if you want to have a deny rule without an allow rule.
#
# If you leave [allowedToURLs] as None, it doesn't allow any URLs. This is the same as passing something like [r'a^']
# and is useful if you want to designate a certain page as a dead end.
#
# If you leave [denyToURLs] as either '', [], or None, it doesn't deny any URLs. This is the same as passing something
# like [r'a^'] and is useful if you want to have an allow rule without a deny rule.
#
# You can also provide a string containing a single regex instead of a list of regexes for [allowedToURLs] or
# [denyToURLs]. For example, [r'.*my_regex.*'] and r'.*my_regex.*' do the same thing for [allowedToURLs] and
# [denyToURLs].
#
# See the settings.py code below for examples.

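# An illustrative, standalone sketch (not from the original snippet) of the allow/deny
# check described above. The URL patterns and URLs below are made up; the function
# simply mirrors the per-link decision the middleware makes: the tuple must match the
# 'from' page, then the link must match an allow pattern and no deny pattern.

import re

# hypothetical tuple: links gathered on /category/ pages may go to /product/ pages,
# except discontinued products
fromUrlPattern = r'.*/category/.*'
allowedToURLs = [r'.*/product/.*']
denyToURLs = [r'.*/product/discontinued/.*']

def keep_link(from_url, to_url):
    """Return True if a link from from_url to to_url survives this tuple."""
    if not re.match(fromUrlPattern, from_url):
        return True   # tuple doesn't apply; the middleware leaves the link alone
    if not any(re.match(p, to_url) for p in allowedToURLs):
        return False  # failed the allow check
    if any(re.match(p, to_url) for p in denyToURLs):
        return False  # matched a deny pattern
    return True

print(keep_link('http://example.com/category/5', 'http://example.com/product/42'))              # True
print(keep_link('http://example.com/category/5', 'http://example.com/product/discontinued/7'))  # False
print(keep_link('http://example.com/category/5', 'http://example.com/about'))                   # False
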
=======================
===== settings.py =====
=======================

SPIDER_MIDDLEWARES = {
    'path.to.crawlpath_module.CrawlPathMiddleware': 550,
}

# regular expression patterns for defining url types in shorthand
type1 = r'.*this_regex_uniquely_selects_urls_from_page_type_1.*'
type2 = r'.*this_regex_uniquely_selects_urls_from_page_type_2.*'
type3 = r'.*this_regex_uniquely_selects_urls_from_page_type_3.*'
type4 = r'.*this_regex_uniquely_selects_urls_from_page_type_4.*'

# list of crawl paths for CrawlPathMiddleware
# each crawl path is defined by a tuple comprised of three elements:
#   fromUrlPattern:  a single regular expression (regex) defining pages where this crawl path applies;
#                    that is, fromUrlPattern identifies candidate 'from' pages
#   [allowPatterns]: regex or list of regexes defining urls the spider can crawl to from this 'from' page
#   [denyPatterns]:  regex or list of regexes defining urls the spider can't crawl to from this 'from' page
PATH_TUPLES = [
    # these should take the form:
    # (fromUrlPattern, [allowPatterns], [denyPatterns])

    # type1 pages can only go to type2 or type3 pages
    (type1, [type2, type3], ''),

    # type2 pages can only go to type2 pages which don't match either of the bad patterns
    (type2, type2, [r'.*bad_pattern_1.*', r'.*bad_pattern_2.*']),

    # type3 pages can go anywhere except type1 pages
    (type3, '', type1),

    # type4 pages can't go anywhere
    (type4, None, ''),
]

# If you set PATH_DEBUG to True, CrawlPathMiddleware will log information about
# which links were allowed / denied and why
PATH_DEBUG = True

# This setting controls how wide the printed URLs should be in the logged DEBUG statements.
# This number is only about making the log pretty and readable. Adjust it as you like.
PATH_DEBUG_URL_LENGTH = 95


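==========================
===== example spider =====
==========================

# Not part of the original snippet: a minimal sketch of how a CrawlSpider might sit
# alongside this middleware (the spider name, domain, and callback are made up). The
# middleware needs no spider-side changes; it only drops links after the spider's
# LinkExtractor rules have produced them. Imports match the Scrapy releases of the
# snippet's era (2012).

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    # one deliberately broad rule; the per-page allow/deny decisions are left to
    # CrawlPathMiddleware and the PATH_TUPLES setting above
    rules = (
        Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.log('visited %s' % response.url)

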
========================
===== crawlpath.py =====
========================

# (See the comment block at the top of this snippet for a full description of how
# the middleware and its PATH_TUPLES setting work.)

from scrapy.http.request import Request
import re
from scrapy import log


class CrawlPathMiddleware(object):
    """SpiderMiddleware to shape the crawl path of a CrawlSpider-like spider using PATH_TUPLES defined in settings.py"""

    def __init__(self, path_tuples, debug, debug_url_length):
        self.DEBUG = debug
        self.DEBUG_URL_LENGTH = debug_url_length
        self.FIRST_MSG_LEN = len("Didn't match 'allow' patterns for") + 2  # not set in any setting

        # path_tuples gets split up into three separate variables
        # self.fromPats is a list of patterns to match fromUrls
        # self.allowPatGroups is a list of lists of allow patterns
        # self.denyPatGroups is a list of lists of deny patterns
        self.fromPats = [re.compile(t[0]) for t in path_tuples]  # patterns to match against each fromURL
        allowPatGroups = [t[1] for t in path_tuples]              # allow patterns to match against each toURL
        denyPatGroups = [t[2] for t in path_tuples]               # deny patterns to match against each toURL

        # process and compile allow patterns
        self.allowPatGroups = []
        for pat_group in allowPatGroups:
            if pat_group == '' or pat_group == []:
                # blank allowPats ==> allow everything
                # '.*' matches any toUrl, so every link passes the allow check
                pat_group = '.*'
            elif pat_group is None:
                # None allowPats ==> match nothing
                pat_group = 'a^'

            # compile all patterns in the group
            if isinstance(pat_group, (str, unicode)):
                pats_compiled = [re.compile(pat_group)]
            else:
                pats_compiled = [re.compile(pat) for pat in pat_group]
            self.allowPatGroups.append(pats_compiled)

        # process and compile deny patterns
        self.denyPatGroups = []
        for pat_group in denyPatGroups:
            # blank or None denyPats ==> deny nothing
            if pat_group == '' or pat_group == [] or pat_group is None:
                # this compiles without a problem and won't match anything: it tries to match 'a' before the line start
                pat_group = r'a^'

            # compile all patterns in the group
            if isinstance(pat_group, (str, unicode)):
                pats_compiled = [re.compile(pat_group)]
            else:
                pats_compiled = [re.compile(pat) for pat in pat_group]
            self.denyPatGroups.append(pats_compiled)

    @classmethod
    def from_settings(cls, settings):
        path_tuples = settings.getlist('PATH_TUPLES')
        debug = settings.getbool('PATH_DEBUG', default=False)
        debug_url_length = settings.getint('PATH_DEBUG_URL_LENGTH', default=90)
        return cls(path_tuples, debug, debug_url_length)

    def _firstIndex(self, myIterable):
        """find the index of the first element in myIterable which evaluates to True"""
        for (counter, option) in enumerate(myIterable):
            if option:
                return counter
        return None

    def log(self, message, spider, level=log.DEBUG):
        """Log the given message at the given log level. Stolen from BaseSpider."""
        # prepend the name of this class to the message
        message = '[' + self.__class__.__name__ + '] ' + message
        log.msg(message, spider=spider, level=level)

    def process_spider_output(self, response, result, spider):
        fromUrl = response.url

        # figure out which tuple should handle links from this fromUrl
        fromMatches = [re.match(p, fromUrl) for p in self.fromPats]
        tupleIndex = self._firstIndex(fromMatches)

        if tupleIndex is None:
            # fromUrl didn't match any pattern in fromPats. don't change anything.
            if self.DEBUG:
                self.log('No matching fromUrl pattern for'.ljust(self.FIRST_MSG_LEN) +
                         fromUrl.ljust(self.DEBUG_URL_LENGTH), spider=spider)
            for r in result:
                yield r
        else:
            # get the allow and deny patterns from the proper tuple
            allowPats = self.allowPatGroups[tupleIndex]
            denyPats = self.denyPatGroups[tupleIndex]

            # check each result element against the allow and deny patterns for the appropriate tuple
            for r in result:
                if isinstance(r, Request):
                    toUrl = r.url

                    allowMatches = [re.match(p, toUrl) for p in allowPats]
                    if any(allowMatches):
                        denyMatches = [re.match(p, toUrl) for p in denyPats]
                        if not any(denyMatches):
                            # toUrl matched an allow pattern and no deny patterns. allow it to pass.
                            if self.DEBUG:
                                self.log('All ok for'.ljust(self.FIRST_MSG_LEN) +
                                         fromUrl.ljust(self.DEBUG_URL_LENGTH) + ' linking to'.ljust(14) +
                                         toUrl, spider=spider)
                            yield r
                        else:
                            # toUrl matched a deny pattern. drop it.
                            if self.DEBUG:
                                self.log('Matched deny for'.ljust(self.FIRST_MSG_LEN) +
                                         fromUrl.ljust(self.DEBUG_URL_LENGTH) + ' linking to'.ljust(14) +
                                         toUrl, spider=spider)
                            yield None
                    else:
                        # toUrl didn't match any of the allow patterns. drop it.
                        if self.DEBUG:
                            self.log("Didn't match 'allow' patterns for".ljust(self.FIRST_MSG_LEN) +
                                     fromUrl.ljust(self.DEBUG_URL_LENGTH) + ' linking to'.ljust(14) +
                                     toUrl, spider=spider)
                        yield None
                else:
                    # r is an Item. allow it to pass.
                    yield r

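# A quick, illustrative standalone check of the matching logic (not part of the
# original snippet). The patterns and URLs are made up; DEBUG is off, so no spider
# object is needed because self.log is never called.
if __name__ == '__main__':
    from scrapy.http import Request, Response

    path_tuples = [(r'.*/category/.*', r'.*/product/.*', r'.*/product/discontinued/.*')]
    mw = CrawlPathMiddleware(path_tuples, debug=False, debug_url_length=95)

    response = Response(url='http://example.com/category/5')
    links = [Request(url='http://example.com/product/42'),
             Request(url='http://example.com/product/discontinued/7'),
             Request(url='http://example.com/about')]

    kept = [r for r in mw.process_spider_output(response, links, spider=None) if r is not None]
    print([r.url for r in kept])  # only http://example.com/product/42 survives
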
# Snippet imported from snippets.scrapy.org (which no longer works)
# author: kevinbache
# date:   May 11, 2012
