Downloader middleware to redirect to rel=canonical URLs


/ Published in: Python
Save to your folder(s)



Copy this code and paste it in your HTML
  1. # A downloader middleware automatically to redirect pages containing a rel=canonical in their contents to the canonical url (if the page itself is not the canonical one),
  2.  
  3. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  4. from scrapy.utils.url import url_is_from_spider
  5. from scrapy.http import HtmlResponse
  6. from scrapy import log
  7.  
  8. class RelCanonicalMiddleware(object):
  9. _extractor = SgmlLinkExtractor(restrict_xpaths=['//head/link[@rel="canonical"]'], tags=['link'], attrs=['href'])
  10.  
  11. def process_response(self, request, response, spider):
  12. if isinstance(response, HtmlResponse) and response.body and getattr(spider, 'follow_canonical_links', False):
  13. rel_canonical = self._extractor.extract_links(response)
  14. if rel_canonical:
  15. rel_canonical = rel_canonical[0].url
  16. if rel_canonical != request.url and url_is_from_spider(rel_canonical, spider):
  17. log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (rel_canonical, request), level=log.DEBUG, spider=spider)
  18. return request.replace(url=rel_canonical, callback=lambda r: r if r.status == 200 else response)
  19. return response
  20.  
  21. # Snippet imported from snippets.scrapy.org (which no longer works)
  22. # author: pablo
  23. # date : Aug 27, 2010
  24.  

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to log in to post a comment.