Avoid downloading pages which exceed a certain size


Published in: Python
Save to your folder(s)



Copy this code and paste it in your HTML
  1. # This snippet was taken from the old wiki.
  2. #
  3. # You can do this by overriding the Scrapy HTTP Client Factory, with the following (undocumented) setting:
  4. #
  5. # DOWNLOADER_HTTPCLIENTFACTORY = 'myproject.downloader.LimitSizeHTTPClientFactory'
  6. #
  7.  
  8. MAX_RESPONSE_SIZE = 1048576 # 1Mb
  9.  
  10. from scrapy.core.downloader.webclient import ScrapyHTTPClientFactory, ScrapyHTTPPageGetter
  11.  
  12. class LimitSizePageGetter(ScrapyHTTPPageGetter):
  13.  
  14. def handleHeader(self, key, value):
  15. ScrapyHTTPPageGetter.handleHeader(self, key, value)
  16. if key.lower() == 'content-length' and int(value) > MAX_RESPONSE_SIZE:
  17. self.connectionLost('oversized')
  18.  
  19. class LimitSizeHTTPClientFactory(ScrapyHTTPClientFactory):
  20.  
  21. protocol = LimitSizePageGetter
  22.  
  23. # Snippet imported from snippets.scrapy.org (which no longer works)
  24. # author: pablo
  25. # date : Sep 16, 2011
  26.  

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.