Using the Scrapy crawler with a blocking API from a thread


Published in: Python
# This script shows how you can use the Scrapy crawler from a thread, simulating a blocking API:
#
# The following example shows how you can interact with the crawler in a
# blocking fashion, to run two spiders: one that scrapes 15 items and another
# that scrapes 50 items. If IPython is installed, its console is used instead
# of the standard Python console.
#
# For more information see [Twisted Threads](http://twistedmatrix.com/documents/current/core/howto/threading.html)
#
# $ python this_script.py
# [ ... Scrapy initialization log here ... ]
#
# In [1]: items = crawler.crawl('somespider')
# [ ... somespider log here ... ]
#
# In [2]: len(items)
# Out[2]: 15
#
# In [3]: items2 = crawler.crawl('otherspider')
# [ ... otherspider log here ... ]
#
# In [4]: len(items2)
# Out[4]: 50
#
# In [5]: ^D
# [ ... Scrapy termination log here ... ]
# $

from twisted.internet import defer, reactor, threads

from scrapy import log, signals
from scrapy.utils.console import start_python_console
from scrapy.xlib.pydispatch import dispatcher
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess

class BlockingCrawlerFromThread(object):

    def __init__(self, crawler):
        self.crawler = crawler
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _crawl(self, spider_name):
        # Runs in the reactor thread: queue the spider and return a Deferred
        # that will fire with the scraped items once the spider closes.
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.items = []
            self.crawler.queue.append_spider(spider)
            self.deferred = defer.Deferred()
            return self.deferred

    def _item_passed(self, item):
        self.items.append(item)

    def _spider_closed(self, spider):
        self.deferred.callback(self.items)

    def crawl(self, spider_name):
        # Called from the console thread: blocks until the Deferred returned
        # by _crawl() fires in the reactor thread.
        return threads.blockingCallFromThread(reactor, self._crawl, spider_name)

log.start()
# Keep the execution queue alive between spiders, so the crawler doesn't
# shut down after the first spider finishes.
settings.overrides['QUEUE_CLASS'] = 'scrapy.core.queue.KeepAliveExecutionQueue'
crawler = CrawlerProcess(settings)
crawler.install()
crawler.configure()
blocking_crawler = BlockingCrawlerFromThread(crawler)
# Run the interactive console in a separate thread, stopping the crawler when
# the console exits; the reactor runs in the main thread.
d = threads.deferToThread(start_python_console, {'crawler': blocking_crawler})
d.addBoth(lambda x: crawler.stop())
crawler.start()

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 26, 2010

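The pattern hinges on the two Twisted thread utilities documented at the link above: threads.blockingCallFromThread(reactor, f, ...) runs f in the reactor thread and blocks the calling thread until the result is available (when f returns a Deferred, as _crawl does, it waits for the Deferred to fire, which is what makes crawl() block until spider_closed), while threads.deferToThread(f, ...) runs a function in Twisted's thread pool so it can block without freezing the reactor. Here is a minimal, Scrapy-independent sketch of that same pattern; the compute and worker names are illustrative, not part of the snippet:

from twisted.internet import reactor, threads

def compute(x):
    # Runs in the reactor thread.
    return x * 2

def worker():
    # Runs in a thread-pool thread (like the console above); blocks here
    # until compute() has run in the reactor thread and returned.
    result = threads.blockingCallFromThread(reactor, compute, 21)
    print('result:', result)  # result: 42
    reactor.callFromThread(reactor.stop)

threads.deferToThread(worker)  # plays the role of start_python_console above
reactor.run()                  # plays the role of crawler.start() above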
