Revision: 59340
at September 1, 2012 07:15 by scrapy
Initial Code
# This script shows how you can use the Scrapy crawler from a thread, simulating a blocking API.
#
# The following example shows how you can interact with the crawler in a
# blocking fashion, to run two spiders: one that scrapes 15 items, then
# another that scrapes 50 items. If IPython is installed, its console is
# used instead of the standard Python console.
#
# For more information see [Twisted Threads](http://twistedmatrix.com/documents/current/core/howto/threading.html)
#
# $ python this_script.py
# [ ... Scrapy initialization log here ... ]
#
# In [1]: items = crawler.crawl('somespider')
# [ ... somespider log here ... ]
#
# In [2]: len(items)
# Out[2]: 15
#
# In [3]: items2 = crawler.crawl('otherspider')
# [ ... otherspider log here ... ]
#
# In [4]: len(items2)
# Out[4]: 50
#
# In [5]: ^D
# [ ... Scrapy termination log here ... ]
# $

from twisted.internet import reactor, defer, threads

from scrapy import log, signals
from scrapy.utils.console import start_python_console
from scrapy.xlib.pydispatch import dispatcher
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess


class BlockingCrawlerFromThread(object):
    """Drives the crawler synchronously from a non-reactor thread."""

    def __init__(self, crawler):
        self.crawler = crawler
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _crawl(self, spider_name):
        # Runs in the reactor thread: queue the spider and return a Deferred
        # that fires with the scraped items once the spider closes
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.items = []
            self.crawler.queue.append_spider(spider)
            self.deferred = defer.Deferred()
            return self.deferred

    def _item_passed(self, item):
        self.items.append(item)

    def _spider_closed(self, spider):
        self.deferred.callback(self.items)

    def crawl(self, spider_name):
        # Called from the console thread: blocks until the Deferred fires
        return threads.blockingCallFromThread(reactor, self._crawl, spider_name)


log.start()
# Keep the execution queue alive so more spiders can be queued after the first one finishes
settings.overrides['QUEUE_CLASS'] = 'scrapy.core.queue.KeepAliveExecutionQueue'
crawler = CrawlerProcess(settings)
crawler.install()
crawler.configure()
blocking_crawler = BlockingCrawlerFromThread(crawler)

# Run the interactive console in a worker thread; stop the crawler when it exits
d = threads.deferToThread(start_python_console, {'crawler': blocking_crawler})
d.addBoth(lambda x: crawler.stop())
crawler.start()

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date  : Aug 26, 2010
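
The key building block above is twisted.internet.threads.blockingCallFromThread, which lets a non-reactor thread schedule a Deferred-returning function on the reactor and block until that Deferred fires. Below is a minimal, standalone sketch of the same pattern with plain Twisted and no Scrapy involved; the function names delayed_result and worker are illustrative, not part of the original snippet, and it should be run as its own script.

from twisted.internet import reactor, defer, threads

def delayed_result():
    # Runs in the reactor thread; returns a Deferred that fires after 1 second
    d = defer.Deferred()
    reactor.callLater(1, d.callback, 'done')
    return d

def worker():
    # Runs in a worker thread; blocks here until the Deferred above fires
    result = threads.blockingCallFromThread(reactor, delayed_result)
    print('got: %s' % result)
    reactor.callFromThread(reactor.stop)  # thread-safe way to stop the reactor

reactor.callInThread(worker)
reactor.run()

The crawl() method of BlockingCrawlerFromThread is exactly this pattern, with _crawl as the Deferred-returning function that runs on the reactor.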
Initial URL
Initial Description
Initial Title
Using the Scrapy crawler with a blocking API from a thread
Initial Tags
api
Initial Language
Python