使用以下导入:
from twisted.internet import reactor from scrapy.crawler import CrawlerProcess
我一直能够成功运行下面这段代码:
# Build a crawler process configured from the project settings and schedule
# the crawl with the caller-supplied arguments.
process = CrawlerProcess(get_project_settings())
process.crawl(*args)
# the script will block here until the crawling is finished
process.start()
但自从我把这段代码移到一个 web_crawler(self) 方法中之后,如下所示:
web_crawler(self)
def web_crawler(self):
    """Run one crawl to completion and return ``(result1, result2)``.

    ``process.start()`` blocks until the crawl has finished. How the two
    results are computed is elided in this snippet (see the ``# (...)``
    marker below).
    """
    # set up a crawler
    process = CrawlerProcess(get_project_settings())
    process.crawl(*args)
    # the script will block here until the crawling is finished
    process.start()
    # (...)
    return (result1, result2)
并开始通过类的实例来调用该方法,例如:
def __call__(self):
    # NOTE: web_crawler() is invoked twice here, so the crawl is started
    # twice within the same Python process.
    results1 = test.web_crawler()[1]
    results2 = test.web_crawler()[0]
并运行:
test()
我收到以下错误:
Traceback (most recent call last): File "test.py", line 573, in <module> print (test()) File "test.py", line 530, in __call__ artists = test.web_crawler() File "test.py", line 438, in web_crawler process.start() File "/Library/Python/2.7/site-packages/scrapy/crawler.py", line 280, in start reactor.run(installSignalHandlers=False) # blocking call File "/Library/Python/2.7/site-packages/twisted/internet/base.py", line 1194, in run self.startRunning(installSignalHandlers=installSignalHandlers) File "/Library/Python/2.7/site-packages/twisted/internet/base.py", line 1174, in startRunning ReactorBase.startRunning(self) File "/Library/Python/2.7/site-packages/twisted/internet/base.py", line 684, in startRunning raise error.ReactorNotRestartable() twisted.internet.error.ReactorNotRestartable
问题出在哪里?
Twisted 的 reactor 一旦停止就无法重新启动,但可以通过 fork 出一个单独的进程来运行每次爬取,从而让爬虫运行多次:
import scrapy
import scrapy.crawler as crawler
from multiprocessing import Process, Queue
from twisted.internet import reactor


# your spider
class QuotesSpider(scrapy.Spider):
    """Example spider that prints the text of each quote it finds."""

    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        """Print the text of every ``div.quote`` element in *response*."""
        for quote in response.css('div.quote'):
            print(quote.css('span.text::text').extract_first())


def _crawl_in_child(spider, q):
    """Run one crawl of *spider* in the current process and report via *q*.

    Puts ``None`` on *q* when the crawl succeeded, otherwise the exception
    instance that made it fail.

    This lives at module level (rather than nested inside ``run_spider``) so
    that it can be pickled when multiprocessing uses the "spawn" start
    method; a nested function only works with "fork".
    """
    try:
        failures = []

        def finish(_):
            # The deferred can fire before reactor.run() is reached; calling
            # reactor.stop() on a reactor that is not running would raise.
            if reactor.running:
                reactor.stop()

        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(spider)
        # Remember a failed crawl instead of silently discarding it.
        deferred.addErrback(failures.append)
        deferred.addBoth(finish)
        # If the deferred already fired, nothing would ever stop the reactor,
        # so starting it would block forever.
        if not deferred.called:
            reactor.run()
        if failures:
            failures[0].raiseException()
        q.put(None)
    except Exception as e:
        # Process boundary: hand the error to the parent, which re-raises it.
        q.put(e)


# the wrapper to make it run more times
def run_spider(spider):
    """Crawl *spider* in a fresh child process and wait for it to finish.

    Each call gets its own process and therefore its own Twisted reactor,
    which is what allows crawling more than once from a single script.

    When multiprocessing uses the "spawn" start method, the calling script
    must run under ``if __name__ == "__main__":`` and *spider* must be
    importable at module level.

    Raises:
        Exception: whatever exception the crawl raised in the child process.
    """
    q = Queue()
    p = Process(target=_crawl_in_child, args=(spider, q))
    p.start()
    # Read the result before joining, as the multiprocessing docs advise.
    result = q.get()
    p.join()

    if result is not None:
        raise result
运行两次:
# Guard the entry point: with the multiprocessing "spawn" start method the
# child process re-imports this module, and unguarded top-level code would
# run again inside every child.
if __name__ == "__main__":
    print('first run:')
    run_spider(QuotesSpider)

    print('\nsecond run:')
    run_spider(QuotesSpider)
结果:
first run: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.” “A day without sunshine is like, you know, night.” ... second run: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.” “A day without sunshine is like, you know, night.” ...