The following 13 code examples, extracted from open-source Python projects, illustrate how to use scrapy.crawler.CrawlerRunner().
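
Before the project examples, here is a minimal sketch of the usual CrawlerRunner pattern they all follow: create the runner, schedule one or more crawls, and drive the Twisted reactor yourself. The helper name run_single_spider and its spider_cls parameter are placeholders for illustration, not taken from any of the projects below.

    # Minimal sketch: run one spider with CrawlerRunner inside an
    # externally managed Twisted reactor, stopping the reactor when done.
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    def run_single_spider(spider_cls):
        configure_logging()                      # install Scrapy's logging handlers
        runner = CrawlerRunner(get_project_settings())
        d = runner.crawl(spider_cls)             # returns a Deferred for this crawl
        d.addBoth(lambda _: reactor.stop())      # stop the reactor on success or failure
        reactor.run()                            # block until the crawl finishes

Unlike CrawlerProcess, CrawlerRunner does not start or stop the reactor for you, which is why every example below ends with an explicit reactor.run().
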
def launch_crawlers(crawler_class, exclusion=None):
    settings = get_settings()
    configure_logging(settings=settings)
    launcher = CrawlerRunner(settings)
    crawlers = launcher.spider_loader.list()
    crawlers = [c for c in crawlers if crawler_class in c]
    if exclusion:
        for c in settings.get(exclusion, []):
            crawlers.remove(c)
    try:
        for crawler in crawlers:
            launcher.crawl(crawler)
        d = launcher.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
        return True
    except Exception as e:
        launch_logger.error('Failed to launch crawlers | exception:\n{excep}'
                            .format(excep=e))
        return False

def run():
    configure_logging()
    # import project settings for further usage,
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # run spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last crawl is finished

def crawl(args):
    spids = args.get('spiders')
    configure_logging(SETTINGS, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(SETTINGS)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [loader.load(_)
               for _ in filter(lambda __: __ in loader.list(), spids)]
    if not spiders:
        return False
    random.shuffle(spiders)
    for __ in spiders:
        runner.crawl(__)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl reactor starting ...')
    reactor.run()
    logger.info('crawl reactor stopped')

def make_crawler(spider_cls=ATestBaseSpider, **extra_settings):
    # clean up the Redis queue before starting the spider
    assert spider_cls.name.startswith('test_'), 'pass a special test spider'
    redis_server = redis.from_url('redis://localhost')
    name = spider_cls.name
    redis_server.delete(
        SCHEDULER_DUPEFILTER_KEY % {'spider': name},
        *redis_server.keys(
            SCHEDULER_QUEUE_KEY % {'spider': name} + '*'))
    settings = Settings()
    settings.setmodule(dd_crawler.settings)
    settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls)

def test_spider(setting):
    setting = setting.copy()
    spid = str(uuid.uuid4())
    setting['_id'] = spid
    try:
        cls = SpiderFactory.mkspider(setting)
    except SpiderFactoryException as e:
        logger.error('Error in test_spider SpiderFactory[%s]', e)
        return False
    url = SETTINGS['TEMP_SPIDER_STATS_URL']
    TEST_SETTINGS = {
        'EXTENSIONS': {
            'mydm.extensions.ExtensionStats': 900,
            'scrapy.extensions.logstats.LogStats': None,
            'scrapy.extensions.spiderstate.SpiderState': None,
            'scrapy.extensions.telnet.TelnetConsole': None,
        },
        'SPIDER_STATS_URL': url,
        'BOT_NAME': 'TestSpider',
        'WEBSERVICE_ENABLED': False,
        'TELNETCONSOLE_ENABLED': False,
        'LOG_LEVEL': 'INFO',
        'LOG_FORMAT': '%(asctime)s-%(levelname)s: %(message)s',
        'LOG_DATEFORMAT': '%Y-%m-%d %H:%M:%S'
    }
    configure_logging(TEST_SETTINGS, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(TEST_SETTINGS)
    d = runner.crawl(cls)
    d.addBoth(lambda _: reactor.stop())
    logger.info('test_spider reactor starting ...')
    reactor.run()
    logger.info('test_spider reactor stopped')
    stats = get_stats(url, [spid])
    n = stats[spid]
    return n > 0

def run(cls):
    runner = CrawlerRunner(get_project_settings())

    @defer.inlineCallbacks
    def deferred_crawl():
        for spider, args, kwargs in cls.queue:
            try:
                yield runner.crawl(spider, *args, **kwargs)
            except KeyError as err:
                # Log a warning if the scraper name is invalid instead of
                # causing the job to fail.
                # NOTE: If there is any other type of error, the job will
                # fail, and all the jobs that depend on it will fail as
                # well.
                logger.warning(err.args[0])
        # XXX: If all the names fail, then trying to run
        # `reactor.stop()` will give an "Unhandled error in
        # Deferred" complaint and hang. It will also hang in
        # general if no spiders have been run. I assume there's
        # some twisted-way to handle this, but for now, just log an
        # error.
        if reactor.running:
            reactor.stop()
        else:
            logger.critical("LocalQueue: No valid scraper names found.")

    deferred_crawl()
    reactor.run()

def make_crawler(**extra_settings):
    settings = Settings()
    settings['ITEM_PIPELINES'] = {
        'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
        'tests.utils.CollectorPipeline': 100,
    }
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(Spider)

def make_crawler(settings, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(BaseSpider)

def run(self, args, opts):
    conn = redis.Redis(decode_responses=True)
    runner = CrawlerRunner(get_project_settings())
    try:
        rules = Rule.loads()
        if not rules:
            raise ValueError
    except ValueError:
        print('Error in loading Redis rules, fallback to CSV rules')
        rules = Rule.loads('csv')
    for rule in rules:
        rule.save()
        if rule.name in self.excludes:
            continue
        if conn.hget('Rule:' + rule.name, 'status') == 'started':
            d = runner.crawl(ProxySpider, rule)
            # Mark the rule as finished once its crawler completes;
            # bind rule.name now to avoid late binding inside the loop
            d.addBoth(lambda _, name=rule.name: conn.hset(
                'Rule:' + name, 'status', 'finished'))
    rule_maintainer = RuleMaintainer(conn, runner)
    proxy_maintainer = ProxyMaintainer(conn)
    schedule_maintainer = ScheduleMaintainer(conn)
    lc = task.LoopingCall(rule_maintainer)
    lc.start(1)
    lc = task.LoopingCall(proxy_maintainer)
    lc.start(0.5)
    lc = task.LoopingCall(schedule_maintainer)
    lc.start(10)
    reactor.run()

def runTest(self):
    settings = get_project_settings()
    settings.set('SPIDER_MODULES', ['classes.spiders'])
    try:
        sys.path.append(scrapy_path)
        runner = CrawlerRunner(settings)
        spiders = runner.spider_loader.list()
        self.assertEqual(set(class_pipeline.get_spiders()), set(spiders))
    except Exception:
        pass

def make_crawler(settings, spider_cls=None, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls or TestSpider)

def _run_spiders(ticker_list, start_date, end_date):
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    spider_dict = {
        'symbols': ticker_list,
        'start_date': start_date,
        'end_date': end_date
    }
    runner.crawl(EdgarSpider, **spider_dict)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()

def runspider(self):
    configure_logging(install_root_handler=False)
    s = get_project_settings()
    runner = CrawlerRunner(settings=s)

    @defer.inlineCallbacks
    def crawl(**spargs):
        yield runner.crawl(JDItemInfoSpider, **spargs)
        yield runner.crawl(JDCommentSpider, **spargs)
        reactor.stop()

    crawl(**self.spargs)
    reactor.run()  # block here until the last crawl call is finished