Python scrapy.crawler module: CrawlerRunner() example source code

The following 13 code examples, extracted from open-source Python projects, illustrate how to use scrapy.crawler.CrawlerRunner().
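Before the project excerpts, here is a minimal, self-contained sketch of the pattern they all share: create a CrawlerRunner, schedule one or more crawls, start the Twisted reactor, and stop it once the returned Deferred fires. The spider name and URL below are illustrative placeholders, not taken from any of the projects that follow.

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class QuotesSpider(scrapy.Spider):
    # placeholder spider used only for this illustration
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        yield {'url': response.url}


configure_logging()
runner = CrawlerRunner()             # unlike CrawlerProcess, does not manage the reactor itself
d = runner.crawl(QuotesSpider)       # returns a Deferred that fires when the crawl finishes
d.addBoth(lambda _: reactor.stop())  # shut the reactor down once the crawl is done
reactor.run()                        # blocks here until then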

Project: ArticlePusher    Author: aforwardz    | project source | file source
def launch_crawlers(crawler_class, exclusion=None):
    settings = get_settings()
    configure_logging(settings=settings)
    launcher = CrawlerRunner(settings)
    crawlers = launcher.spider_loader.list()
    crawlers = [c for c in crawlers if crawler_class in c]
    if exclusion:
        for c in settings.get(exclusion, []):
            crawlers.remove(c)

    try:
        for crawler in crawlers:
            launcher.crawl(crawler)
        d = launcher.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
        return True
    except Exception as e:
        launch_logger.error('launch_crawlers failed | exception:\n{excep}'
                            .format(excep=e))
        return False
Project: PythonScrapyBasicSetup    Author: matejbasic    | project source | file source
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run() # block until the last call
Project: BlogSpider    Author: hack4code    | project source | file source
def crawl(args):
    spids = args.get('spiders')
    configure_logging(SETTINGS,
                      install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(SETTINGS)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [loader.load(spid)
               for spid in spids
               if spid in loader.list()]
    if not spiders:
        return False

    random.shuffle(spiders)
    for __ in spiders:
        runner.crawl(__)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    logger.info('crawl reactor starting ...')
    reactor.run()
    logger.info('crawl reactor stopped')
Project: domain-discovery-crawler    Author: TeamHG-Memex    | project source | file source
def make_crawler(spider_cls=ATestBaseSpider, **extra_settings):
    # clean up queue before starting spider
    assert spider_cls.name.startswith('test_'), 'pass a special test spider'
    redis_server = redis.from_url('redis://localhost')
    name = spider_cls.name
    redis_server.delete(
        SCHEDULER_DUPEFILTER_KEY % {'spider': name},
        *redis_server.keys(
            SCHEDULER_QUEUE_KEY % {'spider': name} + '*'))

    settings = Settings()
    settings.setmodule(dd_crawler.settings)
    settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls)
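create_crawler() only builds the Crawler object; the tests that call this helper still have to drive it themselves. A sketch of how such a crawler is typically run inside a Twisted-aware test (the URL keyword argument and the collected_items attribute are assumptions about the surrounding test suite, not part of the project above):

from twisted.internet import defer

@defer.inlineCallbacks
def test_collects_items():
    crawler = make_crawler()
    # Crawler.crawl() returns a Deferred; extra kwargs are passed to the spider.
    yield crawler.crawl(url='http://localhost:8781')   # hypothetical local test server
    # CollectorPipeline is assumed to stash scraped items on the spider instance.
    assert crawler.spider.collected_items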
Project: BlogSpider    Author: hack4code    | project source | file source
def test_spider(setting):
    setting = setting.copy()
    spid = str(uuid.uuid4())
    setting['_id'] = spid
    try:
        cls = SpiderFactory.mkspider(setting)
    except SpiderFactoryException as e:
        logger.error('Error in test_spider SpiderFactory[%s]',
                     e)
        return False
    url = SETTINGS['TEMP_SPIDER_STATS_URL']
    TEST_SETTINGS = {
        'EXTENSIONS': {'mydm.extensions.ExtensionStats': 900,
                       'scrapy.extensions.logstats.LogStats': None,
                       'scrapy.extensions.spiderstate.SpiderState': None,
                       'scrapy.extensions.telnet.TelnetConsole': None, },
        'SPIDER_STATS_URL': url,
        'BOT_NAME': 'TestSpider',
        'WEBSERVICE_ENABLED': False,
        'TELNETCONSOLE_ENABLED': False,
        'LOG_LEVEL': 'INFO',
        'LOG_FORMAT': '%(asctime)s-%(levelname)s: %(message)s',
        'LOG_DATEFORMAT': '%Y-%m-%d %H:%M:%S'
    }

    configure_logging(TEST_SETTINGS,
                      install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(TEST_SETTINGS)
    d = runner.crawl(cls)
    d.addBoth(lambda _: reactor.stop())
    logger.info('test_spider reactor starting ...')
    reactor.run()
    logger.info('test_spider reactor stopped')
    stats = get_stats(url,
                      [spid])
    n = stats[spid]
    return n > 0
Project: osp-scraper    Author: opensyllabus    | project source | file source
def run(cls):
        runner = CrawlerRunner(get_project_settings())

        @defer.inlineCallbacks
        def deferred_crawl():
            for spider, args, kwargs in cls.queue:
                try:
                    yield runner.crawl(spider, *args, **kwargs)
                except KeyError as err:
                    # Log a warning if the scraper name is invalid instead of
                    # causing the job to fail.
                    # NOTE: If there is any other type of error, the job will
                    # fail, and all the jobs that depend on it will fail as
                    # well.
                    logger.warning(err.args[0])

            # XXX: If all the names fail, then trying to run
            # `reactor.stop()` will give an "Unhandled error in
            # Deferred" complaint and hang.  It will also hang in
            # general if no spiders have been run.  I assume there's
            # some twisted-way to handle this, but for now, just log an
            # error.
            if reactor.running:
                reactor.stop()
            else:
                logger.critical("LocalQueue: No valid scraper names found.")

        deferred_crawl()
        reactor.run()
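The "twisted way" the XXX comment above is looking for is usually to chain the shutdown onto the Deferred returned by runner.join(), and to skip starting the reactor altogether when nothing was scheduled. A minimal sketch under those assumptions (crawl_all and its arguments are illustrative, not part of the project above):

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

def crawl_all(spider_classes, settings=None):
    if not spider_classes:
        # Nothing to schedule: never start the reactor, so nothing can hang.
        return
    runner = CrawlerRunner(settings)
    for spider_cls in spider_classes:
        runner.crawl(spider_cls)
    # join() returns a Deferred that fires once every scheduled crawl is done.
    runner.join().addBoth(lambda _: reactor.stop())
    reactor.run()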
Project: scrapy-cdr    Author: TeamHG-Memex    | project source | file source
def make_crawler(**extra_settings):
    settings = Settings()
    settings['ITEM_PIPELINES'] = {
        'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
        'tests.utils.CollectorPipeline': 100,
    }
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(Spider)
Project: undercrawler    Author: TeamHG-Memex    | project source | file source
def make_crawler(settings, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(BaseSpider)
Project: ProxyPool    Author: Time1ess    | project source | file source
def run(self, args, opts):
        conn = redis.Redis(decode_responses=True)
        runner = CrawlerRunner(get_project_settings())
        try:
            rules = Rule.loads()
            if not rules:
                raise ValueError
        except ValueError:
            print('Error in loading Redis rules, fallback to CSV rules')
            rules = Rule.loads('csv')
        for rule in rules:
            rule.save()
            if rule.name in self.excludes:
                continue
            if conn.hget('Rule:' + rule.name, 'status') == 'started':
                d = runner.crawl(ProxySpider, rule)
                # Mark the rule as 'finished' once its crawl ends; bind rule as a
                # default argument so each callback keeps its own rule (avoids the
                # late-binding bug of closing over the loop variable).
                d.addBoth(lambda _, rule=rule: conn.hset(
                    'Rule:' + rule.name, 'status', 'finished'))
        rule_maintainer = RuleMaintainer(conn, runner)
        proxy_maintainer = ProxyMaintainer(conn)
        schedule_maintainer = ScheduleMaintainer(conn)
        lc = task.LoopingCall(rule_maintainer)
        lc.start(1)
        lc = task.LoopingCall(proxy_maintainer)
        lc.start(0.5)
        lc = task.LoopingCall(schedule_maintainer)
        lc.start(10)
        reactor.run()
Project: dancedeets-monorepo    Author: mikelambert    | project source | file source
def runTest(self):
        settings = get_project_settings()
        settings.set('SPIDER_MODULES', ['classes.spiders'])
        try:
            sys.path.append(scrapy_path)
            runner = CrawlerRunner(settings)
            spiders = runner.spider_loader.list()
            self.assertEqual(set(class_pipeline.get_spiders()), set(spiders))
        except:
            pass
Project: autologin-middleware    Author: TeamHG-Memex    | project source | file source
def make_crawler(settings, spider_cls=None, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls or TestSpider)
Project: py-investment    Author: kprestel    | project source | file source
def _run_spiders(ticker_list, start_date, end_date):
        configure_logging()
        runner = CrawlerRunner(settings=get_project_settings())

        spider_dict = {
            'symbols': ticker_list,
            'start_date': start_date,
            'end_date': end_date
        }
        runner.crawl(EdgarSpider, **spider_dict)
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Project: jd_analysis    Author: awolfly9    | project source | file source
def runspider(self):
        configure_logging(install_root_handler=False)
        s = get_project_settings()
        runner = CrawlerRunner(settings=s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(JDItemInfoSpider, **spargs)
            yield runner.crawl(JDCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished
