Python scrapy.crawler module, CrawlerProcess() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.crawler.CrawlerProcess().
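Nearly every example below follows the same basic pattern: build a CrawlerProcess from a settings object or a plain settings dict, register one or more spiders with crawl(), and call start(), which blocks until crawling finishes. The minimal sketch below shows that pattern in isolation; the QuotesSpider class and the quotes.toscrape.com test site are placeholders for illustration and are not taken from any of the projects listed here.

import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    """Placeholder spider used only to illustrate the CrawlerProcess pattern."""
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # yield one item per quote found on the page
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').extract_first()}


if __name__ == '__main__':
    # the settings argument may be a dict or a Settings object; scripts inside a
    # Scrapy project usually pass get_project_settings() instead of a hand-built dict
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0 (compatible; example-bot)'})
    process.crawl(QuotesSpider)  # register the spider (a class, or its name inside a project)
    process.start()              # blocks until all crawling is finished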

Project: rental    Author: meihuanyu
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Project: web-search-engine    Author: AnthonySigogne
def index_job(link) :
    """
    Index a single page.
    """
    print("index page : %s"%link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except:
        return 0

    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT':100,
        'REDIRECT_ENABLED':False,
        'SPIDER_MIDDLEWARES' : {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True
        }
    })
    process.crawl(crawler.SingleSpider, start_urls=[link,], es_client=client, redis_conn=redis_conn)
    process.start() # block until finished
Project: osp-scraper    Author: opensyllabus
def crawl(spider, *args, **kwargs):
    """Run a spider.

    Args:
        spider (str): The Scrapy `name` of the spider.
    """
    settings = get_project_settings()
    if kwargs.get('ignore_robots_txt') is True:
        settings.attributes.get('ROBOTSTXT_OBEY').value = False

    proc = CrawlerProcess(settings)
    try:
        proc.crawl(spider, *args, **kwargs)
        proc.start()
    except KeyError as err:
        # Log a warning if the scraper name is invalid instead of
        # causing the job to fail.
        # NOTE: If there is any other type of error, the job will fail, and all
        # the jobs that depend on it will fail as well.
        logger.warning(err.args[0])
Project: Newscrawler    Author: JBH168
def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: regex pattern; urls that match it are ignored
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)
Project: feeds    Author: nblock
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)

      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj['settings']
    if stats:
        settings.set('STATS_CLASS',
                     'scrapy.statscollectors.MemoryStatsCollector')

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error('Please specify what spiders you want to run!')
    else:
        for spider in spiders:
            logger.info('Starting crawl of {} ...'.format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool('HTTPCACHE_ENABLED'):
        run_cleanup_cache(settings)
Project: feeds    Author: nblock
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 14 days will be removed. This value can be
    overridden in the config file.
    """
    settings = ctx.obj['settings']
    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool('HTTPCACHE_ENABLED'):
        logger.error('Cache is disabled, will not clean up cache dir.')
        return 1

    run_cleanup_cache(settings)
Project: jd_comment    Author: awolfly9
def runspider(name, product_id):
    configure_logging(install_root_handler = False)
    logging.basicConfig(
            filename = 'log/%s.log' % product_id,
            format = '%(levelname)s %(asctime)s: %(message)s',
            level = logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runscrapy start spider:%s' % name)
        data = {
            'product_id': product_id
        }
        process.crawl(name, **data)
        process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))
        pass

    logging.info('finish this spider:%s\n\n' % name)
Project: wallstreetcnScrapy    Author: jianzhichun
def main():
#     process = CrawlerProcess()
#     process.crawl(CommentarySpider.CommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-1'])
#     process.crawl(jqkaCommentarySpider.jqkaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-2'])
#     process.crawl(sinaCommentarySpider.sinaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-3'])
#     process.start()
#     scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'CommentarySpider','-s','JOBDIR=crawls/CommentarySpider-1'])
    scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'jqkaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-2'])
#     scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'sinaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-3'])
Project: IPProxyTool    Author: awolfly9
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Project: news-please    Author: fhamborg
def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: regex pattern; urls that match it are ignored
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)
Project: kmanga    Author: aplanas
def __init__(self, accounts, loglevel, remote=False):
        self.accounts = settings.SCRAPY_ACCOUNTS
        if accounts:
            self.accounts.update(accounts)
        self.loglevel = loglevel
        self.settings = self._get_settings()
        # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
        self.settings.set('LOG_LEVEL', loglevel)
        if remote:
            # Configure remote logging and disable the scrapy logging.
            self.settings.set('LOG_ENABLED', False)
            logger = logging.getLogger()
            handler = ScrapySocketHandler(
                'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
            handler.setLevel(loglevel)
            logger.addHandler(handler)

        self.process = CrawlerProcess(self.settings)
Project: collectors    Author: opentrials
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Project: hoaxy-backend    Author: IUNetSci
    def fetch_url(cls, session, msites, platform_id, purpose):
        """Actual method that performs the fetch-url action.

        Parameters
        ----------
            msites : list
                a list of Site model instances; contains the info needed to build spiders.
            platform_id : int
                id of the platform; fetched urls are bound to this id.
            purpose : {'update', 'archive'}
                indicates which urls to fetch.
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.UrlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        for ms in msites:
            for sm in build_spiders_iter(ms, purpose):
                sm['kwargs']['session'] = session
                sm['kwargs']['platform_id'] = platform_id
                process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
        process.start()
Project: hoaxy-backend    Author: IUNetSci
    def fetch_html(cls, session, url_tuples):
        """Actual method that performs the fetch-html action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuples (id, raw, status_code).
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.warning('Number of url to fetch html is: %s', len(url_tuples))
        process.crawl(
            HtmlSpider,
            session=session,
            url_tuples=url_tuples,
            excluded_domains=cls.conf['crawl']['excluded_domains'])
        process.start()
Project: hoaxy-backend    Author: IUNetSci
    def parse_article(cls, session, url_tuples):
        """Actual method that performs the parse-article action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuples (id, created_at, date_published,
                canonical, site_id).
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.ArticlePipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.info('Number of url to parse is: %s', len(url_tuples))
        process.crawl(
            ArticleParserSpider,
            session=session,
            url_tuples=url_tuples,
            api_key=cls.conf['crawl']['article_parser']['webparser_api_key'],)
        process.start()
Project: pydata_webscraping    Author: jmortega
def main():
    """Main routine for running the spider."""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder)

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by invoking the defined spider
    print("ENGINE STOPPED")
Project: pydata_webscraping    Author: jmortega
def main():
    """Main routine for running the spider."""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(BloggerSpider)

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by invoking the defined spider
    print("ENGINE STOPPED")
Project: pydata_webscraping    Author: jmortega
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails)

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
Project: Python_Stock_Github    Author: DavidFnck
def ProcessRun():

    process = CrawlerProcess(get_project_settings())
    # run a specific spider by name
    process.crawl("news")
    # process.crawl("favorite_spider")
    # run every spider registered in the project
    for spider_name in process.spider_loader.list():
        # print spider_name
        process.crawl(spider_name)
    process.start()
Project: caoliuscrapy    Author: leyle
def run(max_page=5):
    settings = get_project_settings()
    settings.set('MAX_PAGE', max_page, 'project')
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CaoLiuSpider)
    crawler_process.start()
Project: makinami    Author: Coderhypo
def __init__(self):
        self.crawler = CrawlerProcess(settings)
Project: StrepHit    Author: Wikidata
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Project: Pixiv-Spider    Author: cathor01
def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl("pixiv")
    process.start()
Project: wechat-crawler    Author: DMGbupt
def crawl(spiders, query, start, end, page):
    spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
    process = CrawlerProcess(get_project_settings())
    process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
    process.start()
Project: Music-Scraper    Author: srivatsan-ramesh
def start_gui(process):
    """
    A function that takes care of starting the GUI and stopping the Scrapy crawler process when the program exits.

    :param CrawlerProcess process: The scrapy crawler process that is used to scrape the web. The instance is used for stopping the process.
    """

    def create_ui(screen):
        """
        A function passed to the curses wrapper for safe execution of the terminal GUI.

        :param screen: The screen parameter to run the GUI. Sent from the curses wrapper.
        """

        GUI.screen = screen  # All the static variables of the GUI class are initialized
        GUI.strings = []  # the list of songs is empty initially
        GUI.init_display()  # init the variables required for GUI
        GUI.update_on_key()  # Starts a loop that waits for key input and acts accordingly

        curses.nocbreak()
        curses.echo()
        curses.endwin()
        GUI.gui_stopped = True

    curses.wrapper(create_ui)
    process.stop()  # Stopping the scrapy crawler process
Project: twitter-sentiment    Author: words-sdsc
def startCrawler():
    """ Initiates the process of the web crawler above.

    Arguments: None

    Return: None
    """

    # Starts a Twisted reactor, configures logging, and sets shutdown handlers
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(TwitterSpider)
    process.start()
Project: feeds    Author: nblock
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings['LOG_ENABLED'] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
Project: czl-scrape    Author: code4romania
def main():
    from scrapy.crawler import CrawlerProcess, Crawler
    process = CrawlerProcess()
    process.crawl(EducatieSpider)
    process.start()
Project: czl-scrape    Author: code4romania
def main():
    process = CrawlerProcess()
    process.crawl(ComunicatiiSpider)
    process.start()
Project: czl-scrape    Author: code4romania
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(DialogSpider)
    process.start()
Project: decoration-design-crawler    Author: imflyn
def start_scrapy(self):
        self.process = CrawlerProcess(self.setting)
        self.crawl()
        reactor.run()
Project: decoration-design-crawler    Author: imflyn
def start_scrapy(self):
        self.process = CrawlerProcess(self.setting)
        self.crawl()
        reactor.run()
Project: Get-Positive    Author: M-shin
def getReviewCount(url):
  # Get the number of reviews
  process = CrawlerProcess(get_project_settings())
  process.crawl(review_count_spider, start_url=url)
  process.start()
Project: aquam    Author: xncbf
def crawl_naver_blog():
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(NaverBlogSpider)
    process.start()
Project: wayback-machine-scraper    Author: sangaline
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
Project: blog_analysis    Author: Databrawl
def get_top_blogs():
    """
    Get URLs of the most popular blog posts for the most popular programming
    languages on GitHub.
    """
    languages = get_top_languages(30, settings['LANGUAGES_DATA'])
    process = CrawlerProcess(settings)
    process.crawl(BlogsSpider, languages)
    process.start()  # the script will block here until the crawling is done
Project: blog_analysis    Author: Databrawl
def estimate_traffic():
    """
    Analyze traffic of the scraped blogs.
    """
    process = CrawlerProcess(settings)
    blogs_file = get_latest_file(settings['BLOGS_FEED_DIR'])
    with open(blogs_file) as f:
        blogs = json.load(f)
    process.crawl(TrafficSpider, blogs)
    process.start()  # the script will block here until the crawling is done
Project: tobber    Author: fchamicapereira
def run_crawler(self):

        process = CrawlerProcess(self.settings)

        if self.args.anime:
            if self.args.skip is None or 'nyaa' not in self.args.skip:
                process.crawl(Nyaa, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'shanaproject' not in self.args.skip:
                process.crawl(Shanaproject, title=self.search, season=self.args.season, file=self.args.file)

        else:
            if self.args.skip is None or 'zooqle' not in self.args.skip:
                process.crawl(Zooqle, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or '1337x' not in self.args.skip:
                process.crawl(_1337x, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'eztv' not in self.args.skip:
                process.crawl(Eztv, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'rarbg' not in self.args.skip:
                process.crawl(Rarbg, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'torrentdownloads' not in self.args.skip:
                process.crawl(Torrentdownloads, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'limetorrents' not in self.args.skip:
                process.crawl(Limetorrents, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'thepiratebay' not in self.args.skip:
                process.crawl(Thepiratebay, title=self.search, season=self.args.season, file=self.args.file)

        process.start()
Project: web-search-engine    Author: AnthonySigogne
def explore_job(link) :
    """
    Explore a website and index all urls (redis-rq process).
    """
    print("explore website at : %s"%link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage":link,
        "domain":domain,
        "last_crawl":datetime.now()
    })

    # start crawler
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT':100,
        'DOWNLOAD_DELAY':0.25,
        'ROBOTSTXT_OBEY':True,
        'HTTPCACHE_ENABLED':False,
        'REDIRECT_ENABLED':False,
        'SPIDER_MIDDLEWARES' : {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware':True,
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware':True,
            'scrapy.extensions.closespider.CloseSpider':True
        },
        'CLOSESPIDER_PAGECOUNT':500 #only for debug
    })
    process.crawl(crawler.Crawler, allowed_domains=[urlparse(link).netloc], start_urls = [link,], es_client=client, redis_conn=redis_conn)
    process.start()

    return 1
Project: hsdata    Author: youfou
def _crawl(deck_ids):
        logging.info('Crawling deck data from HearthStats')
        decks = list()
        cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hearthstats.HearthStatsScrapyPipeline': 1}})
        cp.crawl(HearthStatsScrapySpider, deck_ids=deck_ids, decks=decks)
        cp.start()
        return decks
Project: hsdata    Author: youfou
def _crawl(deck_ids):
        logging.info('Crawling deck data from HSBox')
        results = list()
        cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hsbox.HSBoxScrapyPipeline': 1}})
        cp.crawl(HSBoxScrapySpider, deck_ids=deck_ids, results=results)
        cp.start()
        logging.info('Fetched HSBox data for {} decks'.format(len(results)))
        return results
Project: Pysearch2.0    Author: Pysearch
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""

        start_urls = [url]
    process.crawl(ThisSpider)
    process.start()
Project: Pysearch2.0    Author: Pysearch
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
Project: hermes    Author: xutaoding
def test_waizard_spider():
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(WizardSpider)
    crawler.start()
Project: myaddons    Author: luohuayong
def run():
    # load the settings from settings.py
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)

    # multiple spiders can be added here
    # process.crawl(Spider1)
    # process.crawl(Spider2)
    process.crawl(GuaziSaleSpider)

    # start crawling; this blocks until all spiders have finished
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()