Python scrapy.crawler module, CrawlerProcess() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.crawler.CrawlerProcess().
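Nearly every example below follows the same basic pattern: build a CrawlerProcess from a settings object or a plain settings dict, register one or more spiders with crawl(), and call start(), which blocks until crawling finishes. The minimal sketch below shows that pattern in isolation; the QuotesSpider class and the quotes.toscrape.com test site are placeholders for illustration and are not taken from any of the projects listed here.

import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    """Placeholder spider used only to illustrate the CrawlerProcess pattern."""
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # yield one item per quote found on the page
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').extract_first()}


if __name__ == '__main__':
    # the settings argument may be a dict or a Settings object; scripts inside a
    # Scrapy project usually pass get_project_settings() instead of a hand-built dict
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0 (compatible; example-bot)'})
    process.crawl(QuotesSpider)  # register the spider (a class, or its name inside a project)
    process.start()              # blocks until all crawling is finished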

Project: rental    Author: meihuanyu
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Project: web-search-engine    Author: AnthonySigogne
def index_job(link) :
    """
    Index a single page.
    """
    print("index page : %s"%link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except:
        return 0

    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT':100,
        'REDIRECT_ENABLED':False,
        'SPIDER_MIDDLEWARES' : {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True
        }
    })
    process.crawl(crawler.SingleSpider, start_urls=[link,], es_client=client, redis_conn=redis_conn)
    process.start() # block until finished
Project: osp-scraper    Author: opensyllabus
def crawl(spider, *args, **kwargs):
    """Run a spider.

    Args:
        spider (str): The Scrapy `name` of the spider.
    """
    settings = get_project_settings()
    if kwargs.get('ignore_robots_txt') is True:
        settings.attributes.get('ROBOTSTXT_OBEY').value = False

    proc = CrawlerProcess(settings)
    try:
        proc.crawl(spider, *args, **kwargs)
        proc.start()
    except KeyError as err:
        # Log a warning if the scraper name is invalid instead of
        # causing the job to fail.
        # NOTE: If there is any other type of error, the job will fail, and all
        # the jobs that depend on it will fail as well.
        logger.warning(err.args[0])
Project: Newscrawler    Author: JBH168
def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: regex pattern; urls that match it are ignored
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)
Project: feeds    Author: nblock
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)

      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj['settings']
    if stats:
        settings.set('STATS_CLASS',
                     'scrapy.statscollectors.MemoryStatsCollector')

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error('Please specify what spiders you want to run!')
    else:
        for spider in spiders:
            logger.info('Starting crawl of {} ...'.format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool('HTTPCACHE_ENABLED'):
        run_cleanup_cache(settings)
Project: feeds    Author: nblock
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 14 days will be removed. This value can be
    overridden in the config file.
    """
    settings = ctx.obj['settings']
    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool('HTTPCACHE_ENABLED'):
        logger.error('Cache is disabled, will not clean up cache dir.')
        return 1

    run_cleanup_cache(settings)
Project: jd_comment    Author: awolfly9
def runspider(name, product_id):
    configure_logging(install_root_handler = False)
    logging.basicConfig(
            filename = 'log/%s.log' % product_id,
            format = '%(levelname)s %(asctime)s: %(message)s',
            level = logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runscrapy start spider:%s' % name)
        data = {
            'product_id': product_id
        }
        process.crawl(name, **data)
        process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))
        pass

    logging.info('finish this spider:%s\n\n' % name)
Project: wallstreetcnScrapy    Author: jianzhichun
def main():
#     process = CrawlerProcess()
#     process.crawl(CommentarySpider.CommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-1'])
#     process.crawl(jqkaCommentarySpider.jqkaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-2'])
#     process.crawl(sinaCommentarySpider.sinaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-3'])
#     process.start()
#     scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'CommentarySpider','-s','JOBDIR=crawls/CommentarySpider-1'])
    scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'jqkaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-2'])
#     scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'sinaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-3'])
Project: IPProxyTool    Author: awolfly9
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Project: news-please    Author: fhamborg
def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: regex pattern; urls that match it are ignored
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)
Project: kmanga    Author: aplanas
def __init__(self, accounts, loglevel, remote=False):
        self.accounts = settings.SCRAPY_ACCOUNTS
        if accounts:
            self.accounts.update(accounts)
        self.loglevel = loglevel
        self.settings = self._get_settings()
        # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
        self.settings.set('LOG_LEVEL', loglevel)
        if remote:
            # Configure remote logging and disable the scrapy logging.
            self.settings.set('LOG_ENABLED', False)
            logger = logging.getLogger()
            handler = ScrapySocketHandler(
                'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
            handler.setLevel(loglevel)
            logger.addHandler(handler)

        self.process = CrawlerProcess(self.settings)
Project: collectors    Author: opentrials
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Project: hoaxy-backend    Author: IUNetSci
    def fetch_url(cls, session, msites, platform_id, purpose):
        """Actual method that performs the fetch-url action.

        Parameters
        ----------
            msites : list
                a list of Site model instances; contains the info needed to build spiders.
            platform_id : int
                id of the platform; fetched urls are bound to this id.
            purpose : {'update', 'archive'}
                indicates which urls to fetch.
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.UrlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        for ms in msites:
            for sm in build_spiders_iter(ms, purpose):
                sm['kwargs']['session'] = session
                sm['kwargs']['platform_id'] = platform_id
                process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
        process.start()
Project: hoaxy-backend    Author: IUNetSci
    def fetch_html(cls, session, url_tuples):
        """Actual method that performs the fetch-html action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuples (id, raw, status_code).
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.warning('Number of url to fetch html is: %s', len(url_tuples))
        process.crawl(
            HtmlSpider,
            session=session,
            url_tuples=url_tuples,
            excluded_domains=cls.conf['crawl']['excluded_domains'])
        process.start()
Project: hoaxy-backend    Author: IUNetSci
    def parse_article(cls, session, url_tuples):
        """Actual method that performs the parse-article action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuples (id, created_at, date_published,
                canonical, site_id).
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.ArticlePipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.info('Number of url to parse is: %s', len(url_tuples))
        process.crawl(
            ArticleParserSpider,
            session=session,
            url_tuples=url_tuples,
            api_key=cls.conf['crawl']['article_parser']['webparser_api_key'],)
        process.start()
Project: pydata_webscraping    Author: jmortega
def main():
    """Main routine for running the spider."""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder)

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by invoking the defined spider
    print("ENGINE STOPPED")
Project: pydata_webscraping    Author: jmortega
def main():
    """Main routine for running the spider."""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(BloggerSpider)

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by invoking the defined spider
    print("ENGINE STOPPED")
Project: pydata_webscraping    Author: jmortega
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails)

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
Project: Python_Stock_Github    Author: DavidFnck
def ProcessRun():

    process = CrawlerProcess(get_project_settings())
    # run a specific spider by name
    process.crawl("news")
    # process.crawl("favorite_spider")
    # run every spider registered in the project
    for spider_name in process.spider_loader.list():
        # print spider_name
        process.crawl(spider_name)
    process.start()
Project: caoliuscrapy    Author: leyle
def run(max_page=5):
    settings = get_project_settings()
    settings.set('MAX_PAGE', max_page, 'project')
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CaoLiuSpider)
    crawler_process.start()
Project: makinami    Author: Coderhypo
def __init__(self):
        self.crawler = CrawlerProcess(settings)
Project: StrepHit    Author: Wikidata
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Project: Pixiv-Spider    Author: cathor01
def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl("pixiv")
    process.start()
Project: wechat-crawler    Author: DMGbupt
def crawl(spiders, query, start, end, page):
    spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
    process = CrawlerProcess(get_project_settings())
    process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
    process.start()
Project: Music-Scraper    Author: srivatsan-ramesh
def start_gui(process):
    """
    A function that takes care of starting the GUI and stopping the Scrapy crawler process when the program exits.

    :param CrawlerProcess process: The scrapy crawler process that is used to scrape the web. The instance is used for stopping the process.
    """

    def create_ui(screen):
        """
        A function passed to the curses wrapper for safe execution of the terminal GUI.

        :param screen: The screen parameter to run the GUI. Sent from the curses wrapper.
        """

        GUI.screen = screen  # All the static variables of the GUI class are initialized
        GUI.strings = []  # the list of songs is empty initially
        GUI.init_display()  # init the variables required for GUI
        GUI.update_on_key()  # Starts a loop that waits for key input and acts accordingly

        curses.nocbreak()
        curses.echo()
        curses.endwin()
        GUI.gui_stopped = True

    curses.wrapper(create_ui)
    process.stop()  # Stopping the scrapy crawler process
Project: twitter-sentiment    Author: words-sdsc
def startCrawler():
    """ Initiates the process of the web crawler above.

    Arguments: None

    Return: None
    """

    # Starts a Twisted reactor, configures logging, and sets shutdown handlers
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(TwitterSpider)
    process.start()
Project: feeds    Author: nblock
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings['LOG_ENABLED'] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
Project: czl-scrape    Author: code4romania
def main():
    from scrapy.crawler import CrawlerProcess, Crawler
    process = CrawlerProcess()
    process.crawl(EducatieSpider)
    process.start()
Project: czl-scrape    Author: code4romania
def main():
    process = CrawlerProcess()
    process.crawl(ComunicatiiSpider)
    process.start()
Project: czl-scrape    Author: code4romania
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(DialogSpider)
    process.start()
Project: decoration-design-crawler    Author: imflyn
def start_scrapy(self):
        self.process = CrawlerProcess(self.setting)
        self.crawl()
        reactor.run()
Project: decoration-design-crawler    Author: imflyn
def start_scrapy(self):
        self.process = CrawlerProcess(self.setting)
        self.crawl()
        reactor.run()
Project: Get-Positive    Author: M-shin
def getReviewCount(url):
  # Get the number of reviews
  process = CrawlerProcess(get_project_settings())
  process.crawl(review_count_spider, start_url=url)
  process.start()
Project: aquam    Author: xncbf
def crawl_naver_blog():
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(NaverBlogSpider)
    process.start()
Project: wayback-machine-scraper    Author: sangaline
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
Project: blog_analysis    Author: Databrawl
def get_top_blogs():
    """
    Get URLs of the most popular blog posts for the most popular programming
    languages on GitHub.
    """
    languages = get_top_languages(30, settings['LANGUAGES_DATA'])
    process = CrawlerProcess(settings)
    process.crawl(BlogsSpider, languages)
    process.start()  # the script will block here until the crawling is done
Project: blog_analysis    Author: Databrawl
def estimate_traffic():
    """
    Analyze traffic of the scraped blogs.
    """
    process = CrawlerProcess(settings)
    blogs_file = get_latest_file(settings['BLOGS_FEED_DIR'])
    with open(blogs_file) as f:
        blogs = json.load(f)
    process.crawl(TrafficSpider, blogs)
    process.start()  # the script will block here until the crawling is done
Project: tobber    Author: fchamicapereira
def run_crawler(self):

        process = CrawlerProcess(self.settings)

        if self.args.anime:
            if self.args.skip is None or 'nyaa' not in self.args.skip:
                process.crawl(Nyaa, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'shanaproject' not in self.args.skip:
                process.crawl(Shanaproject, title=self.search, season=self.args.season, file=self.args.file)

        else:
            if self.args.skip is None or 'zooqle' not in self.args.skip:
                process.crawl(Zooqle, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or '1337x' not in self.args.skip:
                process.crawl(_1337x, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'eztv' not in self.args.skip:
                process.crawl(Eztv, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'rarbg' not in self.args.skip:
                process.crawl(Rarbg, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'torrentdownloads' not in self.args.skip:
                process.crawl(Torrentdownloads, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'limetorrents' not in self.args.skip:
                process.crawl(Limetorrents, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'thepiratebay' not in self.args.skip:
                process.crawl(Thepiratebay, title=self.search, season=self.args.season, file=self.args.file)

        process.start()
Project: web-search-engine    Author: AnthonySigogne
def explore_job(link) :
    """
    Explore a website and index all urls (redis-rq process).
    """
    print("explore website at : %s"%link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage":link,
        "domain":domain,
        "last_crawl":datetime.now()
    })

    # start crawler
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT':100,
        'DOWNLOAD_DELAY':0.25,
        'ROBOTSTXT_OBEY':True,
        'HTTPCACHE_ENABLED':False,
        'REDIRECT_ENABLED':False,
        'SPIDER_MIDDLEWARES' : {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware':True,
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware':True,
            'scrapy.extensions.closespider.CloseSpider':True
        },
        'CLOSESPIDER_PAGECOUNT':500 #only for debug
    })
    process.crawl(crawler.Crawler, allowed_domains=[urlparse(link).netloc], start_urls = [link,], es_client=client, redis_conn=redis_conn)
    process.start()

    return 1
Project: hsdata    Author: youfou
def _crawl(deck_ids):
        logging.info('Crawling deck data from HearthStats')
        decks = list()
        cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hearthstats.HearthStatsScrapyPipeline': 1}})
        cp.crawl(HearthStatsScrapySpider, deck_ids=deck_ids, decks=decks)
        cp.start()
        return decks
Project: hsdata    Author: youfou
def _crawl(deck_ids):
        logging.info('Crawling deck data from HSBox')
        results = list()
        cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hsbox.HSBoxScrapyPipeline': 1}})
        cp.crawl(HSBoxScrapySpider, deck_ids=deck_ids, results=results)
        cp.start()
        logging.info('Fetched HSBox data for {} decks'.format(len(results)))
        return results
Project: Pysearch2.0    Author: Pysearch
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""

        start_urls = [url]
    process.crawl(ThisSpider)
    process.start()
Project: Pysearch2.0    Author: Pysearch
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
Project: hermes    Author: xutaoding
def test_waizard_spider():
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(WizardSpider)
    crawler.start()
Project: myaddons    Author: luohuayong
def run():
    # load the settings from settings.py
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)

    # multiple spiders can be added here
    # process.crawl(Spider1)
    # process.crawl(Spider2)
    process.crawl(GuaziSaleSpider)

    # start crawling; this blocks until all spiders have finished
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Project: collectors    Author: opentrials
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()