The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.crawler.CrawlerProcess().
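All of the examples below follow the same basic pattern: build a CrawlerProcess from the project settings (or from an ad-hoc settings dict), register one or more spiders with crawl(), and call start(), which runs the Twisted reactor and blocks until every crawl has finished. Here is a minimal, self-contained sketch of that pattern; the MySpider class and the quotes.toscrape.com URL are placeholders chosen for illustration and are not taken from any of the projects below.

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class MySpider(scrapy.Spider):
    # Hypothetical spider, used only to make the sketch runnable.
    name = 'my_spider'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}


def main():
    # Inside a Scrapy project, get_project_settings() picks up settings.py;
    # passing a plain dict of settings also works (see the examples below).
    process = CrawlerProcess(get_project_settings())
    process.crawl(MySpider)   # schedule the spider; may be called several times
    process.start()           # start the reactor; blocks until crawling is done


if __name__ == '__main__':
    main()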
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))
    logging.debug('finish this spider:%s\n\n' % name)
def index_job(link):
    """Index a single page."""
    print("index page : %s" % link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True
        }
    })
    process.crawl(crawler.SingleSpider, start_urls=[link], es_client=client, redis_conn=redis_conn)
    process.start()  # block until finished
def crawl(spider, *args, **kwargs):
    """Run a spider.

    Args:
        spider (str): The Scrapy `name` of the spider.
    """
    settings = get_project_settings()
    if kwargs.get('ignore_robots_txt') is True:
        settings.attributes.get('ROBOTSTXT_OBEY').value = False

    proc = CrawlerProcess(settings)
    try:
        proc.crawl(spider, *args, **kwargs)
        proc.start()
    except KeyError as err:
        # Log a warning if the scraper name is invalid instead of
        # causing the job to fail.
        # NOTE: If there is any other type of error, the job will fail, and all
        # the jobs that depend on it will fail as well.
        logger.warning(err.args[0])
def load_crawler(self, crawler, url, ignore_regex):
    """
    Loads the given crawler with the given url.

    :param class crawler: class of the crawler to load
    :param str url: url to start the crawler with
    :param regex ignore_regex: to be able to ignore urls that match this
        regex code
    """
    self.process = CrawlerProcess(self.cfg.get_scrapy_options())
    self.process.crawl(
        crawler,
        self.helper,
        url=url,
        config=self.cfg,
        ignore_regex=ignore_regex)
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    Which spider(s) to run is determined in the following order:

    1. Spider(s) given as argument(s)
    2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. If no arguments are given and no spiders
    are configured, all available spiders will be used to crawl.
    """
    settings = ctx.obj['settings']
    if stats:
        settings.set('STATS_CLASS',
                     'scrapy.statscollectors.MemoryStatsCollector')

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error('Please specify what spiders you want to run!')
    else:
        for spider in spiders:
            logger.info('Starting crawl of {} ...'.format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool('HTTPCACHE_ENABLED'):
        run_cleanup_cache(settings)
def cleanup(ctx):
    """
    Clean up old cache entries.

    By default, entries older than 14 days will be removed. This value can
    be overridden in the config file.
    """
    settings = ctx.obj['settings']

    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool('HTTPCACHE_ENABLED'):
        logger.error('Cache is disabled, will not clean up cache dir.')
        return 1

    run_cleanup_cache(settings)
def runspider(name, product_id):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % product_id,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runscrapy start spider:%s' % name)
        data = {'product_id': product_id}
        process.crawl(name, **data)
        process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))
    logging.info('finish this spider:%s\n\n' % name)
def main():
    # process = CrawlerProcess()
    # process.crawl(CommentarySpider.CommentarySpider, args=['-s', 'JOBDIR=crawls/CommentarySpider-1'])
    # process.crawl(jqkaCommentarySpider.jqkaCommentarySpider, args=['-s', 'JOBDIR=crawls/CommentarySpider-2'])
    # process.crawl(sinaCommentarySpider.sinaCommentarySpider, args=['-s', 'JOBDIR=crawls/CommentarySpider-3'])
    # process.start()

    # scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'CommentarySpider', '-s', 'JOBDIR=crawls/CommentarySpider-1'])
    scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'jqkaCommentarySpider', '-s', 'JOBDIR=crawls/CommentarySpider-2'])
    # scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'sinaCommentarySpider', '-s', 'JOBDIR=crawls/CommentarySpider-3'])
def __init__(self, accounts, loglevel, remote=False):
    self.accounts = settings.SCRAPY_ACCOUNTS
    if accounts:
        self.accounts.update(accounts)
    self.loglevel = loglevel
    self.settings = self._get_settings()
    # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
    self.settings.set('LOG_LEVEL', loglevel)
    if remote:
        # Configure remote logging and disable the scrapy logging.
        self.settings.set('LOG_ENABLED', False)
        logger = logging.getLogger()
        handler = ScrapySocketHandler(
            'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
        handler.setLevel(loglevel)
        logger.addHandler(handler)

    self.process = CrawlerProcess(self.settings)
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
def fetch_url(cls, session, msites, platform_id, purpose):
    """Actual method to do the fetch url action.

    Parameters
    ----------
    msites : list
        a list of Site model class, contains info to build spiders.
    platform_id : int
        id of platform, bind fetched url with this id.
    purpose : {'update', 'archive'}
        indicate which url to fetch.
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES', {'hoaxy.crawl.pipelines.UrlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    for ms in msites:
        for sm in build_spiders_iter(ms, purpose):
            sm['kwargs']['session'] = session
            sm['kwargs']['platform_id'] = platform_id
            process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
    process.start()
def fetch_html(cls, session, url_tuples):
    """Actual method to do the fetch html action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuples (id, raw, status_code).
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES', {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.warning('Number of urls to fetch html for: %s', len(url_tuples))
    process.crawl(
        HtmlSpider,
        session=session,
        url_tuples=url_tuples,
        excluded_domains=cls.conf['crawl']['excluded_domains'])
    process.start()
def parse_article(cls, session, url_tuples):
    """Actual method to do the parse-to-article action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuples (id, created_at, date_published, canonical, site_id)
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES', {'hoaxy.crawl.pipelines.ArticlePipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.info('Number of urls to parse: %s', len(url_tuples))
    process.crawl(
        ArticleParserSpider,
        session=session,
        url_tuples=url_tuples,
        api_key=cls.conf['crawl']['article_parser']['webparser_api_key'],)
    process.start()
def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print "Item extracted:", item dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # definir el spider para el crawler crawler.crawl(EuropythonSpyder()) # iniciar scrapy print "STARTING ENGINE" crawler.start() #iniciar el crawler llamando al spider definido print "ENGINE STOPPED"
def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print "Item Extraido:", item dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # definir el spider para el crawler crawler.crawl(BloggerSpider()) # iniciar scrapy print "STARTING ENGINE" crawler.start() #iniciar el crawler llamando al spider definido print "ENGINE STOPPED"
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # set up a signal to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)

    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails())

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
def ProcessRun():
    process = CrawlerProcess(get_project_settings())

    # run a single spider by name
    process.crawl("news")
    # process.crawl("favorite_spider")

    # run every spider in the project
    for spider_name in process.spider_loader.list():
        # print(spider_name)
        process.crawl(spider_name)

    process.start()
def run(max_page=5):
    settings = get_project_settings()
    settings.set('MAX_PAGE', max_page, 'project')
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CaoLiuSpider)
    crawler_process.start()
def __init__(self):
    self.crawler = CrawlerProcess(settings)
def crawl(spider_name, results_dir):
    """Run one or more spiders."""
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl("pixiv")
    process.start()
def crawl(spiders, query, start, end, page):
    spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
    process = CrawlerProcess(get_project_settings())
    process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
    process.start()
def start_gui(process):
    """
    A function that takes care of starting the GUI and stops the Scrapy crawler
    process when the program is exited.

    :param CrawlerProcess process: The scrapy crawler process that is used to
        scrape the web. The instance is used for stopping the process.
    """

    def create_ui(screen):
        """
        A function passed to the curses wrapper for safe execution of the terminal GUI.

        :param screen: The screen parameter to run the GUI. Sent from the curses wrapper.
        """
        GUI.screen = screen  # all the static variables of the GUI class are initialized
        GUI.strings = []  # the list of songs is empty initially
        GUI.init_display()  # init the variables required for the GUI
        GUI.update_on_key()  # starts a loop that waits for key input and acts accordingly

        curses.nocbreak()
        curses.echo()
        curses.endwin()
        GUI.gui_stopped = True

    curses.wrapper(create_ui)
    process.stop()  # stopping the scrapy crawler process
def startCrawler():
    """
    Initiates the web crawler process defined above.

    Arguments: None
    Return: None
    """
    # Starts a Twisted reactor, configures logging and sets shutdown handlers
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(TwitterSpider)
    process.start()
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings['LOG_ENABLED'] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
def main():
    from scrapy.crawler import CrawlerProcess, Crawler

    process = CrawlerProcess()
    process.crawl(EducatieSpider)
    process.start()
def main():
    process = CrawlerProcess()
    process.crawl(ComunicatiiSpider)
    process.start()
def main():
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(DialogSpider)
    process.start()
def start_scrapy(self):
    self.process = CrawlerProcess(self.setting)
    self.crawl()
    reactor.run()
def getReviewCount(url):
    # Get the number of reviews
    process = CrawlerProcess(get_project_settings())
    process.crawl(review_count_spider, start_url=url)
    process.start()
def crawl_naver_blog():
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(NaverBlogSpider)
    process.start()
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
def get_top_blogs():
    """
    Get URLs of the most popular blog posts for the most popular programming
    languages on GitHub.
    """
    languages = get_top_languages(30, settings['LANGUAGES_DATA'])
    process = CrawlerProcess(settings)
    process.crawl(BlogsSpider, languages)
    process.start()  # the script will block here until the crawling is done
def estimate_traffic():
    """Analyze traffic of the scraped blogs."""
    process = CrawlerProcess(settings)
    blogs_file = get_latest_file(settings['BLOGS_FEED_DIR'])
    with open(blogs_file) as f:
        blogs = json.load(f)
    process.crawl(TrafficSpider, blogs)
    process.start()  # the script will block here until the crawling is done
def run_crawler(self):
    process = CrawlerProcess(self.settings)
    if self.args.anime:
        if self.args.skip is None or 'nyaa' not in self.args.skip:
            process.crawl(Nyaa, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'shanaproject' not in self.args.skip:
            process.crawl(Shanaproject, title=self.search, season=self.args.season, file=self.args.file)
    else:
        if self.args.skip is None or 'zooqle' not in self.args.skip:
            process.crawl(Zooqle, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or '1337x' not in self.args.skip:
            process.crawl(_1337x, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'eztv' not in self.args.skip:
            process.crawl(Eztv, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'rarbg' not in self.args.skip:
            process.crawl(Rarbg, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'torrentdownloads' not in self.args.skip:
            process.crawl(Torrentdownloads, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'limetorrents' not in self.args.skip:
            process.crawl(Limetorrents, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'thepiratebay' not in self.args.skip:
            process.crawl(Thepiratebay, title=self.search, season=self.args.season, file=self.args.file)
    process.start()
def explore_job(link):
    """Explore a website and index all urls (redis-rq process)."""
    print("explore website at : %s" % link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "last_crawl": datetime.now()
    })

    # start the crawler
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'DOWNLOAD_DELAY': 0.25,
        'ROBOTSTXT_OBEY': True,
        'HTTPCACHE_ENABLED': False,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': True,
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': True,
            'scrapy.extensions.closespider.CloseSpider': True
        },
        'CLOSESPIDER_PAGECOUNT': 500  # only for debug
    })
    process.crawl(crawler.Crawler,
                  allowed_domains=[urlparse(link).netloc],
                  start_urls=[link],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()

    return 1
def _crawl(deck_ids):
    logging.info('Crawling decks from HearthStats ...')
    decks = list()
    cp = CrawlerProcess({
        'ITEM_PIPELINES': {'hsdata.hearthstats.HearthStatsScrapyPipeline': 1}})
    cp.crawl(HearthStatsScrapySpider, deck_ids=deck_ids, decks=decks)
    cp.start()
    return decks
def _crawl(deck_ids):
    logging.info('Crawling deck results from HSBox ...')
    results = list()
    cp = CrawlerProcess({
        'ITEM_PIPELINES': {'hsdata.hsbox.HSBoxScrapyPipeline': 1}})
    cp.crawl(HSBoxScrapySpider, deck_ids=deck_ids, results=results)
    cp.start()
    logging.info('Fetched results for {} decks from HSBox'.format(len(results)))
    return results
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL

    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""
        start_urls = [url]

    process.crawl(ThisSpider)
    process.start()
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
def test_wizard_spider():
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(WizardSpider)
    crawler.start()
def run():
    # load the configuration from settings.py
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)

    # several spiders can be scheduled in the same process
    # process.crawl(Spider1)
    # process.crawl(Spider2)
    process.crawl(GuaziSaleSpider)

    # start crawling; blocks until all spiders have finished
    process.start()
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()