The following 14 code examples, extracted from open-source Python projects, illustrate how to use scrapy.settings (primarily the scrapy.settings.Settings class and a spider's self.settings).
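Before the project excerpts, here is a minimal standalone sketch, not taken from any of the projects below; the setting names USER_AGENT, LOG_ENABLED and DEPTH_LIMIT are real Scrapy settings, but the values are invented for illustration. It shows the basic Settings API the excerpts rely on: constructing a scrapy.settings.Settings object, populating it, and reading values back with the typed getters that the spiders below call via self.settings.

from scrapy.settings import Settings

# A Settings object behaves like a dict with typed getters; spiders read
# their crawler's configuration the same way via self.settings.
settings = Settings()
settings.set("USER_AGENT", "example-bot/1.0")  # illustrative value
settings.set("LOG_ENABLED", False)
settings.update({"DEPTH_LIMIT": 3})            # illustrative value

assert settings.get("USER_AGENT") == "example-bot/1.0"
assert settings.getbool("LOG_ENABLED") is False
assert settings.getint("DEPTH_LIMIT") == 3
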
def handle_form(self, url, form, meta):
    action = canonicalize_url(urljoin(url, form.action))
    if not self.link_extractor.matches(action):
        return
    if (meta['form'] == 'search' and
            self.settings.getbool('CRAZY_SEARCH_ENABLED') and
            action not in self.handled_search_forms and
            len(self.handled_search_forms) <
            self.settings.getint('MAX_DOMAIN_SEARCH_FORMS')):
        self.logger.debug('Found a search form at %s', url)
        self.handled_search_forms.add(action)
        for request_kwargs in search_form_requests(
                url, form, meta,
                search_terms=self.search_terms,
                extra_search_terms=self.extra_search_terms):
            request_kwargs['meta'] = {'is_search': True}
            request_kwargs['cls'] = \
                SplashFormRequest if self.use_splash else FormRequest
            yield request_kwargs

def main():
    """Main routine for running the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the defined spider
    print("ENGINE STOPPED")

def main():
    """Main routine for running the Spider"""
    from scrapy.xlib.pydispatch import dispatcher

    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails())

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")

def make_crawler(**extra_settings):
    settings = Settings()
    settings['ITEM_PIPELINES'] = {
        'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
        'tests.utils.CollectorPipeline': 100,
    }
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(Spider)

def start_requests(self):
    self.use_splash = using_splash(self.settings)
    for url in self.start_urls:
        yield self.make_request(url, callback=self.parse_first)

def make_request(
        self, url, callback=None, meta=None, cls=None, **kwargs):
    callback = callback or self.parse
    cls = cls or (SplashRequest if self.use_splash else Request)
    if self.use_splash:
        settings = self.settings
        splash_args = {
            'lua_source': self.lua_source,
            'js_source': self.js_source,
            'run_hh': settings.getbool('RUN_HH'),
            'return_png': settings.getbool('SCREENSHOT'),
            'images_enabled': settings.getbool('IMAGES_ENABLED'),
        }
        for s in ['VIEWPORT_WIDTH', 'VIEWPORT_HEIGHT',
                  'SCREENSHOT_WIDTH', 'SCREENSHOT_HEIGHT']:
            if self.settings.get(s):
                splash_args[s.lower()] = self.settings.getint(s)
        if self.settings.getbool('ADBLOCK'):
            splash_args['filters'] = 'fanboy-annoyance,easylist'
        if self.settings.getbool('FORCE_TOR'):
            splash_args['proxy'] = 'tor'
        kwargs.update(dict(
            args=splash_args,
            endpoint='execute',
            cache_args=['lua_source', 'js_source'],
        ))
    meta = meta or {}
    meta['avoid_dup_content'] = True
    return cls(url, callback=callback, meta=meta, **kwargs)

def parse_first(self, response):
    allowed = allowed_re(
        response.url, self.settings.getbool('HARD_URL_CONSTRAINT'))
    if allowed not in self.allowed:
        self.allowed.add(allowed)
        # Reset link extractors to pick up with the latest self.allowed regexps
        self._reset_link_extractors()
        self.logger.info('Updated allowed regexps: %s', self.allowed)
    yield from self.parse(response)

def text_cdr_item(self, response, *, follow_urls, metadata):
    if self.settings.get('FILES_STORE'):
        media_urls = self.media_urls(response, follow_urls)
    else:
        media_urls = []
    return text_cdr_item(
        response,
        crawler_name=self.settings.get('CDR_CRAWLER'),
        team_name=self.settings.get('CDR_TEAM'),
        # will be downloaded by UndercrawlerMediaPipeline
        objects=media_urls,
        metadata=metadata,
    )

def _looks_like_logout(self, link, response):
    if not self.settings.getbool('AUTOLOGIN_ENABLED') or not \
            response.meta.get('autologin_active'):
        return False
    return link_looks_like_logout(link)

def _take_screenshot(self, response) -> Optional[str]:
    screenshot = response.data.get('png') if self.use_splash else None
    if not screenshot:
        return None
    if self._screenshot_dest is None:
        self._screenshot_dest = Path(
            self.settings.get('SCREENSHOT_DEST', 'screenshots'))
        self._screenshot_dest.mkdir(parents=True, exist_ok=True)
    path = self._screenshot_dest.joinpath(
        '{prefix}{uuid}.png'.format(
            prefix=self.settings.get('SCREENSHOT_PREFIX', ''),
            uuid=uuid.uuid4()))
    path.write_bytes(b64decode(screenshot))
    self.logger.debug('Saved %s screenshot to %s' % (response, path))
    return str(path)

def get_scrapy_settings(self, item_pipeline=None, hostname=None):
    """
    Get a scrapy settings dictionary to use for crawling web applications.
    :param item_pipeline: The item pipeline configuration to configure in the settings.
    :param hostname: The hostname to request by default in all Scrapy requests.
    :return: A scrapy settings dictionary to use for crawling web applications.
    """
    item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
    return scrapy.settings.Settings(values={
        "CONCURRENT_ITEMS": self.concurrent_items,
        "CONCURRENT_REQUESTS": self.concurrent_requests,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en",
            "Host": hostname,
        },
        "DEPTH_LIMIT": self.depth_limit,
        "DEPTH_PRIORITY": self.depth_priority,
        "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
        "EXTENSIONS": {
            "scrapy.extensions.telnet.TelnetConsole": None,
        },
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
            "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
        },
        "SPIDER_MIDDLEWARES": {
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
        },
        "DOWNLOAD_MAXSIZE": self.max_size,
        "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
        "ITEM_PIPELINES": item_pipeline,
        "LOG_LEVEL": config.log_crawling_level,
        "TELNETCONSOLE_ENABLED": self.enable_telnet,
        "USER_AGENT": self.user_agent,
    })

def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()

def crawling_config(self):
    """
    Get a dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    :return: A dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    """
    return self._crawling_config

# Representation and Comparison

def crawl_endpoint_to_file(
        self,
        ip_address=None,
        port=None,
        hostname=None,
        use_ssl=False,
        use_sni=False,
        start_urls=[],
        in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the results
    to a local file.
    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn off a separate process for the crawl.
    This enables us to call this method multiple times in the same process, as a Twisted
    reactor can only be started and stopped once per process.
    :return: A tuple containing (1) the string containing the local file path where crawling
    results are stored and (2) a ScrapyResultWrapper configured to process the contents of
    the file.
    """
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)