The following four code examples, extracted from open-source Python projects, illustrate how to use scrapy.crawler.Crawler().
def default(self, o):
    """Serialize objects the stock JSON encoder rejects.

    Handles datetime/date/time, Decimal, Twisted Deferreds, and Scrapy's
    BaseItem/Request/Response/Crawler objects; anything else is delegated
    to the parent encoder's ``default`` (which raises TypeError).
    """
    if isinstance(o, datetime.datetime):
        # Full timestamp: configured date format, a space, then time format.
        return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
    if isinstance(o, datetime.date):
        return o.strftime(self.DATE_FORMAT)
    if isinstance(o, datetime.time):
        return o.strftime(self.TIME_FORMAT)
    if isinstance(o, decimal.Decimal):
        # str() preserves exact decimal digits (float() would not).
        return str(o)
    if isinstance(o, defer.Deferred):
        # A pending Deferred has no meaningful JSON value; emit its repr.
        return str(o)
    if isinstance(o, BaseItem):
        return dict(o)
    if isinstance(o, Request):
        return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
    if isinstance(o, Response):
        return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
    if isinstance(o, Crawler):
        # Crawlers serialize as their collected stats dict.
        return o.stats.get_stats()
    return super(ScrapyJSONEncoder, self).default(o)
def run_spider():
    """Run JenkinsJobSpider with JsonWriterPipeline and block until done.

    Side effects: starts the Twisted reactor (blocks the calling thread)
    and connects *callback* to the spider_closed signal.
    """
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })
    # enable remote server certificate verification
    # see http://doc.scrapy.org/en/latest/topics/settings.html#downloader-clientcontextfactory
    settings.set('DOWNLOADER_CLIENTCONTEXTFACTORY',
                 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory')
    # uncomment below line to enable the logging for debug
    # configure_logging()
    crawler = Crawler(JenkinsJobSpider, settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
def test_spider_output_handling(self):
    """Check the scraper processes RssItem, ExtendableItem and RssedItem.

    Passing each item through _process_spidermw_output must not raise;
    the spider is opened before and closed after the calls.
    """
    spider = self.MySpider()
    scraper = Scraper(Crawler(spider))
    scraper.open_spider(spider)
    # One call per supported item class; success means no exception.
    scraper._process_spidermw_output(RssItem(), None, None, None)
    scraper._process_spidermw_output(ExtendableItem(), None, None, None)
    scraper._process_spidermw_output(RssedItem(), None, None, None)
    scraper.close_spider(spider)
def make_queue(redis_server, cls: type, slots=None, skip_cache=True,
               settings=None, hints=None) -> BaseRequestQueue:
    """Build a *cls* request queue bound to a throwaway test spider.

    :param redis_server: redis connection passed to the queue as ``server``
    :param cls: BaseRequestQueue subclass to instantiate
    :param slots: mutable mapping injected as ``slots_mock`` (new dict if None)
    :param skip_cache: forwarded to the queue constructor
    :param settings: Scrapy settings for logging and the crawler
    :param hints: optional URLs assigned to ``spider.hint_urls``
    """
    global logging_configured
    if not logging_configured:
        # Configure Scrapy logging only once per process.
        configure_logging(settings=settings)
        logging_configured = True
    crawler = Crawler(Spider, settings=settings)
    if slots is None:
        # Fresh dict per call — avoids sharing state between queues.
        slots = {}
    spider = Spider.from_crawler(crawler, 'test_dd_spider')
    if hints:
        spider.hint_urls = hints
    return cls(server=redis_server, spider=spider, key=SCHEDULER_QUEUE_KEY,
               slots_mock=slots, skip_cache=skip_cache)