The following 19 code examples, extracted from open-source Python projects, illustrate how to use the scrapy.crawler module.
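Most of the examples below revolve around scrapy.crawler.CrawlerProcess, which runs spiders inside a Twisted reactor. For orientation, here is a minimal sketch of that pattern; the QuotesSpider class and its start URL are illustrative and not taken from the projects above:

import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    """Illustrative spider: yields the title of a single page."""
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}


process = CrawlerProcess(settings={'LOG_ENABLED': False})
process.crawl(QuotesSpider)   # schedule the spider
process.start()               # start the reactor; blocks until crawling finishes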
def __init__(self, feed_file=None, feed_title=None, feed_link=None,
             feed_description=None, crawler_settings=None):
    settings = crawler_settings if crawler_settings else dict(self.default_settings)
    if feed_file:
        settings['FEED_FILE'] = feed_file
    if feed_title:
        settings['FEED_TITLE'] = feed_title
    if feed_link:
        settings['FEED_LINK'] = feed_link
    if feed_description:
        settings['FEED_DESCRIPTION'] = feed_description
    self.crawler = get_crawler(settings_dict=settings)
    self.spider = scrapy.Spider.from_crawler(self.crawler, 'example.com')
    self.spider.parse = lambda response: ()
    item_processor = settings.get('ITEM_PROCESSOR')
    if not item_processor:
        item_processor = RaisedItemPipelineManager
    elif isinstance(item_processor, six.string_types):
        item_processor = load_object(item_processor)
    self.ipm = item_processor.from_crawler(self.crawler)
def test_autologin_request():
    crawler = make_crawler(
        base_settings(), SPLASH_URL='http://192.168.99.100:8050')
    mw = AutologinMiddleware('http://127.0.0.1:8089', crawler)
    al_request = mw._login_request(scrapy.Request('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert al_request.dont_filter
    assert al_request.meta['proxy'] is None
    assert data['url'] == 'http://example.com'
    assert data['settings']['USER_AGENT'] == crawler.settings.get('USER_AGENT')
    assert data['settings'].get('SPLASH_URL') is None
    al_request = mw._login_request(SplashRequest('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert data['url'] == 'http://example.com'
    assert data['settings']['SPLASH_URL'] == crawler.settings.get('SPLASH_URL')
def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print "Item extracted:", item dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # definir el spider para el crawler crawler.crawl(EuropythonSpyder()) # iniciar scrapy print "STARTING ENGINE" crawler.start() #iniciar el crawler llamando al spider definido print "ENGINE STOPPED"
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # Set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)

    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # Set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # Register the spider with the crawler
    crawler.crawl(PydataSpiderDetails())

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
def test_media_pipeline(tmpdir, max_cache):
    crawler = make_crawler(FILES_STORE='file://{}'.format(tmpdir),
                           FILES_MAX_CACHE=max_cache)
    with MockServer(WithFile) as s:
        root_url = s.root_url
        yield crawler.crawl(url=root_url)
    spider = crawler.spider
    assert len(spider.collected_items) == 3

    root_item = find_item('/', spider.collected_items)
    assert len(root_item['objects']) == 2
    file_item = find_item(
        '/file.pdf', root_item['objects'], 'obj_original_url')
    assert file_item['obj_original_url'] == root_url + '/file.pdf'
    assert not file_item['obj_stored_url'].endswith('.pdf')
    with tmpdir.join(file_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS
    assert file_item['content_type'] == 'application/pdf'
    headers = dict(file_item['response_headers'])
    headers.pop('date')
    headers.pop('server')
    assert headers == {'content-type': 'application/pdf',
                       'content-hype': 'very/high'}

    forbidden_item = find_item(
        '/forbidden.pdf', root_item['objects'], 'obj_original_url')
    with tmpdir.join(forbidden_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS * 2

    page_item = find_item('/page?b=2&a=1', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf?allow=true', page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']

    another_page_item = find_item('/another-page', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf', another_page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']
    assert file_item_q['obj_original_url'] == file_item['obj_original_url']
def startCrawler():
    """
    Initiates the web crawler process defined above.

    Arguments:
        None

    Return:
        None
    """
    # Starts a Twisted reactor, configures logging and sets shutdown handlers
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(TwitterSpider)
    process.start()
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(EducatieSpider)
    process.start()
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(DialogSpider)
    process.start()
def __enter__(self):
    responses = self.crawler.signals.send_catch_log(
        signal=signals.spider_opened, spider=self.spider)
    for _, failure in responses:
        if failure:
            failure.raiseException()
    return self
def __exit__(self, exc_type, exc_val, exc_tb):
    responses = self.crawler.signals.send_catch_log(
        signal=signals.spider_closed, spider=self.spider, reason=None)
    for _, failure in responses:
        if failure:
            failure.raiseException()
def test_skip(settings):
    crawler = make_crawler(settings, _AUTOLOGIN_FORCE_SKIP=True)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert set(spider.visited_urls) == {'/', '/login'}
    assert all(not r.meta['autologin_active'] for r in spider.responses)
def test_login(settings, extra_settings=None):
    """ No logout links, just one page after login. """
    crawler = make_crawler(settings, **AL_SETTINGS)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/hidden'}
    response = spider.responses[0]
    assert urlsplit(response.url).path.rstrip('/') == ''
    assert response.meta['autologin_active']
    assert response.meta['autologin_response']['status'] == 'solved'
def test_login_error(settings, extra_settings=None):
    """ Trying to log in with wrong credentials. """
    al_settings = dict(AL_SETTINGS)
    al_settings['AUTOLOGIN_PASSWORD'] = 'wrong'
    crawler = make_crawler(settings, **al_settings)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/login'}
    response = spider.responses[0]
    assert urlsplit(response.url).path.rstrip('/') == ''
    assert not response.meta['autologin_active']
    assert response.meta['autologin_response']['status'] == 'error'
def test_pass_via_meta(settings):
    crawler = make_crawler(settings, spider_cls=PassMetaSpider,
                           AUTOLOGIN_DOWNLOAD_DELAY=0.01)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/hidden'}
def test_login_with_logout(settings, spider_cls=TestSpider):
    """ Login with logout. """
    crawler = make_crawler(settings, spider_cls=spider_cls, **AL_SETTINGS)
    with MockServer(LoginWithLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    mandatory_urls = {'/', '/hidden', '/one', '/two', '/three', '/slow'}
    spider_urls = set(spider.visited_urls)
    assert mandatory_urls.difference(spider_urls) == set()
    assert spider_urls.difference(
        mandatory_urls | {'/l0gout1', '/l0gout2'}) == set()
def test_custom_headers(settings):
    crawler = make_crawler(settings, USER_AGENT='MyCustomAgent', **AL_SETTINGS)
    with MockServer(LoginIfUserAgentOk) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert spider.visited_urls[1] == '/hidden'
def parse(self, response):
    for item in super(StoppingSpider, self).parse(response):
        yield item
    if not self.state.get('was_stopped'):
        self.state['was_stopped'] = True
        self.crawler.stop()
def test_resume(settings):
    crawler = make_crawler(
        settings,
        spider_cls=StoppingSpider,
        JOBDIR=tempfile.mkdtemp(),
        SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleFifoDiskQueue',
        SCHEDULER_MEMORY_QUEUE='scrapy.squeues.FifoMemoryQueue',
        LOG_UNSERIALIZABLE_REQUESTS=True,
        **AL_SETTINGS)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
        # resuming crawl
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 1
    assert set(spider.visited_urls) == {'/hidden'}
def test_disable_logout(settings):
    crawler = make_crawler(settings, **AL_SETTINGS)
    with MockServer(LoginWithContentAfterLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert set(spider.visited_urls) == {'/', '/hidden'}

    crawler = make_crawler(
        settings, AUTOLOGIN_CHECK_LOGOUT=False, **AL_SETTINGS)
    with MockServer(LoginWithContentAfterLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    spider_urls = set(spider.visited_urls)
    assert spider_urls == {'/', '/hidden', '/target'}