The following 5 code examples, extracted from open-source Python projects, illustrate how to use scrapy.Item().
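The snippets below reference Item classes (EuropythonItem, PydatascheduleItem) whose definitions are not shown. As a minimal sketch of the underlying API, a scrapy.Item subclass declares one scrapy.Field() per key it accepts, and values are read and written with dict-style access (TalkItem and its fields are hypothetical names chosen for illustration):

import scrapy

class TalkItem(scrapy.Item):
    # Each scrapy.Field() declares a key this item accepts.
    title = scrapy.Field()
    speaker = scrapy.Field()

item = TalkItem(title='Example talk')  # keyword initialization is also supported
item['speaker'] = 'Jane Doe'           # dict-style write to a declared field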
def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print "Item extracted:", item dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # definir el spider para el crawler crawler.crawl(EuropythonSpyder()) # iniciar scrapy print "STARTING ENGINE" crawler.start() #iniciar el crawler llamando al spider definido print "ENGINE STOPPED"
def main():
    """Main routine for running the Spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # Set up a signal handler to catch scraped items.
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # Set up the crawler.
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # Register the spider with the crawler.
    crawler.crawl(PydataSpiderDetails())

    print "STARTING ENGINE"
    crawler.start()  # start the crawler
    print "ENGINE STOPPED"
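Note that scrapy.xlib.pydispatch was removed in later Scrapy releases. On a current (Python 3) Scrapy, the same item-catching hook can be attached through the crawler's own signal manager instead; a sketch under that assumption, reusing the spider class from the example above:

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

def catch_item(item, response, spider):
    # item_scraped handlers receive the item plus the response and spider.
    print("Item extracted:", item)

settings = Settings()
settings.set("LOG_ENABLED", False)

process = CrawlerProcess(settings)
crawler = process.create_crawler(PydataSpiderDetails)  # pass the spider class, not an instance
crawler.signals.connect(catch_item, signal=signals.item_scraped)
process.crawl(crawler)
process.start()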
def __setattr__(self, name, value):
    # Declared item fields may only be written with dict-style access;
    # any other attribute is set normally.
    if name in self.fields:
        raise AttributeError("Use item[{!r}] = {!r} to set field value".format(name, value))
    super(BaseItem, self).__setattr__(name, value)
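With this check in place, dict-style writes to declared fields succeed, attribute-style writes to them raise, and names outside self.fields fall through to the normal attribute machinery. A quick illustration of the three cases under this variant (reusing the hypothetical TalkItem from above):

item = TalkItem()
item['title'] = 'PyData keynote'    # declared field: dict-style write is allowed
try:
    item.title = 'PyData keynote'   # declared field: attribute-style write is rejected
except AttributeError:
    pass                            # "Use item['title'] = 'PyData keynote' to set field value"
item._seen = True                   # not in self.fields, falls through to object.__setattr__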
def process_response(self, response):
    # Populate one EuropythonItem from a talk detail page.
    item = EuropythonItem()
    print response
    item['title'] = response.xpath("//div[contains(@class, 'grid-100')]//h1/text()").extract()
    item['author'] = response.xpath("//div[contains(@class, 'talk-speakers')]//a[1]/text()").extract()
    item['description'] = response.xpath("//div[contains(@class, 'cms')]//p//text()").extract()
    item['date'] = response.xpath("//section[contains(@class, 'talk when')]/strong/text()").extract()
    item['tags'] = response.xpath("//div[contains(@class, 'all-tags')]/span/text()").extract()
    return item
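The EuropythonItem class itself is not part of this snippet; judging from the fields populated above, its declaration would presumably look like the following (a reconstruction, not the project's actual code):

import scrapy

class EuropythonItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    description = scrapy.Field()
    date = scrapy.Field()
    tags = scrapy.Field()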
def parse_details(self, response):
    print 'parsed link %s' % response.url
    hxs = scrapy.Selector(response)
    item = PydatascheduleItem()
    item['speaker'] = hxs.select('//div[@class="col-md-8"]/h4/a/text()').extract()[0].strip()
    item['url'] = response.url
    item['talk'] = hxs.select('//div[@class="col-md-8"]/h2/text()').extract()[0].strip()
    item['time'] = hxs.select('//div[@class="col-md-8"]/h4/text()').extract()[0].replace("\n", "").strip()
    item['description'] = hxs.select('//div[@class="description"]/p/text()').extract()[0]
    return item
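Selector.select() is the old-style spelling; current Scrapy exposes the same queries as response.xpath(), and .extract()[0] is usually written .get(), which also avoids the IndexError when a query matches nothing. The first extraction above in that style (same XPath, modern API):

item['speaker'] = response.xpath('//div[@class="col-md-8"]/h4/a/text()').get(default='').strip()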