The following 5 code examples, extracted from open-source Python projects, illustrate how to use scrapy.Field().
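For context before the examples: scrapy.Field() is normally declared as a class attribute on a scrapy.Item subclass, and the item then only accepts keys declared this way. A minimal sketch, using an illustrative ExampleItem class that does not come from any of the projects below:

import scrapy

class ExampleItem(scrapy.Item):
    # Each Field() call registers a key the item will accept.
    title = scrapy.Field()
    price = scrapy.Field()

item = ExampleItem()
item['title'] = 'sample'   # allowed: 'title' was declared with scrapy.Field()
# item['color'] = 'red'    # would raise KeyError: 'color' was never declared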
def parse(self, response):
    for sel in response.xpath('//*[@id="tb_content"]/div[3]/table/tbody/tr'):
        item = WangdaizhijiaItem()
        item['pm'] = sel.xpath('td')[0].xpath('span/text()').extract()[0]
        item['ptmc'] = sel.xpath('td/a[@target="_blank"]/span/text()').extract()
        item['cjl'] = sel.xpath('td/text()').extract()[0]
        item['pjll'] = sel.xpath('td/text()').extract()[1]
        item['pjjkqx'] = sel.xpath('td/text()').extract()[2]
        item['ljdhje'] = sel.xpath('td/text()').extract()[3]
        yield item

# pm = scrapy.Field()      # rank
# ptmc = scrapy.Field()    # platform name
# cjl = scrapy.Field()     # transaction volume
# pjll = scrapy.Field()    # average interest rate
# pjjkqx = scrapy.Field()  # average loan term
# ljdhje = scrapy.Field()  # cumulative outstanding amount
# //*[@id="tb_content"]/div[3]/table/tbody
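The commented-out lines at the end of this example mirror the item definition the spider relies on. A minimal sketch of that WangdaizhijiaItem class, assuming it lives in the project's items.py and that the pinyin field names mean what the comments above suggest:

import scrapy

class WangdaizhijiaItem(scrapy.Item):
    pm = scrapy.Field()       # rank
    ptmc = scrapy.Field()     # platform name
    cjl = scrapy.Field()      # transaction volume
    pjll = scrapy.Field()     # average interest rate
    pjjkqx = scrapy.Field()   # average loan term
    ljdhje = scrapy.Field()   # cumulative outstanding amount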
def test_single_item_in_the_feed(self):
    class SuperItem(ExtendableItem):
        some_field = scrapy.Field()

        def __init__(self):
            super(SuperItem, self).__init__()
            self.rss = RssItem()

    for item_name, item in self.items.items():
        with CrawlerContext(**self.feed_settings) as context:
            context.ipm.process_item(item, context.spider)
        with open(self.feed_settings['feed_file']) as data, \
                open(os.path.join(os.path.dirname(__file__), 'expected_rss',
                                  '{}.rss'.format(item_name))) as expected:
            self.assertUnorderedXmlEquivalentOutputs(data=data.read(), expected=expected.read())

        super_item = SuperItem()
        super_item.rss = item
        with CrawlerContext(**self.feed_settings) as context:
            context.ipm.process_item(super_item, context.spider)
        with open(self.feed_settings['feed_file']) as data, \
                open(os.path.join(os.path.dirname(__file__), 'expected_rss',
                                  '{}.rss'.format(item_name))) as expected:
            self.assertUnorderedXmlEquivalentOutputs(data=data.read(), expected=expected.read())
def parse(self, response):
    fp = open('ele' + '.json', 'wb')
    fp.write(response.body)
    fp.close()

    jresult = json.loads(response.body)
    items = []
    for itemjson in jresult:
        name = '??'
        recent_order_num = '??'
        average_cost = '??'
        if 'name' in itemjson:
            name = itemjson['name']
        if 'recent_order_num' in itemjson:
            recent_order_num = itemjson['recent_order_num']
        if 'average_cost' in itemjson:
            average_cost = itemjson['average_cost']
        item = ELEItem(name=name, recent_order_num=recent_order_num, average_cost=average_cost)
        # item = ELEItem()
        # item.name = scrapy.Field(dict(name=itemjson['name']))
        # item.average_cost = itemjson['average_cost']
        # item.recent_order_num = itemjson['recent_order_num']
        items.append(item)
    return items
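The keyword arguments passed to ELEItem(...) above imply that the item class declares exactly those three fields. A minimal sketch of such an ELEItem definition, assuming a plain scrapy.Item subclass; the comments describing each field are my reading of the data, not taken from the project:

import scrapy

class ELEItem(scrapy.Item):
    # Fields implied by the keyword arguments used in ELEItem(...) above.
    name = scrapy.Field()              # restaurant name
    recent_order_num = scrapy.Field()  # recent order count
    average_cost = scrapy.Field()      # average cost per order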
def test_item_validation(self):
    invalid_item = RssItem()
    invalid_item.enclosure.url = 'http://example.com/content'
    with self.assertRaisesRegexp(InvalidRssItemAttributesError, 'required attributes .*? not set'):
        with CrawlerContext(**self.feed_settings) as context:
            context.ipm.process_item(invalid_item, context.spider)

    class NonStandardElement(ItemElement):
        first_attribute = ItemElementAttribute(required=True, is_content=True)
        second_attribute = ItemElementAttribute(required=True)

    class NonStandardItem(RssItem):
        element = NonStandardElement()

    invalid_item = NonStandardItem()
    with self.assertRaisesRegexp(InvalidElementValueError, 'Could not assign'):
        invalid_item.element = 'valid value'
    invalid_item.element.first_attribute = 'valid value'
    with self.assertRaisesRegexp(InvalidRssItemAttributesError, 'required attributes .*? not set'):
        with CrawlerContext(**self.feed_settings) as context:
            context.ipm.process_item(invalid_item, context.spider)

    class InvalidSuperItem1(ExtendableItem):
        pass

    class InvalidSuperItem2(ExtendableItem):
        field = scrapy.Field()

    class InvalidSuperItem3(ExtendableItem):
        rss = scrapy.Field()

    for invalid_item_cls in (InvalidSuperItem1, InvalidSuperItem2, InvalidSuperItem3):
        with self.assertRaisesRegexp(InvalidRssItemError, "Item must have 'rss'"):
            with CrawlerContext(**self.feed_settings) as context:
                context.ipm.process_item(invalid_item_cls(), context.spider)
def document_to_item(document_class):
    class DocumentAsItemClass(Item):
        def concrete(self):
            return document_class(**self)

    exclude_fields = dir(EmptyDocument)
    document_fields = [field for field in dir(document_class)
                       if field not in exclude_fields]
    for field in document_fields + ['id']:
        DocumentAsItemClass.fields[field] = Field()
    return DocumentAsItemClass
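Unlike the earlier examples, this one registers fields at runtime by writing scrapy.Field() instances into the item class's fields dict rather than declaring them as class attributes. A self-contained sketch of the same technique, using hypothetical names (make_item_class, DynamicItem, BookItem) instead of the EmptyDocument/document_class machinery above:

import scrapy

def make_item_class(field_names):
    # Build an Item subclass and register one Field per name at runtime,
    # mirroring how document_to_item() populates DocumentAsItemClass.fields.
    class DynamicItem(scrapy.Item):
        pass

    for name in field_names:
        DynamicItem.fields[name] = scrapy.Field()
    return DynamicItem

BookItem = make_item_class(['title', 'author', 'id'])
book = BookItem(title='Example', author='Anon', id=1)
print(dict(book))  # {'title': 'Example', 'author': 'Anon', 'id': 1}

Mutating the class's fields dict after creation works because scrapy.Item checks that dict on every item[key] assignment, which is exactly what document_to_item() relies on.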