Python scrapy module: Field() example source code

We have extracted the following 5 code examples from open-source Python projects to illustrate how to use scrapy.Field().
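
Before diving into the project examples, here is a minimal sketch of the basic pattern (the class and field names are illustrative): fields are declared as class attributes on a scrapy.Item subclass, each bound to a Field() instance.

import scrapy

class ProductItem(scrapy.Item):
    # Field() carries no type information; it simply registers
    # 'name' and 'price' as valid keys on the item.
    name = scrapy.Field()
    price = scrapy.Field()

item = ProductItem(name='sample', price='9.99')
print(item['name'])  # 'sample'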

Project: spider_wdzj_com    Author: basicworld
def parse(self, response):
        # each table row under #tb_content is one lending platform
        for sel in response.xpath('//*[@id="tb_content"]/div[3]/table/tbody/tr'):
            item = WangdaizhijiaItem()
            item['pm'] = sel.xpath('td')[0].xpath('span/text()').extract()[0]
            item['ptmc'] = sel.xpath('td/a[@target="_blank"]/span/text()').extract()
            item['cjl'] = sel.xpath('td/text()').extract()[0]
            item['pjll'] = sel.xpath('td/text()').extract()[1]
            item['pjjkqx'] = sel.xpath('td/text()').extract()[2]
            item['ljdhje'] = sel.xpath('td/text()').extract()[3]
            yield item

            # Fields declared on WangdaizhijiaItem (meanings inferred from
            # the pinyin abbreviations):
            # pm = scrapy.Field()      # ranking
            # ptmc = scrapy.Field()    # platform name
            # cjl = scrapy.Field()     # trading volume
            # pjll = scrapy.Field()    # average interest rate
            # pjjkqx = scrapy.Field()  # average loan term
            # ljdhje = scrapy.Field()  # cumulative outstanding amount
            # table XPath: //*[@id="tb_content"]/div[3]/table/tbody
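
To make the parse() excerpt above self-contained, the item class it fills can be pieced together from the commented-out declarations; this is a plausible sketch, with the field meanings inferred from the pinyin abbreviations rather than taken from the original project:

import scrapy

class WangdaizhijiaItem(scrapy.Item):
    pm = scrapy.Field()      # ranking
    ptmc = scrapy.Field()    # platform name
    cjl = scrapy.Field()     # trading volume
    pjll = scrapy.Field()    # average interest rate
    pjjkqx = scrapy.Field()  # average loan term
    ljdhje = scrapy.Field()  # cumulative outstanding amount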
Project: scrapy_rss    Author: woxcab
def test_single_item_in_the_feed(self):
        class SuperItem(ExtendableItem):
            some_field = scrapy.Field()

            def __init__(self):
                super(SuperItem, self).__init__()
                self.rss = RssItem()

        for item_name, item in self.items.items():
            # first, run the bare RssItem through the item pipeline
            with CrawlerContext(**self.feed_settings) as context:
                context.ipm.process_item(item, context.spider)
            with open(self.feed_settings['feed_file']) as data, \
                 open(os.path.join(os.path.dirname(__file__),
                                   'expected_rss', '{}.rss'.format(item_name))) as expected:
                self.assertUnorderedXmlEquivalentOutputs(data=data.read(), expected=expected.read())

            # then the same item wrapped in a SuperItem must produce the same feed
            super_item = SuperItem()
            super_item.rss = item
            with CrawlerContext(**self.feed_settings) as context:
                context.ipm.process_item(super_item, context.spider)
            with open(self.feed_settings['feed_file']) as data, \
                 open(os.path.join(os.path.dirname(__file__),
                                   'expected_rss', '{}.rss'.format(item_name))) as expected:
                self.assertUnorderedXmlEquivalentOutputs(data=data.read(), expected=expected.read())
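
The test above relies on SuperItem carrying both a regular scrapy field and an embedded RssItem. A hedged sketch of that pattern in spider code, assuming the import path and the title/description elements that scrapy_rss documents:

import scrapy
from scrapy_rss import RssItem
from scrapy_rss.items import ExtendableItem  # import path is an assumption

class CombinedItem(ExtendableItem):
    some_field = scrapy.Field()

    def __init__(self):
        super(CombinedItem, self).__init__()
        self.rss = RssItem()

item = CombinedItem()
item['some_field'] = 'plain scrapy value'
item.rss.title = 'Feed entry title'        # assumed RssItem element
item.rss.description = 'Feed entry body'   # assumed RssItem element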
Project: PYLearning    Author: azheng51714
def parse(self, response):

        # dump the raw response for debugging, then parse it as JSON
        with open('ele.json', 'wb') as fp:
            fp.write(response.body)

        jresult = json.loads(response.body)
        items = []
        for itemjson in jresult:
            # fall back to a placeholder when a key is missing
            name = 'unknown'
            recent_order_num = 'unknown'
            average_cost = 'unknown'
            if 'name' in itemjson:
                name = itemjson['name']
            if 'recent_order_num' in itemjson:
                recent_order_num = itemjson['recent_order_num']
            if 'average_cost' in itemjson:
                average_cost = itemjson['average_cost']

            item = ELEItem(name=name, recent_order_num=recent_order_num, average_cost=average_cost)
            # item = ELEItem()
            # item.name = scrapy.Field(dict(name=itemjson['name']))  # wrong: Field() belongs in the item class, not on instances
            # item.average_cost = itemjson['average_cost']
            # item.recent_order_num = itemjson['recent_order_num']
            items.append(item)

        return items
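
The ELEItem class is not shown in this excerpt; given the keyword arguments passed to it above, a plausible definition is:

import scrapy

class ELEItem(scrapy.Item):
    name = scrapy.Field()
    recent_order_num = scrapy.Field()
    average_cost = scrapy.Field()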
Project: scrapy_rss    Author: woxcab
def test_item_validation(self):
        # an enclosure URL alone is not enough: other required attributes are still unset
        invalid_item = RssItem()
        invalid_item.enclosure.url = 'http://example.com/content'

        with self.assertRaisesRegexp(InvalidRssItemAttributesError, 'required attributes .*? not set'):
            with CrawlerContext(**self.feed_settings) as context:
                context.ipm.process_item(invalid_item, context.spider)

        class NonStandardElement(ItemElement):
            first_attribute = ItemElementAttribute(required=True, is_content=True)
            second_attribute = ItemElementAttribute(required=True)

        class NonStandardItem(RssItem):
            element = NonStandardElement()

        invalid_item = NonStandardItem()
        with self.assertRaisesRegexp(InvalidElementValueError, 'Could not assign'):
            invalid_item.element = 'valid value'
        invalid_item.element.first_attribute = 'valid value'

        with self.assertRaisesRegexp(InvalidRssItemAttributesError, 'required attributes .*? not set'):
            with CrawlerContext(**self.feed_settings) as context:
                context.ipm.process_item(invalid_item, context.spider)

        class InvalidSuperItem1(ExtendableItem):
            pass

        class InvalidSuperItem2(ExtendableItem):
            field = scrapy.Field()

        class InvalidSuperItem3(ExtendableItem):
            rss = scrapy.Field()

        for invalid_item_cls in (InvalidSuperItem1, InvalidSuperItem2, InvalidSuperItem3):
            with self.assertRaisesRegexp(InvalidRssItemError, "Item must have 'rss'"):
                with CrawlerContext(**self.feed_settings) as context:
                    context.ipm.process_item(invalid_item_cls(), context.spider)
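
For contrast, a hedged sketch of an item that should pass this validation, assuming scrapy_rss enforces the RSS 2.0 rules the errors above point to (an item needs a title or description, and an enclosure needs url, length, and type):

valid_item = RssItem()
valid_item.title = 'Entry title'                          # assumed: title or description required
valid_item.enclosure.url = 'http://example.com/content'
valid_item.enclosure.length = 1024                        # assumed required alongside url
valid_item.enclosure.type = 'text/plain'                  # assumed required alongside url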
Project: vigilante    Author: VigilantePolitico
def document_to_item(document_class):
    class DocumentAsItemClass(Item):
        def concrete(self):
            # rebuild the original document from the item's field values
            return document_class(**self)

    # expose only the attributes that document_class adds beyond
    # EmptyDocument, plus an explicit 'id' field
    exclude_fields = dir(EmptyDocument)
    document_fields = [field for field in dir(document_class) if field not in exclude_fields]

    for field in document_fields + ['id']:
        DocumentAsItemClass.fields[field] = Field()

    return DocumentAsItemClass
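
A hedged usage sketch with a hypothetical document class (UserDocument is illustrative; EmptyDocument and the real document API belong to the host project):

class UserDocument:
    # hypothetical stand-in for a real document class
    name = None
    email = None

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

UserItem = document_to_item(UserDocument)
item = UserItem(name='Ada', email='ada@example.com')
document = item.concrete()  # rebuilds a UserDocument from the item's fields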