我们从 Python 开源项目中提取了以下 1 个代码示例，用于说明如何使用 scrapy.linkextractors 模块。
def parse(self, response):
    """Scrape one movie listing page into a single item.

    Each field of the yielded ``DoubanTopMoviesItem`` is a list holding
    every match of its XPath on the page, in document order. Entries at
    the same index are presumably meant to describe the same movie; that
    only holds if every movie block contains all selected nodes
    (TODO confirm against the live markup).

    Args:
        response: Scrapy response for a listing page; only ``xpath()`` is
            used here.

    Yields:
        One ``DoubanTopMoviesItem`` with the keys ``title_ch``,
        ``rating_num``, ``rating_count``, ``image_urls`` and ``topid``.
    """
    item = DoubanTopMoviesItem()

    # First <span class="title"> under each <div class="hd">.
    item['title_ch'] = response.xpath(
        '//div[@class="hd"]//span[@class="title"][1]/text()').extract()

    # Disabled fields, kept for reference. NOTE(review): the original
    # explanation was mis-encoded; presumably these were turned off because
    # not every movie has a second title / "other" span / quote, which would
    # break index alignment between the lists -- confirm before re-enabling.
    # en_list = response.xpath('//div[@class="hd"]//span[@class="title"][2]/text()').extract()
    # item['title_en'] = [en.replace('\xa0/\xa0','').replace(' ','') for en in en_list]
    # ht_list = response.xpath('//div[@class="hd"]//span[@class="other"]/text()').extract()
    # item['title_ht'] = [ht.replace('\xa0/\xa0','').replace(' ','') for ht in ht_list]
    # detail_list = response.xpath('//div[@class="bd"]/p[1]/text()').extract()
    # item['detail'] = [detail.replace(' ', '').replace('\xa0', '').replace('\n', '') for detail in detail_list]
    # item['quote'] = response.xpath('//span[@class="inq"]/text()').extract()

    item['rating_num'] = response.xpath(
        '//div[@class="star"]/span[2]/text()').extract()

    # The fourth span's text carries the vote count with extra wording around
    # it; keep only the first run of digits. A raw string avoids the invalid
    # "\d" escape warning, and a text node without digits maps to '' instead
    # of raising IndexError, so this list stays aligned with the others.
    count_list = response.xpath(
        '//div[@class="star"]/span[4]/text()').extract()
    digits = re.compile(r'\d+')
    item['rating_count'] = [
        match.group() if match else ''
        for match in map(digits.search, count_list)
    ]

    item['image_urls'] = response.xpath(
        '//div[@class="pic"]/a/img/@src').extract()
    item['topid'] = response.xpath('//div[@class="pic"]/em/text()').extract()
    yield item

    # Disabled manual pagination: follow the page's <link rel="next"> href.
    # new_url = response.xpath('//link[@rel="next"]/@href').extract_first()
    # if new_url:
    #     next_url = self.base_url+new_url
    #     yield scrapy.Request(next_url, callback=self.parse)


######------- Alternative: let a CrawlSpider rule with LinkExtractor follow the pages -------#####
# (Banner wording reconstructed; the original text was mis-encoded.)
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
# class SpDoubanSpider(CrawlSpider):
#     ...
#     rules = [Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*')),
#                   callback='parse_item', follow=True)
#              ]
#     def parse_item(self, response):
#         # build the item here
#         yield item
######------- End of the CrawlSpider / LinkExtractor alternative -------#####