The following 3 code examples, extracted from open-source Python projects, illustrate how to use the scrapy.spiders module.
def start_requests(self):
    """Generate the initial requests of ArchiveSpider.

    See 'http://doc.scrapy.org/en/latest/topics/spiders.html#\
    scrapy.spiders.Spider.start_requests'.

    The most important part of this function is setting a request meta,
    'archive_meta', according to the site's 'archive_rules'. The meta is
    used to parse article URLs from the response and to generate the
    next request.
    """
    for page in self.page_templates:
        url = page.format(p_num=self.p_kw['start'])
        meta = dict(archive_meta=dict(
            last_urls=dict(),
            p_num=self.p_kw['start'],
            next_tries=0,
            max_next_tries=self.p_kw['max_next_tries'],
            page=page))
        logger.debug('Page format meta info:\n%s', pprint.pformat(meta))
        yield scrapy.Request(url, callback=self.parse, meta=meta)
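The 'archive_meta' built above is meant to be read back by the parse callback named in the docstring. As a rough illustration only, here is a minimal, hypothetical sketch of such a callback inside a scrapy.spiders.Spider subclass; the XPath, the paging logic, and the parse_article helper are assumptions rather than the original ArchiveSpider code, though the meta keys follow the dict built above.

import scrapy


class ArchivePageSpider(scrapy.Spider):
    """Hypothetical sketch, not the original ArchiveSpider."""
    name = 'archive_page_sketch'

    def parse(self, response):
        # Read back the 'archive_meta' attached by start_requests above.
        meta = response.meta['archive_meta']

        # Extract article URLs from the listing page (the XPath is an assumption).
        for href in response.xpath('//article//a/@href').extract():
            yield response.follow(href, callback=self.parse_article)

        # Keep paging until max_next_tries is exhausted.
        if meta['next_tries'] < meta['max_next_tries']:
            next_meta = dict(meta, p_num=meta['p_num'] + 1,
                             next_tries=meta['next_tries'] + 1)
            yield scrapy.Request(next_meta['page'].format(p_num=next_meta['p_num']),
                                 callback=self.parse,
                                 meta=dict(archive_meta=next_meta))

    def parse_article(self, response):
        # Placeholder article parser.
        yield {'url': response.url,
               'title': response.xpath('//title/text()').extract_first()}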
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for FeedSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of feed URLs of the site.
    provider : string
        The provider of the RSS feed.
    url_regex : string
        URL pattern regular expression.

    If you use this spider to store items into a database, additional
    keywords are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of a SQLAlchemy session.

    Other keywords are used to specify how to parse the XML, see
    http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders\
    .XMLFeedSpider.
    """
    self.platform_id = kwargs.pop('platform_id', None)
    self.session = kwargs.pop('session', None)
    self.url_regex = kwargs.pop('url_regex', None)
    self.provider = kwargs.pop('provider', 'self')
    self.iterator = kwargs.pop('iterator', 'iternodes')
    self.itertag = kwargs.pop('itertag', 'item')
    self.allowed_domains = domains
    self.start_urls = urls
    super(FeedSpider, self).__init__(*args, **kwargs)
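For completeness, a hedged usage sketch: assuming FeedSpider subclasses scrapy.spiders.XMLFeedSpider and is importable from the project, it could be run with CrawlerProcess, which forwards keyword arguments to the constructor above. The import path, domain, feed URL, and regex below are made-up values.

from scrapy.crawler import CrawlerProcess

from myproject.spiders import FeedSpider  # assumed import path

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(
    FeedSpider,
    domains=['example.com'],               # assumed domain
    urls=['http://example.com/feed.xml'],  # assumed feed URL
    provider='example',
    url_regex=r'https?://example\.com/\d+',
    itertag='item',
)
process.start()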
def parse(self, response):
    item = DoubanTopMoviesItem()
    item['title_ch'] = response.xpath('//div[@class="hd"]//span[@class="title"][1]/text()').extract()
    # Each entry also carries an English title span and an "other" titles span;
    # extracting them (and the detail line) is left commented out:
    # en_list = response.xpath('//div[@class="hd"]//span[@class="title"][2]/text()').extract()
    # item['title_en'] = [en.replace('\xa0/\xa0', '').replace(' ', '') for en in en_list]
    # ht_list = response.xpath('//div[@class="hd"]//span[@class="other"]/text()').extract()
    # item['title_ht'] = [ht.replace('\xa0/\xa0', '').replace(' ', '') for ht in ht_list]
    # detail_list = response.xpath('//div[@class="bd"]/p[1]/text()').extract()
    # item['detail'] = [detail.replace(' ', '').replace('\xa0', '').replace('\n', '') for detail in detail_list]
    # Not every entry has a quote, so this field is commented out as well:
    # item['quote'] = response.xpath('//span[@class="inq"]/text()').extract()
    item['rating_num'] = response.xpath('//div[@class="star"]/span[2]/text()').extract()
    # The rating-count text mixes the number with other characters; pull out the digits with a regex.
    count_list = response.xpath('//div[@class="star"]/span[4]/text()').extract()
    item['rating_count'] = [re.findall(r'\d+', count)[0] for count in count_list]
    item['image_urls'] = response.xpath('//div[@class="pic"]/a/img/@src').extract()
    item['topid'] = response.xpath('//div[@class="pic"]/em/text()').extract()
    yield item
    # Pagination via the rel="next" link (commented-out alternative):
    # new_url = response.xpath('//link[@rel="next"]/@href').extract_first()
    # if new_url:
    #     next_url = self.base_url + new_url
    #     yield scrapy.Request(next_url, callback=self.parse)

######------- Alternative: let a CrawlSpider follow paging links via LinkExtractor instead of listing start_urls --------#####
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
# class SpDoubanSpider(CrawlSpider):
#     ...  # other attributes as in the spider above
#     # Match every top-250 paging link, follow it, and parse each page with parse_item.
#     rules = [Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*')),
#                   callback='parse_item', follow=True)]
#     def parse_item(self, response):
#         # populate the item as in parse() above
#         yield item
######------- Alternative: let a CrawlSpider follow paging links via LinkExtractor instead of listing start_urls --------#####
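The commented-out block above only hints at the CrawlSpider/LinkExtractor variant. Below is a fuller, hedged sketch of that idea; the rule pattern and the title/rating XPaths echo the code above, while the per-movie list XPath, the spider name, and the plain-dict items are assumptions rather than the original project's code.

import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class SpDoubanCrawlSpider(CrawlSpider):
    """Hypothetical CrawlSpider variant of the Douban Top 250 spider above."""
    name = 'sp_douban_crawl'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    # Follow every paging link (?start=25, ?start=50, ...) and parse each page.
    rules = [
        Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*',)),
             callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        # Plain dicts instead of DoubanTopMoviesItem keep the sketch self-contained.
        for movie in response.xpath('//ol[@class="grid_view"]/li'):
            counts = movie.xpath('.//div[@class="star"]/span[4]/text()').extract_first(default='')
            yield {
                'title_ch': movie.xpath('.//span[@class="title"][1]/text()').extract_first(),
                'rating_num': movie.xpath('.//div[@class="star"]/span[2]/text()').extract_first(),
                'rating_count': (re.findall(r'\d+', counts) or [None])[0],
                'topid': movie.xpath('.//div[@class="pic"]/em/text()').extract_first(),
            }

One design note: CrawlSpider applies the rule callback only to pages reached through extracted links, so the first page in start_urls is used for link extraction unless parse_start_url is also overridden.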