The following 10 code examples, extracted from open-source Python projects, illustrate how to use scrapy.loader.ItemLoader.
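All of the examples share the same basic pattern: bind an ItemLoader to an item and a response (or selector), queue values with add_value() / add_css() / add_xpath(), then build the item with load_item(). Here is a minimal, self-contained sketch of that pattern; the ProductItem class, the spider, and the selectors are invented for illustration and do not come from any of the projects below.

import scrapy
from scrapy.loader import ItemLoader

class ProductItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()

class ProductSpider(scrapy.Spider):
    name = 'products'
    start_urls = ['http://example.com/products']

    def parse(self, response):
        # Bind the loader to a fresh item and the current response
        loader = ItemLoader(item=ProductItem(), response=response)
        loader.add_css('name', '.product-name::text')               # from a CSS selector
        loader.add_xpath('price', '//span[@class="price"]/text()')  # from an XPath expression
        loader.add_value('url', response.url)                       # a literal value
        # load_item() runs the configured input/output processors
        # and returns the populated item
        yield loader.load_item()

Without custom processors, every field loads as a list of strings, which is why several examples below index into item['url'][0] or item['code'][0].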
Example 1

def parse(self, response):
    for outer in response.css('#comapreTable tr:not(:first-child)'):
        # Rows with a centered cell start a new category; 'comapreTable'
        # is the page's own (misspelled) element id
        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()
        for inner in outer.xpath('td[div[@align="left"]/a]'):
            # Bind the loader to the inner cell instead of the whole response
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)',
                           lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))  # Python 2: unicode built-in
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
Example 2

def parse_single_song(self, response):
    # Retrieve the loader created in parse_song_list() (Example 7) and carried via meta
    loader = response.meta['loader']
    selector = Selector(response)
    singer = selector.xpath('//title/text()').extract()
    loader.add_value('singer', singer)
    loader.add_value('_id', response.meta['song_id'])
    comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
    source_data, source_url = api_song_url(response.meta['song_id'])
    comment_id = generate_comment_index()['comment_index']
    loader.add_value('comment_id', comment_id)
    yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
                             formdata=comment_data, callback=self.parse_comments,
                             meta={'comment_id': comment_id})
    yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
                             formdata=source_data, meta={'loader': loader},
                             callback=self.get_source_url)
Example 3

def get_details(self, response):
    self.log('Starting the second parsing phase')
    loader = ItemLoader(item=LibraryOrFrameworkItem(), response=response)
    # Load the values obtained in the first phase
    loader.add_value('name', response.meta['name'])
    language = response.meta['language']
    loader.add_value('stable_release', response.meta['stable_version'])
    loader.add_value('release_date', response.meta['rel_date'])
    descr = response.xpath('//*[@id="mw-content-text"]/div/p[1] | //*[@id="mw-content-text"]/p[1]').extract_first()
    cleaned_descr = cleanhtml(descr)
    loader.add_value('description', cleaned_descr)
    license_found = False
    for row in response\
            .xpath('//*[@id="mw-content-text"]/div/table[position()<=3]/tr'):
        header = row.xpath('./th/a/text() | ./th/text()').extract_first()
        key, value = self.get_key_value(header, row)
        if key:
            if key == 'license':
                # If we find the license on the main page, we will use it
                license_found = True
            loader.add_value(key, value)
    # If we did not find the license on the main page,
    # we fall back to the license found on the start page
    if not license_found:
        loader.add_value('license', response.meta['license'])
    return {
        "item": loader.load_item(),
        # The language is returned separately in order to manage the many-to-many relation
        "language": language
    }

# Given a couple (key, elem) obtained during the scraping, get_key_value() returns the
# valid couple (key1, value1) to add to the db. If key is not valid, it returns (None, None).
Example 4

def parse(self, response):
    for quote in response.css(".quote"):
        # Bind the loader to each quote block rather than the whole response
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css("text", ".text")
        loader.add_css("by", ".author")  # was ".authoor", a typo that matches nothing
        loader.add_css("tags", ".tag")
        yield loader.load_item()
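The add_css() calls above capture whole elements, markup included, since the selectors carry no ::text pseudo-element, so the cleaning presumably happens in the item's processors. A plausible QuoteItem for this example (an assumption; the actual project's definition may differ):

import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags

class QuoteItem(scrapy.Item):
    # remove_tags strips the markup that add_css() captures
    # when the selector has no ::text pseudo-element
    text = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    by = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(','),
    )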
Example 5

# Python 2 source (ur'' string literals)
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        for e in response.css(u'.catType>a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors
    )
Example 6

def parse_item(self, response):
    loader = ItemLoader(EolZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
    loader.add_css('name', 'h1#pagetitle::text')
    loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
    yield loader.load_item()
Example 7

def parse_song_list(self, response):
    selector = Selector(response)
    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    for index, id_ in enumerate(song_id_list):
        # The loader is created without a response and completed later
        # in parse_single_song() (Example 2)
        l = ItemLoader(item=PlayListItem())
        l.add_value('song_name', song_name_list[index])
        l.add_value('title', title)
        yield scrapy.FormRequest(url=self.BASE_URL + id_,
                                 meta={'song_id': id_[9:], 'loader': l},  # id_[9:] presumably strips a '/song?id=' prefix
                                 method='GET', headers=self.headers,
                                 callback=self.parse_single_song)
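Examples 2 and 7 are two callbacks of one spider: the loader is created here without a response, carried through the request's meta, and only filled in and finished in parse_single_song(). A hypothetical PlayListItem covering the fields the two callbacks fill (the project's real definition may differ):

import scrapy

class PlayListItem(scrapy.Item):
    _id = scrapy.Field()          # song id, added in parse_single_song()
    title = scrapy.Field()        # playlist page title
    song_name = scrapy.Field()
    singer = scrapy.Field()
    comment_id = scrapy.Field()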
Example 8

def parse_detail(self, response):
    # Cover image URL passed along from the list page
    front_image_url = response.meta.get("front_image_url", "")

    # The original manual extraction, kept commented out for comparison:
    # article_item = JobBoleArticleItem()
    # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
    # create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0].strip().replace("·", "").strip()
    # praise_nums = response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract()[0]
    # fav_nums = response.xpath('//span[contains(@class, "bookmark-btn")]/text()').extract()[0]
    # match_re = re.match(r".*?(\d+).*", fav_nums)
    # if match_re:
    #     fav_nums = int(match_re.group(1))
    # else:
    #     fav_nums = 0
    # comments_nums = response.xpath('//a[@href="#article-comment"]/span/text()').extract()[0]
    # match_re = re.match(r".*?(\d+).*", comments_nums)
    # if match_re:
    #     comments_nums = int(match_re.group(1))
    # else:
    #     comments_nums = 0
    # content = response.xpath('//div[@class="entry"]').extract()[0]
    # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    # tag_list = [element for element in tag_list if not element.strip().endswith("??")]
    # tags = ",".join(tag_list)
    # article_item["url_object_id"] = get_md5(response.url)
    # article_item["title"] = title
    # article_item["url"] = response.url
    # try:
    #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
    # except Exception as e:
    #     create_date = datetime.datetime.now().date()
    # article_item["create_date"] = create_date
    # article_item["front_image_url"] = [front_image_url]
    # article_item["praise_nums"] = praise_nums
    # article_item["comments_nums"] = comments_nums
    # article_item["fav_nums"] = fav_nums
    # article_item["tags"] = tags
    # article_item["content"] = content

    # Load the item through an ItemLoader instead
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", '//p[@class="entry-meta-hide-on-mobile"]/text()')
    item_loader.add_xpath("praise_nums", '//span[contains(@class, "vote-post-up")]/h10/text()')
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("fav_nums", '//span[contains(@class, "bookmark-btn")]/text()')
    item_loader.add_xpath("comments_nums", '//a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath("tags", '//p[@class="entry-meta-hide-on-mobile"]/a/text()')
    item_loader.add_xpath("content", '//div[@class="entry"]')

    article_item = item_loader.load_item()
    yield article_item
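ArticleItemLoader above is a project-specific loader subclass. A minimal sketch of the usual way such a class is defined, assuming its only job is to unwrap single values (the actual project may add per-field processors):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class ArticleItemLoader(ItemLoader):
    # Return the first extracted value for every field
    # instead of the default list
    default_output_processor = TakeFirst()

Note that with such a default, list-valued fields like front_image_url typically need a field-level override to stay lists for the images pipeline.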
Example 9

def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhuanyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_css('name', u'.majorTitle>h1::text')
    loader.add_xpath('code', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('degree', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('period', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('courses', u'//div[@class="course"]/h3[.="?????"]/following-sibling::p/text()')

    def parse_related():
        for e in response.xpath(u'//div[@class="course"]/h3[.="?????"]/following-sibling::a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('related', list(parse_related()))

    def parse_category():
        category = []
        for i in [u"????", u"????", u"????"]:
            x = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(i)
            e = response.xpath(x)
            category.append({
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'/zhuanye([-0-9]*)\.html').strip('-'),
                'name': e.css('::text').extract_first(),
            })
        return category

    loader.add_value('category', parse_category())
    loader.add_css('detail', u'.majorCon')
    item = loader.load_item()
    return Request(
        url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
        meta={'item': item},
        callback=self.parse_jiuye
    )
Example 10

def parse_item(self, response):
    loader = ItemLoader(ChsiDaxueItem(), response)
    loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)',
                   MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

    # Collapse runs of whitespace, then strip (Python 2: unicode built-in)
    data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
    loader.add_xpath('type', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('membership', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
    loader.add_xpath('address', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('phone', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('website', u'//span[@class="f_bold" and .="?????"]/following-sibling::a/@href', data_clean)
    loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)

    def parse_votes():
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
        get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
        return {
            'overall': get_vote(u'?????'),
            'environment': get_vote(u'???????'),
            'life': get_vote(u'?????'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        css = u'{}>table tr:not(:first-child)'

        def get_trending(what):
            majors = []
            for e in response.css(css.format(what)):
                majors.append({
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                })
            return majors

        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())
    item = loader.load_item()
    for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]').extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)