The following 50 code examples, extracted from open-source Python projects, demonstrate how to use the scrapy.http module.
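Before the extracted examples, here is a minimal sketch of the two request classes most of these snippets rely on, scrapy.http.Request and scrapy.http.FormRequest. The spider name, URLs, and form fields below are placeholders chosen for illustration; they do not come from any of the projects quoted here.

import scrapy
from scrapy.http import Request, FormRequest


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider: name and URLs are placeholders, not from the examples below.
    name = 'example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Request issues a plain GET; meta carries data through to the callback.
        yield Request(url=response.urljoin('/page/2/'),
                      meta={'referer': response.url},
                      callback=self.parse_page)
        # FormRequest URL-encodes formdata and sends it as a POST body.
        yield FormRequest(url='http://quotes.toscrape.com/login',  # placeholder endpoint
                          formdata={'username': 'user', 'password': 'pass'},
                          callback=self.after_login)

    def parse_page(self, response):
        self.logger.info('fetched %s (from %s)', response.url, response.meta['referer'])

    def after_login(self, response):
        self.logger.info('login response status: %s', response.status)
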
def login(self, response):
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    req = Request(
        url='http://dig.chouti.com/login',
        method='POST',
        headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
        body='phone=13331167937&password=zds819918&oneMonth=1',
        cookies=self.cookie_dict,
        callback=self.check_login
    )
    yield req

def parse(self, response):
    selector = Selector(response)
    articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
    for article in articles:
        item = Jianshu2Item()
        url = article.xpath('div/h4/a/@href').extract()
        likeNum = article.xpath('div/div/span[2]/text()').extract()
        posturl = 'http://www.jianshu.com' + url[0]
        if len(likeNum) == 0:
            item['likeNum'] = 0
        else:
            item['likeNum'] = int(likeNum[0].split(' ')[-1])
        request = Request(posturl, callback=self.parse_donate)
        request.meta['item'] = item
        yield request
    next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract()[0]
    if next_link:
        next_link = self.url + str(next_link)
        yield Request(next_link, callback=self.parse)

def post_login(self, response):
    self.logger.info('---- login start ----')
    # Pull the formhash value out of the login page; it must be submitted along with the form.
    formhash = response.xpath('//input[@name="formhash"]/@value').extract()[0]
    self.logger.info('formhash: ' + formhash)
    # FormRequest.from_response is a Scrapy helper that fills in and POSTs the form.
    # On success, after_login is invoked as the callback.
    return [scrapy.FormRequest.from_response(response,
                                             formdata={
                                                 'formhash': formhash,
                                                 'referer': 'http://www.mayattt.com/index.php',
                                                 'loginfield': 'username',
                                                 'username': 'mayajeiker',
                                                 'password': 'friendship',
                                                 'questionid': '0',
                                                 'cookietime': '12592000',
                                             },
                                             callback=self.after_login
                                             )]

def parse(self, response):
    # Walk the thread rows of the list page and request each detail page.
    items = response.xpath('//form[@name="moderate"]/*/div[@class="spaceborder"]/table/tr')
    for item in items:
        url_str = 'http://www.mayattt.com/' + item.xpath('./td[@class="f_title"]/a/@href').extract()[0]
        title_str = ''
        date_str = ''
        try:
            title_str = item.xpath('./td[@class="f_title"]/a/text()').extract()[0]
            date_str = item.xpath('./td[@class="f_last"]/span/a/text()').extract()[0]
        except:
            self.logger.error('get list page failure!')
        yield Request(url_str, headers=self.headers, callback=self.parseImage,
                      meta={'title': title_str, 'date': date_str})

def parse(self, response):
    se = Selector(response)  # build a Selector for the response (replaces the old HtmlXPathSelector)
    # Only parse pages whose URL matches the wallpaper list pattern.
    if re.match("http://desk.zol.com.cn/fengjing/\d+x\d+/\d+.html", response.url):
        src = se.xpath("//ul[@class='pic-list2 clearfix']/li")  # every <li> in the thumbnail list
        for i in range(len(src)):
            imgURLs = se.xpath("//ul[@class='pic-list2 clearfix']/li[%d]/a/img/@src" % i).extract()
            titles = se.xpath("//ul[@class='pic-list2 clearfix']/li[%d]/a/img/@title" % i).extract()
            if imgURLs:
                realUrl = imgURLs[0].replace("t_s208x130c5", "t_s2560x1600c5")  # swap thumbnail size for full resolution
                file_name = u"%s.jpg" % titles[0]
                path = os.path.join("D:\\pics", file_name)  # local save path under D:\pics
                type = sys.getfilesystemencoding()
                print file_name.encode(type)
                item = WebcrawlerScrapyItem()  # fill the item with the image name and URL
                item['name'] = file_name
                item['url'] = realUrl
                print item["name"], item["url"]
                yield item  # hand the item to the pipeline
                urllib.urlretrieve(realUrl, path)  # download the full-size image to the local path
    all_urls = se.xpath("//a/@href").extract()  # collect every link on the page
    for url in all_urls:
        if url.startswith("/fengjing/1920x1080/"):  # follow only the 1920x1080 landscape list pages
            yield Request("http://desk.zol.com.cn" + url, callback=self.parse)

def parse_detail(self, response):
    content = response.css('#work span::text').extract()
    reg = "^(http|https|ftp)://.*(.com|.cn|.html|.htm|.asp|.jsp)"
    url = response.url
    reg_url_name = ".*?(\d+)"
    get_url = re.match(reg_url_name, url)
    if get_url:
        self.get_name = get_url.group(1)
    reference_url_list = []
    for each_line in content:
        get_reference_url = re.match(reg, each_line)
        if get_reference_url:
            reference_url_list.append(get_reference_url.group(0))
    self.count = 0
    if reference_url_list:
        for each_url in reference_url_list:
            yield Request(url=each_url, dont_filter=True, callback=self.parse_reference)
            self.count += 1

def extract_links(self, response):
    """Generate (url, source_anchor) tuples extracted from the page"""
    for link in response.css('a'):
        # extract the href & urljoin it to the current response
        url = response.urljoin(link.xpath('@href').extract_first())
        # Only follow http(s) URLs (i.e., no `javascript:` or `mailto:`).
        if url.startswith('http'):
            # merge text content of all child nodes of the link
            anchor = " ".join(s.strip() for s in link.css('*::text').extract() if s.strip())
            yield (url, anchor)
    for frame in (response.css("frame") + response.css("iframe")):
        relative_url = frame.css("::attr(src)").extract_first()
        url = response.urljoin(relative_url)
        if url.startswith("http"):
            anchor = frame.css("::attr(name)").extract_first()
            yield (url, anchor)

def post_get_playlist(self, response):
    collection = self.db.playlist
    result = json.loads(response.body, encoding='utf-8')['result']
    # inserted = collection.update({'id': result['id']}, result, upsert=True)  # upsert=True means insert-or-update
    # logger.info('Update or Insert to playlist database[%s]' % (str(inserted),))
    if result['id'] not in self.playlist_id_buffer:
        collection.insert(result)
    for song in result['tracks']:
        artists = []
        for detail in song['artists']:
            artists.append(detail['name'])
        comment_url = 'http://music.163.com/weapi/v1/resource/comments/%s/?csrf_token=' % (song['commentThreadId'],)
        # Use FormRequest to POST the form data; the raw equivalent would be
        # Request(url, method='POST', body=json.dumps(data))
        yield FormRequest(comment_url, formdata=self.post_data, callback=self.parse,
                          meta={'m_id': song['id'], 'm_name': song['name'], 'artists': artists})

def parse_list(self, response):
    url = response.meta['splash']['args']['url']
    pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')
    if pattern.match(url):
        page = int(pattern.split(url)[1])
        url = pattern.findall(url)[0]
        page += 1
        url = url + str(page)
    else:
        url = url + '/2'
    print '+++++++++++++++++++++++++ Next url:', url
    req = SplashRequest(url=url, callback=self.parse_list)
    yield req
    pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
    for item_url in pattern_detail.findall(response.body):
        req = Request(url=item_url, callback=self.parse_item)
        yield req

def parse(self, response):
    # print '=========================', response.url
    pattern_list = re.compile(r'http://www.mogujie.com/book/\w+/\d+')
    # print '+++++++++++++++++++++++++', pattern_list.findall(response.body)
    '''
    for item_list in pattern_list.findall(response.body):
        req = Request(url = item_list, callback = self.parse_list)
        yield req
    '''
    '''
    req = Request(url = 'http://www.mogujie.com/book/clothing/50249/',
                  callback = self.parse_list,
                  meta={
                      'splash': {
                          'endpoint': 'render.html'
                      },
                      #'dont_send_headers': True,
                  })
    '''
    for item_list in pattern_list.findall(response.body):
        # req = SplashRequest(url = 'http://www.mogujie.com/book/clothing/50249/', callback = self.parse_list)
        req = SplashRequest(url=item_list, callback=self.parse_list)
        yield req

def __init__(self, **kw):
    super(FollowAllSpider, self).__init__(**kw)
    url = 'http://localhost/books.toscrape.com/index.html'
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    self.url = url
    self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()

def start_requests(self):
    #####################################################################################
    # topic_dict = {'1':[u'??', u'??'], '2':[u'??',u'??']}
    topic_dict = {'1': [u'??'], '2': [u'??'], '3': [u'????'], '4': [u'??']}
    index = 0
    for id, kws_list in topic_dict.iteritems():
        for kw in kws_list:
            print kw
            wd_code = urllib.quote(kw.encode('gbk'))
            search_url = 'http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw=' + wd_code + '&un=&rn=10&pn=0&sd=&ed=&sm=1&only_thread=1'
            # http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw=%B1%B1%BE%A9&un=&rn=10&pn=0&sd=&ed=&sm=1&only_thread=1
            # http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw=%B1%B1%BE%A9&un=&rn=10&pn=0&sd=&ed=&sm=1
            # print search_url
            self.Flag_List.append(True)
            self.Maxpage_List.append(self.MAX_PAGE_NUM)
            print search_url
            yield scrapy.Request(search_url, meta={'topic_id': id, 'index': index, 'kw': kw})
            index += 1
    #####################################################################################

def parse(self, response):
    # print response.request.headers
    # print u'~~~~', ("pp3288" in response.body)
    # print u'~~~~', unicode(response.body, "utf8").encode("utf8")
    # Walk the album list on the page and hand each album off to parse_album.
    for box in response.xpath(self.config["xpathAlbumList"]):
        url = box.xpath(self.config["xpathAlbumURL"]).extract()[0]
        title = box.xpath(self.config["xpathAlbumTitle"]).extract()[0]
        if not self.config.has_key("specificAlbums") or url in self.config["specificAlbums"]:
            if not url.startswith("http") and self.config.has_key("baseAddress"):
                url = self.config["baseAddress"] + url
            # print title, url
            request = scrapy.Request(url, headers=self.headers, callback=self.parse_album,
                                     cookies={'title': title})
            yield request
            # break  # TODO: handle pagination of the album list in parse_album_list

def get_url(self, level, key):
    base_url = 'http://gaokao.chsi.com.cn/zyk/zybk/'
    if level == 0:
        page = 'ccCategory.action'
    elif level == 1:
        page = 'mlCategory.action'
    elif level == 2:
        page = 'xkCategory.action'
    elif level == 3:
        page = 'specialityesByCategory.action'
    else:
        raise Exception('invalid level')
    return '{}{}?key={}'.format(base_url, page, key)

def start_requests(self):
    # with open(getattr(self, "file", "company.csv"), "rU") as f:
    #     reader = csv.reader(f)
    #     for line in reader:
    #         request = Request('http://www.qichacha.com/search?key=' + line[0].decode('gbk').encode('utf-8'), headers=self.headers)
    #         # request.meta['fields'] = line
    #         yield request
    with open("company.csv", "rU") as f:
        reader = csv.reader(f)
        for line in reader:
            request = Request('http://www.qichacha.com/search?key=' + line[0], headers=self.headers)
            # request.meta['fields'] = line
            yield request

# def start_requests(self):
#     yield Request('http://www.qichacha.com/search?key=%E5%89%8D%E6%B5%B7%E4%BA%BA%E5%AF%BF%E4%BF%9D%E9%99%A9%E8%82%A1%E4%BB%BD%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8', headers=self.headers)

def parse(self, response):
    item = AutopjtItem()
    # Extract the fields with XPath.
    item["name"] = response.xpath("//a[@class='pic']/@title").extract()
    item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comnum"] = response.xpath("//a[@name='P_pl']/text()").extract()
    # Hand the item to the pipeline.
    yield item
    # Crawl the remaining 75 list pages of the category.
    for i in range(1, 76):
        # Build the URL of each page and request it with this method as the callback.
        url = "http://category.dangdang.com/pg" + str(i) + "-cid4002203.html"
        yield Request(url, callback=self.parse)

def test_nosplash():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()
    req = scrapy.Request("http://example.com")
    old_meta = copy.deepcopy(req.meta)

    assert cookie_mw.process_request(req, None) is None
    assert mw.process_request(req, None) is None
    assert old_meta == req.meta

    # response is not changed
    response = Response("http://example.com", request=req)
    response2 = mw.process_response(req, response, None)
    response3 = cookie_mw.process_response(req, response, None)
    assert response2 is response
    assert response3 is response
    assert response3.url == "http://example.com"

def test_magic_response2():
    # check 'body' handling and another 'headers' format
    mw = _get_mw()
    req = SplashRequest('http://example.com/', magic_response=True,
                        headers={'foo': 'bar'}, dont_send_headers=True)
    req = mw.process_request(req, None)
    assert 'headers' not in req.meta['splash']['args']

    resp_data = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    assert resp2.data == resp_data
    assert resp2.body == b'binary data'
    assert resp2.headers == {b'Content-Type': [b'text/plain']}
    assert resp2.status == 200
    assert resp2.url == "http://example.com/"

def test_magic_response_http_error():
    mw = _get_mw()
    req = SplashRequest('http://example.com/foo')
    req = mw.process_request(req, None)

    resp_data = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp = mw.process_response(req, resp, None)
    assert resp.data == resp_data
    assert resp.status == 404
    assert resp.url == "http://example.com/foo"

def test_slot_policy_per_domain():
    mw = _get_mw()
    meta = {'splash': {
        'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN
    }}

    req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
    req1 = mw.process_request(req1, None)
    req2 = scrapy.Request("http://example.com/path2", meta=meta)
    req2 = mw.process_request(req2, None)
    req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
    req3 = mw.process_request(req3, None)

    assert req1.meta.get('download_slot')
    assert req3.meta.get('download_slot')
    assert req1.meta['download_slot'] == req2.meta['download_slot']
    assert req1.meta['download_slot'] != req3.meta['download_slot']

def test_adjust_timeout():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {'args': {'timeout': 60, 'html': 1}},
        # download_timeout is always present,
        # it is set by DownloadTimeoutMiddleware
        'download_timeout': 30,
    })
    req1 = mw.process_request(req1, None)
    assert req1.meta['download_timeout'] > 60

    req2 = scrapy.Request("http://example.com", meta={
        'splash': {'args': {'html': 1}},
        'download_timeout': 30,
    })
    req2 = mw.process_request(req2, None)
    assert req2.meta['download_timeout'] == 30

def parse(self, response):
    selector = Selector(response)
    ID = response.meta["ID"]
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    info = InfoItem()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # number of weibo posts
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # number of followed users
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # number of fans
        if num_tweets:
            info["num_tweets"] = int(num_tweets[0])
        if num_follows:
            info["num_follows"] = int(num_follows[0])
        if num_fans:
            info["num_fans"] = int(num_fans[0])
    url_information1 = "http://weibo.cn/%s/info" % ID
    yield Request(url=url_information1, meta={"item": info, "ID": ID},
                  dont_filter=True, callback=self.parse1)

def parse3(self, response):
    """Extract user IDs from the links on this page and pick the next one to crawl."""
    selector = Selector(response)
    text2 = selector.xpath('body//table/tr/td/a/@href').extract()
    next_urls = []
    for elem in text2:
        elem = re.findall('uid=(\d+)', elem)
        if elem:
            next_urls.append(int(elem[0]))
    self.next_ID.pop()
    self.next_ID.append(random.choice(next_urls))
    self.temp = next_urls[0]
    try:
        next_url = "http://weibo.cn/u/%s" % self.next_ID[-1]
        yield Request(url=next_url, dont_filter=True, callback=self.parse)
    except:
        self.next_ID.pop()
        self.next_ID.append(self.temp)
        next_url = "http://weibo.cn/u/%s" % self.temp
        yield Request(url=next_url, dont_filter=True, callback=self.parse)

def parse3_fans(self, response):
    """Collect user IDs from the fan-list pages and follow the pagination."""
    selector = Selector(response)
    text2 = selector.xpath('body//table/tr/td/a/@href').extract()
    url_main = response.meta["url_main"]
    ID_ = response.meta["ID"]
    for elem in text2:
        elem = re.findall('uid=(\d+)', elem)
        if elem:
            ID = int(elem[0])
            if ID not in self.friends_id:  # record each ID only once
                self.friends_id.add(ID)
    url_next = selector.xpath(
        u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
    if url_next:
        yield Request(url="http://weibo.cn%s" % url_next[0],
                      meta={"url_main": url_main, "ID": ID_}, callback=self.parse3_fans)
    else:
        self.fans_finish = True
        if self.fans_finish and self.follows_finish:
            yield Request(url=url_main, meta={"ID": ID_}, dont_filter=True, callback=self.parse)

def parse3_follows(self, response):
    """Collect user IDs from the follow-list pages and follow the pagination."""
    selector = Selector(response)
    text2 = selector.xpath('body//table/tr/td/a/@href').extract()
    url_main = response.meta["url_main"]
    ID_ = response.meta["ID"]
    for elem in text2:
        elem = re.findall('uid=(\d+)', elem)
        if elem:
            ID = int(elem[0])
            if ID not in self.friends_id:  # record each ID only once
                self.friends_id.add(ID)
    url_next = selector.xpath(
        u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
    if url_next:
        yield Request(url="http://weibo.cn%s" % url_next[0],
                      meta={"url_main": url_main, "ID": ID_}, callback=self.parse3_follows)
    else:
        self.follows_finish = True
        if self.fans_finish and self.follows_finish:
            yield Request(url=url_main, meta={"ID": ID_}, dont_filter=True, callback=self.parse)

def parse(self, response):
    hxs = scrapy.Selector(response)
    slots_tutorials = hxs.xpath('//td[@class="slot slot-tutorial"]')
    for slot in slots_tutorials:
        speakers_tutorials = slot.xpath('//span[@class="speaker"]/text()').extract()
        urls_tutorials = slot.xpath('//span[@class="title"]//@href').extract()
        talks_tutorials = slot.xpath('//span[@class="title"]//a/text()').extract()
        indexSpeaker = 0
        for speaker in speakers_tutorials:
            yield Request(url=''.join(('http://www.pydata.org', urls_tutorials[indexSpeaker])),
                          callback=self.parse_details,
                          meta={'speaker': speaker.strip(),
                                'url': urls_tutorials[indexSpeaker],
                                'talk': talks_tutorials[indexSpeaker]})
            indexSpeaker = indexSpeaker + 1

def parse(self, response):
    # filename = 'xueshu.html'
    # with open(filename, 'wb') as f:
    #     f.write(response.body)
    for sel in response.xpath('//div[@srcid]'):
        item = XueshuItem()
        for cell in sel.xpath('div[1]'):
            item['title'] = cell.xpath('h3//a//text()').extract()
            item['link'] = cell.xpath('h3/a/@href').extract()
            item['author'] = cell.xpath('div[1]/span[1]//a/text()').extract()
            link = 'http://xueshu.baidu.com' + cell.xpath('h3/a/@href').extract()[0]
            item['publish'] = cell.xpath('div[1]/span[2]/a/@title').extract()
            item['year'] = cell.xpath('div[1]/span[3]/text()').extract()
            item['cite'] = cell.xpath('div[1]/span[4]/a/text()').extract()
            item['abstract'] = self.get_abstract(link)
            # self.log(self.get_abstract(link))
        item['subject'] = sel.xpath('div[2]/div[1]//a/text()').extract()
        yield item

def parse(self, response):
    article_nodes = response.css('#block-content-article .mainer .item a.title')
    for article_node in article_nodes:
        # equivalent to "http://www.acfun.cn" + str(article_node.css("::attr(href)").extract_first(""))
        article_url = urlparse.urljoin(response.url, str(article_node.css("::attr(href)").extract_first("")))
        yield Request(url=article_url, callback=self.parse_detail, dont_filter=True)

    next_nodes = response.css(".pager")
    next_node = next_nodes[len(next_nodes) - 1]
    next_url = str(next_node.css("::attr(href)").extract_first(""))
    if next_url:
        next_url = urlparse.urljoin(response.url, next_url)
        yield Request(url=next_url, callback=self.parse, dont_filter=True)

def check_login(self, response):
    req = Request(
        url='http://dig.chouti.com/',
        method='GET',
        callback=self.show,
        cookies=self.cookie_dict,
        dont_filter=True
    )
    yield req

def show(self, response):
    # print(response)
    hxs = HtmlXPathSelector(response)
    news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
    for new in news_list:
        # temp = new.xpath('div/div[@class="part2"]/@share-linkid').extract()
        link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
        yield Request(
            url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
            method='POST',
            cookies=self.cookie_dict,
            callback=self.do_favor
        )
    page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
    for page in page_list:
        page_url = 'http://dig.chouti.com%s' % page
        import hashlib
        hash = hashlib.md5()
        hash.update(bytes(page_url, encoding='utf-8'))
        key = hash.hexdigest()
        if key in self.has_request_set:
            pass
        else:
            self.has_request_set[key] = page_url
            yield Request(
                url=page_url,
                method='GET',
                callback=self.show
            )

def parse_page(self, response):
    item = BroadItem()
    soup = BeautifulSoup(response.text, "lxml")
    title = response.xpath('//title/text()').extract()
    if len(title) > 0:
        item['title'] = ''.join(title[0].replace('|', ',').
                                replace('\"', '').replace('\'', '').
                                replace('(', '[').replace(')', ']').
                                replace('#', '').split())
    else:
        item['title'] = ''
    print item['title']
    print response.url
    item['url'] = response.url
    item['date'] = obtain_d(response)
    print item['date']
    divs = soup.findAll('div')
    div_dic = {}
    for div in divs:
        ps = div.findAll('p')
        div_dic[len(ps)] = div
    if len(div_dic) == 0:
        item['content'] = "none"
    else:
        div_dic = sorted(div_dic.iteritems(), key=lambda d: d[0], reverse=True)
        ps = div_dic[0][1].findAll('p')
        images = div_dic[0][1].findAll('img')
        item['image_urls'] = ''
        for img in images:
            try:
                if 'http' in img['src']:
                    item['image_urls'] += img['src'] + '\n'
            except Exception as e:
                pass
        text = ""
        for p in ps:
            text += p.text
        item['content'] = text.replace('"', '\'\'')
    return item

def parse_item0(self, response):
    provinceUrlList = re.findall(r'<b><a href="(/w/.*?)" title=".*?">.*?</a></b>', response.body)
    for url in provinceUrlList:
        yield Request(url="http://www.a-hospital.com{}".format(url), callback=self.parse_item)

def parse_item(self, response):
    i = HospitalItem()
    # e.g. http://www.a-hospital.com/w/%E5%9B%9B%E5%B7%9D%E7%9C%81%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8
    province = urllib.unquote(response.url[len("http://www.a-hospital.com/w/"):])
    for name, content in re.findall(r'<li><b><a href=".*?" title=".*?">(.*?)</a>.*?</b>[\s\S]*?<ul>([\s\S]*?)</ul>[\s\S]*?</li>', response.body):
        i['hospitalName'] = name.decode('utf-8')
        content = content.decode("utf-8")
        hospitalAddress = re.findall(u"<b>????</b>[:|?](.*?)</li>", content)
        hospitalPhoneNumber = re.findall(u"<b>????</b>[:|?](.*?)</li>", content)
        hospitalLevel = re.findall(u"<b>????</b>[:|?](.*?)</li>", content)
        hospitalType = re.findall(u"<b>????</b>[:|?](.*?)</li>", content)
        hospitalFaxNumber = re.findall(u"<b>????</b>[:|?](.*?)</li>", content)
        hospitalEmail = re.findall(u"<b>????</b>[:|?](.*?)</li>", content)
        hospitalWebsite = re.findall(u'<b>????</b>[:|?]<a href="(.*?)" class="external free" rel="nofollow" target="_blank">.*?</a></li>', content)
        if hospitalAddress:
            i["hospitalAddress"] = hospitalAddress[0]
        if hospitalPhoneNumber:
            i['hospitalPhoneNumber'] = hospitalPhoneNumber[0]
        if hospitalLevel:
            i['hospitalLevel'] = hospitalLevel[0]
        if hospitalType:
            i['hospitalType'] = hospitalType[0]
        if hospitalFaxNumber:
            i['hospitalFaxNumber'] = hospitalFaxNumber[0]
        if hospitalEmail:
            i['hospitalEmail'] = hospitalEmail[0]
        if hospitalWebsite:
            i['hospitalWebsite'] = hospitalWebsite[0]
        i['hospitalProvince'] = province.decode('utf-8')
        yield i

def start_requests(self):
    for i in range(1, 11):
        url = self.base_url + str(i) + '_1' + self.end_Url
        yield Request(url, self.parse)  # first page of each category list
    yield Request('http://www.23us.com/quanben/1', self.parse)  # the "quanben" (completed novels) list

def parse(self, response):
    max_num = BeautifulSoup(response.text, 'lxml').find(
        'div', class_='pagelink').find_all('a')[-1].get_text()
    baseurl = str(response.url)[:27]
    for num in range(1, int(max_num) + 1):
        if baseurl == 'http://www.23us.com/quanben':
            url = baseurl + '/' + str(num)
        else:
            url = baseurl + '_' + str(num) + self.end_Url
        yield Request(url, callback=self.get_name)

def start_requests(self):
    for page_num in range(1, 10, 1):  # iterate over the list pages
        url = 'http://www.ximalaya.com/dq/' + str(page_num) + '/'
        yield Request(url=url, headers=self.headers, callback=self.parse)  # parse each list page

def content_parse(self, response):
    logging.info(response.url)
    # Collect the sound IDs embedded in the page, then request the JSON detail for each one.
    sound_ids = response.xpath('//div[@class="personal_body"]/@sound_ids').extract_first().split(',')
    for i in sound_ids:
        sound_json_url = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
        yield Request(url=sound_json_url, headers=self.headers, callback=self.json_parse)

def start_requests(self):
    for page_num in range(1, 33, 1):  # iterate over the forum list pages
        url = 'http://www.tuzigh.com/forum/299653{id}171299380/6{tid}' + str(page_num) + '0178299/6897{name}.html'
        yield Request(url=url, headers=self.headers, callback=self.parse)  # parse each list page