The following code examples, extracted from open-source Python projects, illustrate how to use scrapy.Selector().
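Before the extracted examples, here is a minimal, self-contained sketch of the two common ways the snippets below build a Selector: from a raw HTML string via the text= keyword, or (inside a spider callback) directly from a Response. The markup and variable names here are invented purely for illustration.

from scrapy.selector import Selector

# Hypothetical markup, used only to demonstrate the API.
html = '<ul><li class="item">first</li><li class="item">second</li></ul>'
sel = Selector(text=html)

# XPath and CSS queries both return a SelectorList; extract() / extract_first()
# (the older spellings of getall() / get()) pull out the matched strings.
items = sel.xpath('//li[@class="item"]/text()').extract()   # ['first', 'second']
first = sel.css('li.item::text').extract_first()             # 'first'
print(items, first)

In a spider callback the same object is usually built as Selector(response), or obtained implicitly through response.xpath() / response.css(), which is what most of the examples below do.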
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tr[@class="cells"]').extract()
    for i, info in enumerate(infos):
        val = Selector(text = info)
        ip = val.xpath('//td[2]/text()').extract_first()
        port = val.xpath('//td[3]/text()').extract_first()
        country = val.xpath('//td[5]/text()').extract_first()
        anonymity = val.xpath('//td[4]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip = ip,
            port = port,
            country = country,
            anonymity = anonymity,
            source = self.name,
        )

        self.add_proxy(proxy = proxy)
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//ul[@class="l2"]').extract()
    for i, info in enumerate(infos):
        val = Selector(text = info)
        ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
        port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
        anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
        https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
        country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip = ip,
            port = port,
            country = country,
            anonymity = anonymity,
            source = self.name,
        )

        self.add_proxy(proxy = proxy)
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue
        val = Selector(text = info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[3]/div/text()').extract_first()
        anonymity = val.xpath('//td[6]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip = ip,
            port = port,
            country = country,
            anonymity = anonymity,
            source = self.name,
        )

        self.add_proxy(proxy = proxy)
def parse(self, response): sel=scrapy.Selector(response) links_in_a_page = sel.xpath('//a[@href]') for link_sel in links_in_a_page: item=OschinaItem() link=str(link_sel.re('href="(.*?)"')[0]) if link: if not link.startswith('http'): link=response.url+link yield scrapy.Request(link,callback=self.parse) item['link']=link link_text=link_sel.xpath('text()').extract() if link_text: item['link_text']=str(link_text[0].encode('utf-8').strip()) else: item['link_text']=None yield item
def parse_user_0(self, response):
    """Parse the user's overview page: tweet, follow and fan counts."""
    user_item = UserItem()
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # number of tweets
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # number of follows
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # number of fans
        if num_tweets:
            user_item["ctweets"] = int(num_tweets[0])
        if num_follows:
            user_item["cfollows"] = int(num_follows[0])
        if num_fans:
            user_item["cfans"] = int(num_fans[0])
    user_item["_id"] = response.meta["user_id"]
    url_information1 = "http://weibo.cn/%s/info" % response.meta["user_id"]
    yield Request(url=url_information1, meta={"item": user_item}, callback=self.parse_user_1)
def parse_user_1(self, response):
    """Parse the user's profile/info page (stage 2)."""
    user_item = response.meta["item"]
    selector = Selector(response)
    text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())  # join all text() nodes
    nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)              # nickname
    intro = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)                 # introduction
    auth = re.findall(u'\u8ba4\u8bc1[:|\uff1a](.*?);', text1)                  # verification
    gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)                # gender
    place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)                 # region
    birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)              # birthday
    sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # sexual orientation
    marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # relationship status
    url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)             # personal URL
    if nickname:
        user_item["nickname"] = nickname[0]
    if auth:
        user_item["auth"] = auth[0]
    if intro:
        user_item["intro"] = intro[0]
    user_item['t'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    yield user_item
def get_xicidaili():
    url = "http://www.xicidaili.com/nn/%s"
    for i in range(1, 2):
        page_url = url % str(i)
        print(page_url)
        s = requests.session()
        req = s.get(page_url, headers=headers)
        selector = Selector(text=req.text)
        ip_nodes = selector.xpath("//table//tr")
        for each in ip_nodes[1:]:
            ip = each.xpath("./td[2]/text()").extract()[0]
            port = each.xpath("./td[3]/text()").extract()[0]
            http_type = each.xpath("./td[6]/text()").extract()[0]
            if http_type == "HTTP":
                proxies = {
                    "http": "%s://%s:%s" % ("http", ip, port),
                    "https": "%s://%s:%s" % ("http", ip, port),
                }
                try:
                    r = requests.get('http://www.ip138.com/', proxies=proxies, timeout=5)
                    if r.status_code == 200:
                        print("%s:%s is valid" % (ip, port))
                except:
                    print("%s:%s is not valid" % (ip, port))
def parse(self, response):
    sel = scrapy.Selector(response)
    article_info = sel.xpath("//a")
    for info in article_info:
        item = GovcrawlItem()
        link = info.xpath('@href').extract()
        if not link:
            continue
        position = link[0].find("/")
        if position < 0 or "?" not in link[0]:
            continue
        elif "http" not in link[0]:
            url = response.url + link[0][position:]
        else:
            url = link[0]
        yield scrapy.Request(url, callback=self.parse)
        item['link'] = url
        title = info.xpath('text()').extract()
        if title:
            item['title'] = title[0]
        else:
            item['title'] = None
        # print item['link']
        yield item
def parse_page(self, response):
    next_page = response.meta.get('page') + 1
    json_data = json.loads(response.text)
    if json_data.get('type') != 'success':
        return
    articles = scrapy.Selector(text=json_data.get('html')).css('article')
    for article in articles:
        yield {
            'author': article.css('div.author-meta a ::text').extract_first(),
            'date': article.css('div.clock-meta a ::text').extract_first(),
            'title': article.css('h1.entry-title ::text').extract_first()
        }
    yield scrapy.FormRequest(
        self.scrolling_url,
        formdata={'action': 'infinite_scroll', 'page': str(next_page), 'order': 'DESC'},
        callback=self.parse_page,
        meta={'page': next_page}
    )
def parse_item(self, response):
    item = CrawlmeizituItem()
    selector = scrapy.Selector(response)

    image_title = selector.xpath('//h2/a/text()').extract()
    image_url = selector.xpath('//h2/a/@href').extract()
    image_tags = selector.xpath('//div[@class="metaRight"]/p/text()').extract()
    if selector.xpath('//*[@id="picture"]/p/img/@src').extract():
        image_src = selector.xpath('//*[@id="picture"]/p/img/@src').extract()
    else:
        image_src = selector.xpath('//*[@id="maincontent"]/div/p/img/@src').extract()
    if selector.xpath('//*[@id="picture"]/p/img/@alt').extract():
        pic_name = selector.xpath('//*[@id="picture"]/p/img/@alt').extract()
    else:
        pic_name = selector.xpath('//*[@id="maincontent"]/div/p/img/@alt').extract()

    item['title'] = image_title
    item['url'] = image_url
    item['tags'] = image_tags
    item['src'] = image_src
    item['alt'] = pic_name
    print(item)
    time.sleep(1)
    yield item
def parse(self, response):
    sel = Selector(response)
    keys = sel.xpath('//*[@class="menu_main job_hopping"]/h2/text()').extract()
    i = 1
    item = defaultdict(list)
    for key in keys:
        if key.strip() != '':
            print "test"
            print key.strip()
            try:
                print i
                item[key.strip()].append(sel.xpath('//*[@class="menu_box"][{}]/div[2]/dl/dd/a/text()'.format(i)).extract())
                i = i + 1
                # item["key"].append(key)
            except Exception, e:
                print e
        else:
            continue
    yield item
def fas_browse_suppliers_using_every_sector_filter(
        context: Context, actor_alias: str):
    actor = context.get_actor(actor_alias)
    session = actor.session
    response = fas_ui_find_supplier.go_to(session, term="")
    context.response = response

    sector_filters_selector = "#id_sectors input::attr(value)"
    content = response.content.decode("utf-8")
    sector_filters = Selector(text=content).css(sector_filters_selector).extract()
    results = {}
    for sector in sector_filters:
        logging.debug(
            "%s will browse Suppliers by Industry sector filter '%s'",
            actor_alias, sector
        )
        response = fas_ui_find_supplier.go_to(session, sectors=[sector])
        results[sector] = {
            "url": response.request.url,
            "sectors": [sector],
            "response": response
        }
    context.results = results
def fas_browse_suppliers_by_invalid_sectors(
        context: Context, actor_alias: str):
    actor = context.get_actor(actor_alias)
    session = actor.session
    response = fas_ui_find_supplier.go_to(session, term="")
    context.response = response

    sector_selector = "#id_sectors input::attr(value)"
    content = response.content.decode("utf-8")
    filters = Selector(text=content).css(sector_selector).extract()

    sectors = list(set(choice(filters) for _ in range(randrange(1, len(filters)))))
    sectors.append("this_is_an_invalid_sector_filter")
    logging.debug(
        "%s will browse Suppliers by multiple Industry sector filters and will"
        " inject an invalid filter: '%s'",
        actor_alias, ", ".join(sectors)
    )
    context.response = fas_ui_find_supplier.go_to(session, sectors=sectors)
def fas_should_see_filtered_search_results(context, actor_alias):
    results = context.results
    sector_filters_selector = "#id_sectors input"
    for industry, result in results.items():
        context.response = result["response"]
        content = result["response"].content.decode("utf-8")
        filters = Selector(text=content).css(sector_filters_selector).extract()
        for fil in filters:
            sector = Selector(text=fil).css("input::attr(value)").extract()[0]
            checked = True if Selector(text=fil).css("input::attr(checked)").extract() else False
            if sector in result["sectors"]:
                with assertion_msg(
                        "Expected search results to be filtered by '%s' sector"
                        " but this filter was not checked!"):
                    assert checked
            else:
                with assertion_msg(
                        "Expected search results to be filtered only by "
                        "following sectors '%s', but they are also filtered "
                        "by '%s'!", ", ".join(result['sectors']), sector):
                    assert not checked
        logging.debug(
            "%s was presented with '%s' industry search results correctly "
            "filtered by following sectors: '%s'",
            actor_alias, industry, ", ".join(result['sectors']))
def fas_should_see_highlighted_search_term(context, actor_alias, search_term):
    response = context.response
    content = response.content.decode("utf-8")
    search_summaries_selector = ".ed-company-search-summary"
    summaries = Selector(text=content).css(search_summaries_selector).extract()
    tag = "em"
    keywords = [surround(keyword, tag) for keyword in search_term.split()]
    founds = []
    for summary in summaries:
        founds += [(keyword in summary) for keyword in keywords]

    with assertion_msg(
            "Expected to see at least 1 search result with highlighted search "
            "term: '%s'".format(", ".join(keywords))):
        assert any(founds)
    logging.debug(
        "{alias} found highlighted search {term}: '{keywords}' {founds} {times}"
        " in {results} search results".format(
            alias=actor_alias,
            term="terms" if len(keywords) > 1 else "term",
            keywords=", ".join(keywords),
            founds=len([f for f in founds if f]),
            times="times" if len([f for f in founds if f]) > 1 else "time",
            results=len(summaries)))
def parse_url_list(self, response):
    sel = scrapy.Selector(response)
    wait_text = sel.xpath("//p[@id='loading']//text()").extract()
    if wait_text:
        # still an anti-crawler "loading" screen, not the real page
        meta = response.meta
        meta['isscreen'] = 1
        # scrapy de-duplicates URLs by default, so re-request the same URL with dont_filter=True
        yield scrapy.Request(response.url, meta=meta, callback=self.parse_validate, dont_filter=True)
    else:
        # the real HTML page: collect the article URLs
        url_list = sel.xpath("//h4[@class='weui_media_title']/@hrefs").extract()
        for li in url_list:
            href = li.strip()
            url = 'http://mp.weixin.qq.com%s' % href
            # print(url)
            yield scrapy.Request(url, meta=self.meta, callback=self.parse_item)
def enrich_wrapper(func):
    """
    An item_loader that still holds a response/selector cannot be pickled, so
    before each enrich_* call a Selector built from the response is attached to
    the item_loader, and it is detached again after the call.
    :param func:
    :return:
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        item_loader = args[1]
        response = args[2]
        selector = Selector(text=response.text)
        item_loader.selector = selector
        result = func(*args, **kwargs)
        item_loader.selector = None
        return result
    return wrapper
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tr[@class="cells"]').extract()
    for i, info in enumerate(infos):
        self.log(info)
        val = Selector(text = info)
        ip = val.xpath('//td[2]/text()').extract_first()
        port = val.xpath('//td[3]/text()').extract_first()
        country = val.xpath('//td[5]/text()').extract_first()
        anonymity = val.xpath('//td[4]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip = ip,
            port = port,
            country = country,
            anonymity = anonymity,
            source = self.name,
        )

        self.add_proxy(proxy = proxy)
def parse_url_list(self, response):
    sel = scrapy.Selector(response)
    print(sel)
    # first_url_list = sel.xpath('//title[1]//text()').extract()
    # print(first_url_list)
    article_xpath = ".//*[@id='news']/ul/li/div/a[1]/@href"
    article_url_list = sel.xpath(article_xpath).extract()
    for article_url in article_url_list:
        print(article_url)
        yield scrapy.Request(article_url, self.parse_article)
        # yield self.parse_article(url)
        # content = selenium_request(article_url_list)
        # print(content)
def parse_info(self, response):
    selector = scrapy.Selector(response)
    item = WeiboWebInfoItem()
    info = selector.xpath("body/div[@class='u']/div[@class='tip2']")
    info_text = info.extract_first()
    try:
        item['ID'] = re.findall("uid=(.*?)\">", info_text)[0]
        item['TweetsNum'] = re.findall("??\[(.*?)\]</span>", info_text)[0]
        item['FollowerNum'] = re.findall("??\[(.*?)\]</span>", info_text)[0]
        item['FanNum'] = re.findall("??\[(.*?)\]</span>", info_text)[0]
        tweet_url, follower_url = url_generator_for_id(item['ID'])
        item['URL'] = tweet_url
    except:
        pass
    basic_info_url = 'http://weibo.cn/%s/info' % item['ID']
    yield scrapy.Request(basic_info_url, meta={"item": item}, callback=self.parse_basic_info)
def parse(self, response):
    sel = scrapy.Selector(response)
    dataList = sel.xpath("//div[@class='m-fly-item s-oneway']")
    items = []
    for index, each in enumerate(dataList):
        flight_each = "//div[@id='list-box']/div[" + str(index + 1) + "]"
        detail_span = "//div[@class='fl-detail-nav']/ul/li[1]/span[@class='nav-label']"
        f_route_div = "//div[@class='m-fl-info-bd']/div"

        airports = sel.xpath(flight_each + f_route_div + '/p[3]//text()').extract()
        company = sel.xpath(flight_each + f_route_div + '/p[1]//text()').extract()
        flight_time = sel.xpath(flight_each + f_route_div + '/p[2]//text()').extract()
        passtime = sel.xpath(flight_each + f_route_div + '/p[4]//text()').extract()
        price = sel.xpath(flight_each + "//div[@class='fl-price-box']//em//text()").extract()

        item = FindtripItem()
        item['site'] = 'Qua'
        item['company'] = company
        item['flight_time'] = flight_time
        item['airports'] = airports
        item['passtime'] = passtime
        item['price'] = price
        items.append(item)
    return items
def parse_detail(self, response):
    res_dir = response.meta["RESDIR"]
    print 'res_dir:', res_dir
    rensel = scrapy.Selector(response)
    text = rensel.xpath('//script/text()').extract()
    tmp1 = re.findall(r'"url":\"(.*?)\"', str(text))
    if len(tmp1) > 0:
        uid_p_list = []
        for i in tmp1:
            uid_p_list.append(i.strip().replace('\\', ''))
        for i in uid_p_list[1:]:
            pid = i.split('/')[-3]
            print i
            r = Redis(host='192.168.5.24', port='6379')
            print r.llen(self.MCOUNTRY)
            r.lpush(self.MCOUNTRY, i)
def parse_job(self, response):
    """Parse a joblink into a JobItem."""
    s = Selector(response)
    item = JobItem()
    item['url'] = response.url
    item['site'] = 'Remote.co'
    item['title'] = s.css('h1::text').extract_first()
    item['company'] = s.xpath(
        '//strong[@itemprop="name"]/text()').extract_first()
    job = s.css('.job-description')
    job.xpath('p[1]')
    item['text'] = s.xpath(
        '//div[@class="job_description"]//text()').extract()
    try:
        posted = s.xpath('//time//text()').extract_first()
        item['date_posted'] = utilities.naturaltime(
            posted.replace('Posted ', '')).isoformat()
    except Exception as e:
        self.logger.error(e)
    yield item
def parse_job(self, response):
    """Parse a joblink into a JobItem."""
    s = Selector(response)
    item = JobItem()
    item['url'] = response.url
    item['site'] = 'RemoteWorking'
    item['title'] = s.css('h1::text').extract_first()
    item['text'] = s.xpath(
        '//div[@itemprop="description"]//text()').extract()
    try:
        posted = s.xpath('//li[@class="date-posted"]//text()').extract_first()
        item['date_posted'] = utilities.naturaltime(
            posted.replace('Posted ', '')).isoformat()
    except Exception as e:
        self.logger.error(e)
    yield item
def parse_job(self, response):
    """Parse a joblink into a JobItem."""
    s = Selector(response)
    item = JobItem()
    item['url'] = response.url
    item['site'] = 'Jobspresso'
    item['title'] = s.xpath(
        '//h2[@class="page-title"]//text()').extract_first()
    item['text'] = s.xpath(
        '//div[@itemprop="description"]//text()').extract()
    try:
        posted = s.xpath('//date/text()').extract_first()
        item['date_posted'] = parse_time(posted).isoformat()
    except Exception as e:
        self.logger.error(e)
    yield item
def parse(self, response): """Get the pagination links and hand them off. """ s = Selector(response) pagination = s.css('.pagination') pagelinks = [response.url] pagelinks.extend(pagination.xpath( '//a[contains(@href, "l-remote/p-")]/@href').extract()) # for pagelink in pagelinks: for pagelink in pagelinks[:1]: request = Request( urljoin(self.root, pagelink), callback=self.parse_jobspage, dont_filter=True, ) yield request
def parse_job(self, response):
    """Parse a joblink into a JobItem."""
    s = Selector(response)
    item = JobItem()
    item['url'] = response.url.split('?')[0]
    item['site'] = 'CareerBuilder'
    item['title'] = s.css('h1::text').extract_first()
    item['text'] = s.css('.job-facts::text').extract()
    item['text'].extend(s.css('.item').css('.tag::text').extract())
    item['text'].extend(s.css('.description::text').extract())
    try:
        posted = s.xpath(
            '//h3[@id="job-begin-date"]/text()').extract_first()
        item['date_posted'] = utilities.naturaltime(
            posted.replace('Posted ', '')).isoformat()
    except Exception as e:
        self.logger.error(e)
    yield item
def parse(self, response):
    hxs = scrapy.Selector(response)
    slots_tutorials = hxs.xpath('//td[@class="slot slot-tutorial"]')
    for slot in slots_tutorials:
        speakers_tutorials = slot.xpath('//span[@class="speaker"]/text()').extract()
        urls_tutorials = slot.xpath('//span[@class="title"]//@href').extract()
        talks_tutorials = slot.xpath('//span[@class="title"]//a/text()').extract()
        indexSpeaker = 0
        for speaker in speakers_tutorials:
            yield Request(url=''.join(('http://www.pydata.org', urls_tutorials[indexSpeaker])),
                          callback=self.parse_details,
                          meta={'speaker': speaker.strip(),
                                'url': urls_tutorials[indexSpeaker],
                                'talk': talks_tutorials[indexSpeaker]}
                          )
            indexSpeaker = indexSpeaker + 1
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue
        val = Selector(text = info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[6]/text()').extract_first()
        anonymity = val.xpath('//td[3]/text()').extract_first()
        https = val.xpath('//td[4]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip = ip,
            port = port,
            country = country,
            anonymity = anonymity,
            source = self.name,
        )

        self.add_proxy(proxy = proxy)
def parse(self, response):
    def getdomain(url):
        proto, rest = urllib.splittype(url)
        host, rest = urllib.splithost(rest)
        return "http://" + host

    sel = scrapy.Selector(response)
    links_in_a_page = sel.xpath('//a[@href]')
    for link_sel in links_in_a_page:
        item = XinhuaItem()
        link = str(link_sel.re('href="(.*?)"')[0])
        if link:
            if not link.startswith('http'):
                link = response.url + link
                # link = getdomain(response.url) + link
            yield scrapy.Request(link, callback=self.parse)
            p1 = re.compile(r'.*\d{4}-\d{2}/\d{2}.*')
            if re.match(p1, link):
                print ("Y: " + link)
                item['link'] = link
                yield item
            else:
                print ("F: " + link)
def parse(self, response):
    def getdomain(url):
        proto, rest = urllib.splittype(url)
        host, rest = urllib.splithost(rest)
        return "http://" + host

    sel = scrapy.Selector(response)
    links_in_a_page = sel.xpath('//a[@href]')
    for link_sel in links_in_a_page:
        item = QqurlItem()
        link = str(link_sel.re('href="(.*?)"')[0])
        if link:
            if not link.startswith('http'):
                if link.startswith('javascript'):
                    continue
                if link.startswith('//support'):
                    continue
                link = getdomain(response.url) + link
            if re.match('.*comment.*', link):
                continue
            yield scrapy.Request(link, callback=self.parse)
            if not re.match('.*comment.*', link):
                if re.match('^http.*qq.com.*\.s?html?$', link):
                    item['link'] = link
                    yield item
def parse(self, response):
    def getdomain(url):
        # proto, rest = urllib.splittype(url)
        # host, rest = urllib.splithost(rest)
        return "http:"

    sel = scrapy.Selector(response)
    links_in_a_page = sel.xpath('//a[@href]')
    for link_sel in links_in_a_page:
        item = SohuItem()
        link = str(link_sel.re('href="(.*?)"')[0])
        if link:
            if not link.startswith('http'):
                link = getdomain(response.url) + link
            yield scrapy.Request(link, callback=self.parse)
            p1 = re.compile(r'.*/a/.*')
            p2 = re.compile(r'.*#comment_area$')
            p3 = re.compile(r'.*news.sohu.com.*s?html?$')
            if (re.match(p3, link) or re.match(p1, link)) and (not re.match(p2, link)):
                # print ('T: ' + link)
                item['link'] = link
                yield item
            else:
                pass
                # print ('F: ' + link)
def alternative_parse_method(self, response):
    # An alternative would be to build a Scrapy selector from the JS string
    # and extract the data using CSS selectors
    script = response.xpath('//script[contains(., "var data =")]/text()').extract_first()
    sel = scrapy.Selector(root=js2xml.parse(script))
    for quote in sel.css('var[name="data"] > array > object'):
        yield {
            'text': quote.css('property[name="text"] > string::text').extract_first(),
            'author': quote.css('property[name="author"] property[name="name"] > string::text').extract_first(),
            'tags': quote.css('property[name="tags"] string::text').extract(),
        }
    link_next = response.css('li.next a::attr("href")').extract_first()
    if link_next:
        yield scrapy.Request(response.urljoin(link_next))
def parse(self, response):
    self.driver.get(response.url)
    sel = scrapy.Selector(text=self.driver.page_source)
    for quote in sel.css('div.quote'):
        yield {
            'text': quote.css('span.text::text').extract_first(),
            'author': quote.css('span small::text').extract_first(),
            'tags': quote.css('div.tags a.tag::text').extract(),
        }
    next_page = sel.css('li.next > a::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page))
def parse(self, response):
    selector = scrapy.Selector(response)
    # item = CrawlmeizituItemPage()

    next_pages = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
    next_pages_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
    all_urls = []
    if '???' in next_pages_text:
        next_url = "http://www.meizitu.com/a/{}".format(next_pages[-2])
        with open('..//url.txt', 'a+') as fp:
            fp.write('\n')
            fp.write(next_url)
            fp.write("\n")
        request = scrapy.http.Request(next_url, callback=self.parse)
        time.sleep(2)
        yield request

    all_info = selector.xpath('//h3[@class="tit"]/a')
    for info in all_info:
        links = info.xpath('//h3[@class="tit"]/a/@href').extract()
        for link in links:
            request = scrapy.http.Request(link, callback=self.parse_item)
            time.sleep(1)
            yield request

    # next_link = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
    # next_link_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
    # if '???' in next_link_text:
    #     nextPage = "http://www.meizitu.com/a/{}".format(next_link[-2])
    #     item['page_url'] = nextPage
    #     yield item
def parse_detail(self, response):
    item = CrawldetailsItem()
    sel = Selector(response)
    try:
        item["kd"] = response.meta['kd']
        item["title"] = self.get_text(sel, '//*[@id="job_detail"]/dt/h1/@title')
        item["company"] = sel.xpath('//*[@id="container"]/div[2]/dl/dt/a/div/h2/text()').extract()[0].strip()
        item["city"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[2]/text()').extract()[0]
        item["address"] = sel.xpath('//*[@id="container"]/div[2]/dl/dd/div[1]/text()').extract()[0]
        industry = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[1]').extract()[0]
        item["industry"] = BeautifulSoup(industry).get_text().encode("utf-8").split(' ')[1].strip()
        scale = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[2]').extract()[0]
        item["scale"] = BeautifulSoup(scale).get_text().encode("utf-8").split(' ')[1].strip()
        phase = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[2]/li').extract()[0]
        item["phase"] = BeautifulSoup(phase).get_text().encode("utf-8").split(' ')[1].strip()
        item["salary"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[1]/text()').extract()[0]
        item["experience"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[3]/text()').extract()[0]
        item["education"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[4]/text()').extract()[0]
        item["description"] = self.get_text(sel, '//*[@id="job_detail"]/dd[2]')
        item["url"] = response.url
        item["published"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[3]/text()').extract()[0][:-8]
        item["tag"] = self.get_text(sel, '//*[@id="job_detail"]/dd[1]/p[2]/text()')
    except Exception, e:
        print e
    yield item
def get_case_studies_details(response: Response):
    content = response.content.decode("utf-8")
    article_selector = "#company-projects > article"
    articles = Selector(text=content).css(article_selector).extract()
    result = []
    for article in articles:
        title = Selector(text=article).css("h3::text").extract()[0]
        summary = Selector(text=article).css("p::text").extract()[0]
        href = Selector(text=article).css("a::attr(href)").extract()[0]
        slug = href.split("/")[-2]
        assert slug, "Could not extract case study slug from {}".format(article)
        logging.debug("Got case study slug: %s", slug)
        result.append((title, summary, href, slug))
    assert result, "No Case Study details extracted from {}".format(articles)
    return result
def fas_get_company_profile_url(response: Response, name: str) -> str:
    content = response.content.decode("utf-8")
    links_to_profiles_selector = "#ed-search-list-container a"
    href_selector = "a::attr(href)"
    links_to_profiles = Selector(text=content).css(
        links_to_profiles_selector).extract()
    profile_url = None
    for link in links_to_profiles:
        if escape_html(name).lower() in escape_html(link).lower():
            profile_url = Selector(text=link).css(href_selector).extract()[0]
    with assertion_msg(
            "Couldn't find link to '%s' company profile page in the response",
            name):
        assert profile_url
    return profile_url
def fas_follow_case_study_links_to_related_sectors(context, actor_alias):
    actor = context.get_actor(actor_alias)
    session = actor.session
    content = context.response.content.decode("utf-8")
    links_css_selector = "#company-showcase .case-study-info a"
    links_to_sectors = Selector(text=content).css(links_css_selector).extract()
    with assertion_msg("Expected to find at least 1 link to Industry sector "
                       "associated with Company Showcase Case Study"):
        assert links_css_selector
    results = {}
    fas_url = get_absolute_url("ui-supplier:landing")
    for link in links_to_sectors:
        industry = Selector(text=link).css("a::text").extract()[0]
        href = Selector(text=link).css("a::attr(href)").extract()[0]
        url = urljoin(fas_url, href)
        sectors = [value for _, value in parse_qsl(urlsplit(href).query)]
        logging.debug(
            "%s will look for Suppliers in '%s' Industry sectors '%s'",
            actor_alias, industry, ", ".join(sectors)
        )
        response = make_request(Method.GET, url=url, session=session)
        results[industry] = {
            "url": url,
            "sectors": sectors,
            "response": response
        }
    context.results = results
def fas_should_see_unfiltered_search_results(context, actor_alias):
    response = context.response
    content = response.content.decode("utf-8")
    sector_filters_selector = "#id_sectors input"
    filters = Selector(text=content).css(sector_filters_selector).extract()
    for fil in filters:
        sector = Selector(text=fil).css("input::attr(value)").extract()[0]
        selector = "input::attr(checked)"
        checked = True if Selector(text=fil).css(selector).extract() else False
        with assertion_msg(
                "Expected search results to be unfiltered but this "
                "filter was checked: '%s'", sector):
            assert not checked
    logging.debug("%s was shown with unfiltered search results", actor_alias)
def parse_location(self, response):
    loc_hxs = scrapy.Selector(response)
    loc_xs = loc_hxs.xpath('//div[@id="aside"]/script[1]').extract()[0]
    coord_text = re.findall(r'lng:\w+.\w+,lat:\w+.\w+', loc_xs)[0]
    item = response.meta['item']
    item['location'] = coord_text.encode('gbk')
    return item
    # print coord_text
def parse(self, response):
    reload(sys)
    sys.setdefaultencoding('utf8')
    print '__________'
    if response.status == 403:
        print 'meet 403, sleep 600 seconds'
        import time
        time.sleep(1200)
        yield Request(response.url, callback=self.parse)
    # 404: the page does not exist, so just return
    elif response.status == 404:
        print 'meet 404, return'
    else:
        hxs = scrapy.Selector(response)
        for i in range(1, 31):
            item = SoufangItem()
            name_ = hxs.xpath('/html/body/div[4]/div[1]/ul/li[' + str(i) + ']/div[1]/div[1]/a/text()').extract()
            name = ''.join(name_)
            http = hxs.xpath('/html/body/div[4]/div[1]/ul/li[' + str(i) + ']/div[1]/div[1]/a/@href').extract()
            href = ''.join(http)
            # href = href + 'xiangqing/'
            item['name'] = name.encode('gbk')
            item['link'] = href.encode('gbk')
            yield Request(href, callback=self.parse_detail, meta={'item': item})
            print name, href
    print '__________'
def parse_detail(self, response):
    # print 'in'
    loc_hxs = scrapy.Selector(response)
    loudongzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[5]/span[2]/text()').extract()
    loudongzongshu = ''.join(loudongzongshu)
    fangwuzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[6]/span[2]/text()').extract()
    fangwuzongshu = ''.join(fangwuzongshu)
    item = response.meta['item']
    item['address'] = loudongzongshu.encode('gbk')
    item['zonghushu'] = fangwuzongshu.encode('gbk')
    return item
def parse_detail(self, response):
    loc_hxs = scrapy.Selector(response)
    build_num_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[2]/text()').extract()
    build_num = ''.join(build_num_)
    total_households_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[4]/text()').extract()
    total_households = ''.join(total_households_)
    plot_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[6]/text()').extract()
    plot_ratio = ''.join(plot_ratio_)
    green_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[8]/text()').extract()
    green_ratio = ''.join(green_ratio_)
    property_fee_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[10]/text()').extract()
    property_fee = ''.join(property_fee_)

    item = response.meta['item']
    item['build_num'] = build_num.encode('gbk')
    item['total_households'] = total_households.encode('gbk')
    item['plot_ratio'] = plot_ratio.encode('gbk')
    item['greening_ratio'] = green_ratio.encode('gbk')
    item['properity_fee'] = property_fee.encode('gbk')
    return item
def test_parse_content(self):
    content = requests.get('http://xiaoguotu.to8to.com/topic/11.html')
    response = Response('http://xiaoguotu.to8to.com/topic/11.html')
    response.text = content.content.decode("utf-8")
    selector = Selector(response)
    title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
    description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
    items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
    article = []
    text = ''
    for index, item_selector in enumerate(items_selector):
        try:
            text = item_selector.xpath('span/text()').extract()[0]
        except IndexError:
            try:
                img_url = item_selector.xpath('img/@src').extract()[0]
                img_width = 0
                try:
                    img_width = item_selector.xpath('img/@width').extract()[0]
                except IndexError:
                    pass
                img_height = 0
                try:
                    img_height = item_selector.xpath('img/@height').extract()[0]
                except IndexError:
                    pass
                article.append({'content': text, 'img_url': img_url,
                                'img_width': img_width, 'img_height': img_height})
            except IndexError:
                continue
    design_topic_item = DesignTopicItem()
    design_topic_item['title'] = title
    design_topic_item['description'] = description
    design_topic_item['article'] = article
    design_topic_item['html_url'] = response.url
    return design_topic_item
def parse(self, response):
    sel = scrapy.Selector(response)
    # print(sel.xpath('//title').extract())
    fligint_div = "//ul[@class='news-list2']/li[1]/div[@class='gzh-box2']/div[@class='img-box']/a[1]/@href"
    first_url_list = sel.xpath(fligint_div).extract()
    self.first_url = first_url_list[0]
    print(self.first_url)
    yield scrapy.Request(self.first_url, meta=self.meta, callback=self.parse_url_list)