The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.selector.Selector().
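
Before the project snippets, here is a minimal, self-contained sketch of the Selector patterns they all rely on: building a selector from raw text (or from a response inside a spider), querying it with XPath or CSS, and turning matches into strings with extract() / extract_first(). The HTML fragment and printed values below are made up purely for illustration.

from scrapy.selector import Selector

# A small, made-up HTML fragment; inside a spider you would usually write
# sel = Selector(response) instead of passing text= explicitly.
html = """
<div id="wrapper">
  <h1><span>Example Book</span></h1>
  <span class="rating_per">45.0%</span>
  <span class="rating_per">30.0%</span>
</div>
"""

sel = Selector(text=html)

# XPath queries return a SelectorList; extract()/extract_first() turn it into plain strings.
title = sel.xpath("//div[@id='wrapper']/h1/span/text()").extract_first()
ratings = sel.xpath("//span[@class='rating_per']/text()").extract()

# CSS selectors work on the same object.
first_rating = sel.css("span.rating_per::text").extract_first()

print(title)         # Example Book
print(ratings)       # ['45.0%', '30.0%']
print(first_rating)  # 45.0%
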
def parse_book(self, response):
    item = BookItem()
    sel = Selector(response)
    e = sel.xpath("//div[@id='wrapper']")
    item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
    item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
    item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
    item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
    item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
    item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
    item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
    item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
    item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
    item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()
    item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()
    # follow the hot-comments page and carry the partially filled item along in request.meta
    request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)
    request.meta['item'] = item
    return request

def parse_item(self, response):
    item = BookItem()
    sel = Selector(response)
    e = sel.xpath("//div[@id='wrapper']")
    item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
    item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
    item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
    item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
    item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
    item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
    item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
    item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
    item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
    item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()
    return item

def parse(self, response):
    item = BookItem()
    sel = Selector(response)
    e = sel.xpath("//div[@id='wrapper']")
    item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
    item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
    item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
    item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
    item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
    item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
    item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
    item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
    item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
    item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()
    item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()
    # follow the hot-comments page and carry the partially filled item along in request.meta
    request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)
    request.meta['item'] = item
    return request

def parse(self, response):
    list_types = Selector(response).xpath('//div[@class="listado_1"]//ul/li/a')
    for types in list_types:
        href = types.xpath("./@href").extract()
        text = types.xpath("./text()").extract()
        if Terms.filterBytype(text[0]):
            type = Terms.getType(text[0])
            initiative_url = Utils.createUrl(response.url, href[0])
            yield scrapy.Request(initiative_url, errback=self.errback_httpbin, callback=self.initiatives,
                                 meta={'type': type})
    """
    urlsa = ""
    urlsa = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Indice%20de%20Iniciativas?_piref73_1335503_73_1335500_1335500.next_page=/wc/servidorCGI&CMD=VERLST&BASE=IW12&PIECE=IWC2&FMT=INITXD1S.fmt&FORM1=INITXLUS.fmt&DOCS=100-100&QUERY=%28I%29.ACIN1.+%26+%28161%29.SINI."
    yield scrapy.Request(urlsa, errback=self.errback_httpbin, callback=self.oneinitiative,
                         meta={'type': u"Proposición no de Ley en Comisión"})
    """

def initiatives(self, response):
    type = response.meta['type']
    first_url = Selector(response).xpath('//div[@class="resultados_encontrados"]/p/a/@href').extract()[0]
    num_inis = Selector(response).xpath('//div[@class="SUBTITULO_CONTENIDO"]/span/text()').extract()
    split = first_url.partition("&DOCS=1-1")
    for i in range(1, int(num_inis[0]) + 1):
        new_url = split[0] + "&DOCS=" + str(i) + "-" + str(i) + split[2]
        initiative_url = Utils.createUrl(response.url, new_url)
        CheckItems.addElement(initiative_url)
        if Blacklist.getElement(initiative_url):
            if not Blacklist.getElement(initiative_url):
                yield scrapy.Request(initiative_url, errback=self.errback_httpbin,
                                     callback=self.oneinitiative, meta={'type': type})
        else:
            yield scrapy.Request(initiative_url, errback=self.errback_httpbin,
                                 callback=self.oneinitiative, meta={'type': type})

def parse_items(self, response):
    hxs = Selector(response)
    jobs = hxs.xpath('//div[contains(@class, "searchResultTitle")]')
    items = []
    for job in jobs:
        item = Job()
        item["title"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/text()').extract()[0].strip()
        company = job.xpath('.//p/span[contains(@id, "CONTACT_OFFICE")]/text()').extract()
        item["company"] = company[0].strip() if company else "n/a"
        item["location"] = job.xpath('.//p/span[contains(@id, "FREE_LOCATION")]/text()').extract()[0].strip()
        item["url"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/@href').extract()[0]
        item["date_posted"] = job.xpath('.//p/span[contains(@id, "POSTED_DATE")]/text()').extract()[0].strip()
        salary = job.xpath('.//p/span[contains(@id, "SALARY")]/text()').extract()
        item["salary"] = salary[0].strip() if salary else "n/a"
        item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d")
        item["job_board"] = "dice"
        items.append(item)
    return items

def parse(self, response):
    hxs = Selector(response)
    jobs = hxs.xpath('//div[contains(@class, "-job-item")]')
    items = []
    for job in jobs:
        item = Job()
        item["title"] = job.xpath('.//a[@class="job-link"]/text()').extract()[0]
        item["company"] = job.xpath('.//div[@class="-name"]/text()').extract()[0].strip()
        item["location"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-location"]/text()').extract()[0].strip())
        item["url"] = job.xpath('.//a[@class="job-link"]/@href').extract()[0]
        item["date_posted"] = job.xpath('.//p[contains(@class, "-posted-date")]/text()').extract()[0].strip()
        item["salary"] = job.xpath('.//span[@class="-salary"]/text()').extract_first(default='n/a').strip()
        item["tags"] = job.css('.-tags p a.post-tag::text').extract()
        item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d")
        item["job_board"] = "stackOverflow"
        items.append(item)
    return items

def parse_page(self, response):
    sel = Selector(text=response.body)
    infos = sel.xpath('//tr[@class="odd"]').extract()
    for info in infos:
        val = Selector(text=info)
        ip = val.xpath('//td[2]/text()').extract_first()
        port = val.xpath('//td[3]/text()').extract_first()
        country = val.xpath('//td[4]/a/text()').extract_first()
        anonymity = val.xpath('//td[5]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )
        self.add_proxy(proxy=proxy)

def parse_page(self, response):
    super(ProxyRoxSpider, self).parse_page(response)
    data = response.xpath('//tr[@class="fat"]').extract()
    for i, d in enumerate(data):
        sel = Selector(text=d)
        ip_port = sel.xpath('//td/a/text()').extract_first()
        ip = ip_port.split(':')[0]
        port = ip_port.split(':')[1]
        country = sel.xpath('//td/span[@class="region"]/text()').extract_first()
        anonymity = sel.xpath('//td/span/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name
        )
        self.add_proxy(proxy=proxy)

def parse_page(self, response):
    super(ProxyDBSpider, self).parse_page(response)
    data = response.xpath('//tbody/tr').extract()
    for i, d in enumerate(data):
        sel = Selector(text=d)
        ip_port = sel.xpath('//td/a/text()').extract_first()
        ip = ip_port.split(':')[0]
        port = ip_port.split(':')[1]
        country = sel.xpath('//td/img/@title').extract_first()
        anonymity = sel.xpath('//td/span[@class="text-success"]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name
        )
        self.add_proxy(proxy=proxy)

def parse_salaries(self, response):
    """
    The values for a person's salary live in a separate table on another page.
    This function grabs the table headers and values and assigns them to the item;
    the entity id was passed along in response.meta.
    """
    item = VereadorItem()
    item['name'] = response.meta['name']
    item['id'] = response.meta['entity_id']
    item['mesano'] = response.meta['mesano']
    for salary in response.xpath('//*[@id="holerite"]').extract():
        selector = Selector(text=salary)
        table = selector.xpath('//tr[@class="holerite_valor"]/td/text()').extract()
        item["salary_gross"] = table[0]
        item["salary_liquid"] = selector.xpath('//tr[@class="holerite_valor"]/td/strong/text()').extract_first()
    return item

def parse(self, response):
    sel = Selector(response)
    self.item = AccountItem()
    self.item['oj'] = 'poj'
    self.item['username'] = self.username
    if self.is_login:
        try:
            self.item['rank'] = sel.xpath('//center/table/tr')[1].\
                xpath('.//td/font/text()').extract()[0]
            self.item['accept'] = sel.xpath('//center/table/tr')[2].\
                xpath('.//td/a/text()').extract()[0]
            self.item['submit'] = sel.xpath('//center/table/tr')[3].\
                xpath('.//td/a/text()').extract()[0]
            yield Request(self.accepted_url % self.username,
                          callback=self.accepted)
            self.item['status'] = 'Authentication Success'
        except:
            self.item['status'] = 'Unknown Error'
    else:
        self.item['status'] = 'Authentication Failed'
    yield self.item

def parse_item(self, response):
    item = DoubanmovieItem()
    sel = Selector(response)
    title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0]
    year = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract()[0]
    commit_num = sel.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()').extract()[0]
    star = sel.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()[0]
    director = sel.xpath(
        '//*[@id="info"]/span[1]/span[2]/a/text()').extract()[0]
    screenwriter = sel.xpath(
        '//*[@id="info"]/span[2]/span[2]/a/text()').extract()[0]
    item['title'] = title
    item['date'] = year
    item['star'] = star
    item['commit_num'] = commit_num
    item['director'] = director
    item['screenwriter'] = screenwriter
    return item

def parse(self, response):
    selector = Selector(response)
    articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
    for article in articles:
        item = Jianshu2Item()
        url = article.xpath('div/h4/a/@href').extract()
        likeNum = article.xpath('div/div/span[2]/text()').extract()
        posturl = 'http://www.jianshu.com' + url[0]
        if len(likeNum) == 0:
            item['likeNum'] = 0
        else:
            item['likeNum'] = int(likeNum[0].split(' ')[-1])
        request = Request(posturl, callback=self.parse_donate)
        request.meta['item'] = item
        yield request

    next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract()[0]
    if next_link:
        next_link = self.url + str(next_link)
        yield Request(next_link, callback=self.parse)

def parse_article(self, response):
    hxs = Selector(response)
    keyword = response.meta['keyword']
    movie_name = hxs.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    movie_roles_paths = hxs.xpath('//*[@id="info"]/span[3]/span[2]')
    movie_roles = []
    for movie_roles_path in movie_roles_paths:
        movie_roles = movie_roles_path.xpath('.//*[@rel="v:starring"]/text()').extract()
    movie_classification = hxs.xpath('//span[@property="v:genre"]/text()').extract()

    douban_item = DoubanItem()
    douban_item['movie_keyword'] = keyword
    douban_item['movie_name'] = ''.join(movie_name).strip().replace(',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';').replace(' ', '')
    douban_item['movie_roles'] = ';'.join(movie_roles).strip().replace(',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
    douban_item['movie_classification'] = ';'.join(movie_classification).strip().replace(',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')

    article_link = hxs.xpath('//*[@id="review_section"]/div/div/div/h3/a/@href').extract()
    tmp = "https://movie.douban.com/review/"
    for item in article_link:
        if tmp in item:
            yield Request(item, meta={'item': douban_item}, callback=self.parse_item,
                          cookies=[{'name': 'COOKIE_NAME', 'value': 'VALUE', 'domain': '.douban.com', 'path': '/'}, ])

def parse(self, response):
    se = Selector(response)  # build a Selector from the response (replaces the old HtmlXPathSelector)
    # only handle wallpaper list pages whose url matches this pattern
    if re.match("http://desk.zol.com.cn/fengjing/\d+x\d+/\d+.html", response.url):
        src = se.xpath("//ul[@class='pic-list2 clearfix']/li")  # every <li> under the picture-list <ul>
        for i in range(len(src)):  # iterate over the <li> entries
            imgURLs = se.xpath("//ul[@class='pic-list2 clearfix']/li[%d]/a/img/@src" % i).extract()  # thumbnail url
            titles = se.xpath("//ul[@class='pic-list2 clearfix']/li[%d]/a/img/@title" % i).extract()
            if imgURLs:
                realUrl = imgURLs[0].replace("t_s208x130c5", "t_s2560x1600c5")  # swap the thumbnail size for the full-resolution image
                file_name = u"%s.jpg" % titles[0]  # name the file after the picture title
                path = os.path.join("D:\pics", file_name)  # local path to save the wallpaper
                type = sys.getfilesystemencoding()
                print file_name.encode(type)

                item = WebcrawlerScrapyItem()  # fill an item with the picture name and url
                item['name'] = file_name
                item['url'] = realUrl
                print item["name"], item["url"]

                yield item  # hand the item over to the pipeline
                urllib.urlretrieve(realUrl, path)  # also download the image directly to the local path

    all_urls = se.xpath("//a/@href").extract()  # collect every link on the page
    for url in all_urls:
        if url.startswith("/fengjing/1920x1080/"):  # follow only the 1920x1080 landscape list pages
            yield Request("http://desk.zol.com.cn" + url, callback=self.parse)

def parse(self, response):
    # obtains links from page to page and passes links to parse_playerURL
    sel = Selector(response)  # define selector based on response object (points to urls in start_urls by default)
    url_list = sel.xpath('//tbody/tr/td[@class="player"]/a/@href')  # obtain a list of href links that contain relative links of players
    for i in url_list:
        relative_url = self.clean_str(i.extract())  # i is a selector and hence need to extract it to obtain unicode object
        print urljoin(response.url, relative_url)  # urljoin is able to merge absolute and relative paths to form 1 coherent link
        req = Request(urljoin(response.url, relative_url), callback=self.parse_playerURL)  # pass on request with new urls to parse_playerURL
        req.headers["User-Agent"] = self.random_ua()
        yield req

    next_url = sel.xpath('//div[@class="right-nav pull-right"]/a[@rel="next"]/@href').extract_first()
    if next_url:  # checks if next page exists
        clean_next_url = self.clean_str(next_url)
        reqNext = Request(urljoin(response.url, clean_next_url), callback=self.parse)  # calls back this function to repeat process on new list of links
        yield reqNext

def parse(self, response):
    page = Selector(response)
    hrefs = page.xpath('//h4[@class="title"]/a/@href')
    for href in hrefs:
        url = href.extract()
        yield scrapy.Request(url, callback=self.parse_item)

    div = page.xpath('//div[@class="page-ctrl ctrl-app"]')
    hrefs = div.xpath('.//a/@href').extract()
    for href in hrefs:
        url = response.urljoin(href)
        print url
        # yield scrapy.Request(url, self.parse, meta={
        #     'splash': {
        #         'endpoint': 'render.html',
        #         'args': {'wait': 0.5}
        #     }
        # })

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath('//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()').extract_first().encode('utf-8')
    item['url'] = response.url
    item['appid'] = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['intro'] = page.xpath('//meta[@name="description"]/@content').extract_first().encode('utf-8')

    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
        name = div.xpath('./p[@class="name"]/a/text()').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item

def parse_page(self, response):
    page = Selector(response)
    lis = page.xpath('//ul[@class="applist"]/li')
    if lis == None:
        return
    url_common = 'http://app.mi.com'
    for li in lis:
        item = XiaomiAppstoreCrawlerItem()
        item['title'] = li.xpath('./h5/a/text()').extract_first().encode('utf-8')
        url = li.xpath('./h5/a/@href').extract_first()
        appid = re.match(r'/detail/(.*)', url).group(1)
        item['appid'] = appid
        # import pudb; pu.db
        req = scrapy.Request(url_common + url, callback=self.parse_details)
        req.meta["item"] = item
        yield req

def parse_item(self, response):
    url_trim = response.url.split('?')[0]
    page = Selector(response)
    title = page.xpath('//span[@itemprop="name"]/text()').extract_first()
    images = page.xpath('//img[@id="J_BigImg"]/@src').extract_first()
    availability = page.xpath('//dd[@class="num clearfix"]/div[@class="J_GoodsStock goods-stock fl"]/text()').extract_first()
    status = response.status

    item = FashionItem()
    item['url'] = url_trim
    item['title'] = title.encode('utf-8')
    item['images'] = images
    item['availability'] = availability.encode('utf-8')
    item['status'] = status
    return item

def getMusListToFile(qqid, line, browser, filename):
    m_url = 'http://g.gogoqq.com/music.htm?uin=%s' % qqid
    browser.get(m_url)
    # time.sleep(2)
    WebDriverWait(browser, 2, 0.5).until(lambda item: item.find_element_by_xpath('//*[@id="list"]').is_displayed())
    time.sleep(1)
    liList = Selector(text=browser.page_source).xpath(u'//*[@id="list"]/li/a')
    mList = []
    for m in liList:
        mus = m.xpath('text()')[0].extract()
        print mus
        mList.append(mus)
    f = open(filename, 'a')
    string = line + ' #music#:' + '##m##'.join(mList)
    f.write(string + '\n')
    f.close()

def parse(self, response):
    sel = Selector(response)
    movie_name = sel.xpath("//div[@class='pl2']/a/text()[1]").extract()
    movie_url = sel.xpath("//div[@class='pl2']/a/@href").extract()
    movie_star = sel.xpath("//div[@class='pl2']/div/span[@class='rating_nums']/text()").extract()
    # item = DoubanNewMovieItem()
    item = {}
    # item['movie_name'] = [n.encode('utf-8') for n in movie_name]
    item['movie_name'] = movie_name
    item['movie_star'] = [n for n in movie_star]
    item['movie_url'] = [n for n in movie_url]
    yield item
    print(item['movie_name'], item['movie_star'], item['movie_url'])

def parse_category(self, response):
    self.log("=================================================")
    sel = Selector(response)
    shop_type = response.meta['shop_type']
    city_id = response.meta['city_id']
    cat_url = response.url
    http_status = response.status
    self.log("http_url = %s" % cat_url)
    self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
    self.log("shop_type = %s" % shop_type)
    items = []
    shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
    self.log("shop_list_len = %d" % len(shop_list))
    for shop in shop_list:
        uri = shop.xpath('a/@href').extract()[0]
        self.log("page_uri = %s" % uri)
        yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list,
                             meta={'shop_type': shop_type, 'cat_url': cat_url, 'city_id': city_id})

def parse_category(self, response):
    self.log("=================================================")
    sel = Selector(response)
    shop_type = response.meta['shop_type']
    city_id = response.meta['city_id']
    cat_url = response.url
    http_status = response.status
    self.log("http_url = %s" % cat_url)
    self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
    self.log("shop_type = %s" % shop_type)
    items = []
    #shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
    region_list = sel.xpath('//div[@id="region-nav"]/a')
    self.log("region_list_len = %d" % len(region_list))
    for region in region_list:
        uri = region.xpath('@href').extract()[0]
        self.log("page_uri = %s" % uri)
        #yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
        yield scrapy.Request(uri, callback=self.parse_list,
                             meta={'shop_type': shop_type, 'cat_url': cat_url, 'city_id': city_id})

def parse(self, response):
    sel = Selector(response)
    cat_url = response.url
    http_status = response.status
    self.log("http_url = %s" % cat_url)
    self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
    item = SpiderDianpingXmtItem()
    item['chenshi_name'] = ""
    item['shop_type'] = 0
    item['shop_url'] = ""
    item['shop_name'] = ""
    item['shop_addr'] = ""
    item['shop_mobile'] = ""
    item['shop_intro'] = ""
    return item

def parse(self, response):
    sel = Selector(response)
    if response.meta.has_key("shop_type"):
        shop_type = response.meta['shop_type']
    else:
        shop_type = self.shop_type_map[response.url]['shop_type']
    if response.meta.has_key("city_id"):
        city_id = response.meta['city_id']
    else:
        city_id = self.shop_type_map[response.url]['city_id']
    cat_url = response.url
    http_status = response.status
    self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))
    self.log("shop_type = %s" % shop_type)
    items = []
    shop_list = sel.xpath('//div[@id="region-nav"]/a')
    for shop in shop_list:
        uri = shop.xpath('@href').extract()[0]
        self.log("page_uri = %s" % uri)
        yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list,
                             meta={'shop_type': shop_type, 'cat_url': cat_url, 'city_id': city_id})

def parse(self, response):
    sel = Selector(response)
    xiaoqu_uri = sel.xpath('//span[@class="title"]/a/@href').extract()[0]
    xiaoqu_list = xiaoqu_uri.split('/')
    xiaoqu_id = xiaoqu_list[2]
    items = []
    house_lists = sel.xpath('//div[@class="list-wrap"]/ul[@class="house-lst"]/li')
    for house in house_lists:
        item = SpiderScrapyLianjiaItem()
        item['xiaoqu_id'] = xiaoqu_id
        item['house_id'] = house.xpath('@data-id').extract()[0]
        item['title'] = house.xpath('div[@class="info-panel"]/h2/a/text()').extract()[0]
        item['price'] = house.xpath('div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]/span/text()').extract()[0]
        item['view_count'] = house.xpath('div[@class="info-panel"]/div[@class="col-2"]/div[@class="square"]/div/span/text()').extract()[0]
        #item['size'] = house.xpath('div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/span/text()').extract()
        items.append(item)
    return items

def parse_item(self, response):
    item = Cl1024Item()
    item['cl_title'] = response.meta['cl_title']
    item['cl_url'] = response.meta['cl_url']
    item['cl_bankuai'] = response.meta['cl_bankuai']
    item['posted'] = response.meta['posted']
    # redownloaded = re.search('downloaded:(.+?)<BR>', response.body)
    # downloaded = redownloaded[12:-4]
    sel = Selector(response)
    downloaded = sel.xpath('//td/table/tr/td/text()').extract()[2]
    item['torrent_downloaded'] = downloaded[17:]
    item['torrent_url'] = response.url
    ref = sel.xpath('//input[@name="ref"]/@value').extract_first()
    reff = sel.xpath('//input[@name="reff"]/@value').extract_first()
    dl = ('http://www.rmdown.com/download.php?ref=%s&&reff=%s&submit=download' % (ref, reff)).encode('utf-8')
    item['torrent_download_urls'] = dl
    yield item

def get_torrent(self, response):
    sel = Selector(response)
    cl_title = sel.xpath('//td[@class="h"]/text()[2]').extract_first()
    cl_bankuai = sel.xpath('//div[@class="t3"]/table/tr/td/b/a[2]/text()').extract_first()
    cl_url = response.url
    torrent = re.search('rmdown\.com(.+?)</a>', response.body)
    torrent_url = 'http://www.' + torrent.group()[:-4]
    posted = sel.xpath('//div[@class="tipad"]/text()').extract()[1]
    posted = posted.encode('utf-8')[9:-7]
    yield Request(
        url=torrent_url,
        meta={
            'cl_title': cl_title,
            'cl_bankuai': cl_bankuai,
            'cl_url': cl_url,
            'posted': posted,
        },
        callback=self.parse_item,
        dont_filter=True)

def get_first_page(self, response):
    request_state = self.if_too_many_request(response.body, 'first_page')
    registrant = response.meta['registrant']
    if request_state == False:
        s = Selector(text=response.body)
        content = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr/td[@class = "lf"]/a/img[@alt="..."]/../@href'
        domain_url_list = s.xpath(content).extract()
        content2 = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr'
        s_list = s.xpath(content2)
        domain_url_list2 = []
        for s in s_list:
            url2 = s.xpath('td[@class = "lf"]/a/img[@alt="..."]/../@href').extract()[0]
            domain_url_list2.append(url2)
        for url in domain_url_list2:
            cookie = get_cookie()
            url = "https://www.benmi.com" + url
            item = RwhoisRegistrantItem()
            item['registrant'] = registrant
            yield scrapy.Request(url, headers=self.head,
                                 meta={'cookie': cookie, 'item': item},
                                 cookies={"__cfduid": cookie[1],
                                          "cf_clearance": cookie[2],
                                          "BenmiUserInfo2": "Benmi-UN=hahaha321",
                                          "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "},
                                 callback=self.get_domain_name, dont_filter=True)

def parse(self, response):
    item = DoubanspiderItem()
    selector = Selector(response)
    Movies = selector.xpath('//div[@class="info"]')
    for eachMovie in Movies:
        title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
        movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
        star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
        quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item['title'] = title
        item['movieInfo'] = ';'.join(movieInfo)
        item['star'] = star
        item['quote'] = quote
        # hand the item over to the pipeline
        yield item

    nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
    if nextLink:
        nextLink = nextLink[0]
        print(nextLink)
        yield Request(self.url + nextLink, callback=self.parse)

def response_parse(response):
    global pending_requests
    # using scrapy selector to extract data from the html
    selector = Selector(text=response['body'])
    # get the url of repositories
    for href in selector.css("#subcategories-div > section > div > div.cat-item > a::attr('href')"):
        # we count the number of requests using this var
        pending_requests += 1
        # open a new request
        write_line('''
            {
                "type": "request",
                "id": "category",
                "url": "http://www.dmoz.org%s"
            }
        ''' % href.extract())

def response_category(response):
    global pending_requests
    # this response is no longer pending
    pending_requests -= 1
    # using scrapy selector
    selector = Selector(text=response['body'])
    # get div with link and title
    divs = selector.css('div.title-and-desc')
    for div in divs:
        url = div.css("a::attr('href')").extract_first()
        title = div.css("a > div.site-title::text").extract_first()
        result[title] = url
    # if finished all requests, we can close the spider
    if pending_requests == 0:
        # serialize the extracted data and close the spider
        open('outputs/dmoz_data.json', 'w').write(json.dumps(result))
        write_line('{"type": "close"}')

def parse(self, response):
    selector = Selector(response=response)
    articles = selector.xpath('//*[@id="main"]/*/div[@class="post-box"]')
    timeline = db.get_collection('timeline')
    for item in articles:
        try:
            title = item.xpath('div[@class="post-header"]/p/a/text()').extract()[0]
            # link URL
            url = item.xpath('div[@class="post-header"]/p/a/@href').extract()[0]
            description = item.xpath('*/div[@class="post-expert"]/text()').extract()[0]
            description = self._join_text(description)
            # image URL
            img = item.xpath('*/div[@class="post-info"]/a/img/@data-original').extract()[0]
            # YYYY-MM-DD
            # date = item.xpath('*/div[@class="post-date"]/text()').extract()[0].strip()
            date = item.xpath('div[@class="post-content"]/div[@class="post-footer"]/div[@class="post-date"]/text()').extract()[0]
            date = datetime.strptime(date, '%Y-%m-%d')
            self.save(title=title, url=url, description=description, img=img, date=date)
        except IndexError:
            continue

    next_page = selector.xpath(u'//*/div[@class="page-navigator"]/li/a[text()="下一页 »"]/@href').extract()[0]
    yield Request(response.urljoin(next_page), self.parse)

def parse_ph_key(self, response):
    selector = Selector(response)
    logging.debug('request url:------>' + response.url)
    # logging.info(selector)
    divs = selector.xpath('//div[@class="phimage"]')
    for div in divs:
        viewkey = re.findall('viewkey=(.*?)"', div.extract())
        # logging.debug(viewkey)
        yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
                      callback=self.parse_ph_info)

    url_next = selector.xpath('//a[@class="orangeButton" and text()="Next "]/@href').extract()
    logging.debug(url_next)
    if url_next:
        # if self.test:
        logging.debug(' next page:---------->' + self.host + url_next[0])
        yield Request(url=self.host + url_next[0], callback=self.parse_ph_key)
        # self.test = False

def parse_ph_info(self, response):
    phItem = PornVideoItem()
    selector = Selector(response)
    _ph_info = re.findall('flashvars_.*?=(.*?);\n', selector.extract())
    logging.debug('PH video info JSON:')
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])
    duration = _ph_info_json.get('video_duration')
    phItem['video_duration'] = duration
    title = _ph_info_json.get('video_title')
    phItem['video_title'] = title
    image_url = _ph_info_json.get('image_url')
    phItem['image_url'] = image_url
    link_url = _ph_info_json.get('link_url')
    phItem['link_url'] = link_url
    quality_480p = _ph_info_json.get('quality_480p')
    phItem['quality_480p'] = quality_480p
    logging.info('duration:' + duration + ' title:' + title +
                 ' image_url:' + image_url + ' link_url:' + link_url)
    yield phItem

def parse_downurl(self, response):
    try:
        antivirus1 = response.css("#static_antivirus").extract()[0]
        antivirus = Selector(response).css("#static_antivirus").extract()[0]
        # in the Static Analysis -> Antivirus table, check whether Microsoft, Kaspersky or ESET-NOD32 flagged the sample
        antiresult = re.findall(
            "((Microsoft|Kaspersky|ESET\-NOD32)</td>\n\s*<td>\n\s*<span class=\"text\-error\")",
            antivirus.encode("utf-8"), re.S)
        if antiresult == []:
            # none of those engines detected anything, skip this sample
            return
        # otherwise grab the sample's download link
        url = response.xpath("//a[contains(@class,'btn-primary')]/@href").extract()[0].encode('utf-8')
        url = urlparse.urljoin("https://malwr.com", url)
        item = MalwrItem()
        item['file_urls'] = [url]
        return item
    except Exception as e:
        pass
    return

def parse_xpath(self, response, xpath):
    appItemList = []
    sel = Selector(response)
    for url in sel.xpath(xpath).extract():
        url = urljoin(response.url, url)
        log.msg("Catch an application: %s" % url, level=log.INFO)
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList

#def parse_anzhi(self, response, xpath):
#    appItemList = []
#    hxs = HtmlXPathSelector(response)
#    for script in hxs.select(xpath).extract():
#        id = re.search(r"\d+", script).group()
#        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
#        appItem = AppItem()
#        appItem['url'] = url
#        appItemList.append(appItem)
#    return appItemList

def parse_articles(self, response):
    article_ptn = "http://www.theglobeandmail.com/opinion/(.*?)/article(\d+)/"
    resp_url = response.url
    article_m = re.match(article_ptn, resp_url)
    article_id = ''
    if article_m != None:
        article_id = article_m.group(2)

    if article_id == '32753320':
        print('***URL***', resp_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = Selector(text=response.text).xpath('//*[@id="content"]/div[1]/article/div/div[3]/div[2]').extract()

        if text:
            print("*****in Spider text*****", soup.title.string)
            yield {article_id: {"title": soup.title.string, "link": resp_url, "article_text": text}}
            comments_link = response.url + r'comments/'
            if comments_link == 'http://www.theglobeandmail.com/opinion/a-fascists-win-americas-moral-loss/article32753320/comments/':
                yield Request(comments_link, callback=self.parse_comments)

def parse(self, response):
    sel = Selector(response)
    #items = []
    # extract the name and url of the current article
    item = CSDNBlogItem()
    article_url = str(response.url)
    article_name = sel.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()
    item['article_name'] = [n.encode('utf-8') for n in article_name]
    item['article_url'] = article_url.encode('utf-8')
    yield item

    # follow the url of the next article
    urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()
    for url in urls:
        print url
        url = "http://blog.csdn.net" + url
        print url
        yield Request(url, callback=self.parse)

def parse(self, response):
    while True:
        try:
            products = Selector(response).xpath('//div[@class="grid-uniform grid--center wide--grid--middle"]//div[contains(@class,"grid__item")]')
            for product in products:
                item = KithItem()
                item['name'] = product.xpath('div/div/a[1]/img/@alt').extract()[0]
                item['link'] = "https://kith.com" + product.xpath('div/div/a[1]/@href').extract()[0]
                # item['image'] = "https:" + product.xpath('div/div/a[1]/img/@src').extract()[0]
                item['size'] = "https://kith.com/cart/add.js?id=" + product.xpath('div/div/a[2]/div/*/div[1]/@data-value').extract()[0] + "&quantity=1"
                yield item
            yield Request(KithURL, callback=self.parse, dont_filter=True, priority=0)
        except:
            pass

def crawl_ips():
    # crawl the free proxy ip list from xicidaili.com
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    for i in range(1568):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)

        selector = Selector(text=re.text)
        all_trs = selector.css("#ip_list tr")

        ip_list = []
        for tr in all_trs[1:]:
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            all_texts = tr.css("td::text").extract()

            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]

            ip_list.append((ip, port, proxy_type, speed))

        for ip_info in ip_list:
            cursor.execute(
                "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format(
                    ip_info[0], ip_info[1], ip_info[3]
                )
            )
            conn.commit()

def finishtext(self, response):
    finishitem = response.meta['fisnishitem']
    finishitem['contenido'] = []
    text = Selector(response).xpath('//div[@class="texto_completo"]').extract()[0]
    text = self.extractbyref(text=text, ref=finishitem['ref'])
    if text == "":
        try:
            text += Selector(response).xpath('//div[@class="texto_completo"]').extract()[0]
        except:
            CheckSystem.systemlog("No tiene texto para 'TEXTOFINAL' " + response.url + "ITEM URL " + finishitem['url'])
    finishitem['contenido'].append(Utils.removeHTMLtags(text))
    yield finishitem

def searchDS(self, response, number=None, ref=None, name=None):
    try:
        text = Selector(response).xpath('//div[@class="texto_completo"]').extract()
        return Utils.removeForDS(text[0])
    except:
        return "URL rota"

def extracttext(self, response, number, ref):
    textfragment = self.fragmenttxt(response, number)
    res = ""
    # this is the whole text, no need to fragment it
    if not Utils.checkownRef(textfragment, ref):
        return Utils.removeHTMLtags(textfragment)
    texto = self.extractbyref(textfragment, ref, number)
    pages = Selector(response).xpath('//a/@name').extract()
    # start from the index and look for more text
    hasfirsttext = False
    if Utils.isDiferentFirstTime(textfragment, ref):
        hasfirsttext = True
    if not hasfirsttext:
        pages = Utils.convertPagToNum(pages)
        try:
            index = pages.index(number)
        except:
            index = 0
        for page in pages[index:]:
            if int(page) > int(number):
                textfragment = self.fragmenttxt(response, page)
                texto += self.extractother(textfragment, ref)
                # if it finds the other reference, break the loop
                if Utils.checkotherRefandnotOwn(textfragment, ref):
                    break
    res = Utils.removeHTMLtags(texto)
    return res

def fragmenttxt(self, response, number):
    pages = Selector(response).xpath('//p/a/@name').extract()
    text = Selector(response).xpath('//div[@class="texto_completo"]').extract()
    result = []
    control = False
    try:
        firstopage = Utils.getnumber(pages[0])
    except:
        firstopage = "1"
        control = True
    # keep only the page of the text that is actually useful to us
    splittext = text[0].split("<br><br>")
    for i in splittext:
        if Utils.checkPage(i, number):
            control = True
            continue
        elif int(number) < int(firstopage):
            control = True
        if control and Utils.checkPage(i, str(int(number) + 1)):
            break
        if control:
            result.append(i)
    return Utils.concatlist(result)