Python scrapy.selector module, Selector() usage examples

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use scrapy.selector.Selector().
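
Before diving into the project samples, here is a minimal, self-contained sketch of the two construction styles used throughout them: Selector(response) inside a spider callback and Selector(text=...) for a raw HTML string. The markup and variable names below are made up purely for illustration.

from scrapy.selector import Selector

# hypothetical markup standing in for response.body / response.text
html = '<div id="wrapper"><h1><span>A Book</span></h1><span class="rating_per">55%</span></div>'

sel = Selector(text=html)  # Selector(response) behaves the same way inside a callback

# xpath()/css() return SelectorList objects; extract() gives a list of strings,
# extract_first() a single string (or None when nothing matches)
name = sel.xpath("//div[@id='wrapper']/descendant::h1/span/text()").extract_first()
ratings = sel.css("span.rating_per::text").extract()

print(name)     # 'A Book'
print(ratings)  # ['55%']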

Project: crawl_web    Author: hanxlinsist
def parse_book(self, response):
        item = BookItem()
        sel = Selector(response)
        e = sel.xpath("//div[@id='wrapper']")
        item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
        item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
        item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
        item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
        item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()

        item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
        item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
        item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
        item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
        item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()

        item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()

        request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)  # follow the hot-comments page; parse_review fills in the rest of the item
        request.meta['item'] = item

        return request


Project: crawl_web    Author: hanxlinsist
def parse_item(self, response):
        item = BookItem()
        sel = Selector(response)
        e = sel.xpath("//div[@id='wrapper']")
        item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
        item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
        item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()


        item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
        item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
        item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
        item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
        item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
        item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
        item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()

        return item
Project: crawl_web    Author: hanxlinsist
def parse(self, response):
        item = BookItem()
        sel = Selector(response)
        e = sel.xpath("//div[@id='wrapper']")
        item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
        item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
        item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
        item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
        item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()

        item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
        item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
        item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
        item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
        item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()

        item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()

        request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)  # follow the hot-comments page; parse_review fills in the rest of the item
        request.meta['item'] = item

        return request


Project: tipi-engine    Author: CIECODE-Madrid
def parse(self, response):

        list_types = Selector(response).xpath('//div[@class="listado_1"]//ul/li/a')
        for types in list_types:
            href=  types.xpath("./@href").extract()
            text = types.xpath("./text()").extract()
            if Terms.filterBytype(text[0]):
                type = Terms.getType(text[0])
                initiative_url = Utils.createUrl(response.url,href[0])
                yield scrapy.Request(initiative_url,errback=self.errback_httpbin,callback=self.initiatives, meta={'type': type})
        """
        urlsa = ""
        urlsa = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Indice%20de%20Iniciativas?_piref73_1335503_73_1335500_1335500.next_page=/wc/servidorCGI&CMD=VERLST&BASE=IW12&PIECE=IWC2&FMT=INITXD1S.fmt&FORM1=INITXLUS.fmt&DOCS=100-100&QUERY=%28I%29.ACIN1.+%26+%28161%29.SINI."


        yield scrapy.Request(urlsa, errback=self.errback_httpbin, callback=self.oneinitiative,
                             meta={'type': u"Proposición no de Ley en Comisión"})
        """
Project: tipi-engine    Author: CIECODE-Madrid
def initiatives(self, response):
        type = response.meta['type']
        first_url = Selector(response).xpath('//div[@class="resultados_encontrados"]/p/a/@href').extract()[0]
        num_inis = Selector(response).xpath('//div[@class="SUBTITULO_CONTENIDO"]/span/text()').extract()
        split = first_url.partition("&DOCS=1-1")
        for i in range(1,int(num_inis[0])+1):
            new_url = split[0]+"&DOCS="+str(i)+"-"+str(i)+split[2]
            initiative_url = Utils.createUrl(response.url,new_url)
            CheckItems.addElement(initiative_url)

            # only request initiatives that are not already in the blacklist
            if not Blacklist.getElement(initiative_url):
                yield scrapy.Request(initiative_url, errback=self.errback_httpbin,
                                     callback=self.oneinitiative, meta={'type': type})
Project: job_scraper    Author: wlabatey
def parse_items(self, response):
        hxs = Selector(response)
        jobs = hxs.xpath('//div[contains(@class, "searchResultTitle")]')
        items = []
        for job in jobs:
            item = Job()
            item["title"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/text()').extract()[0].strip()
            company = job.xpath('.//p/span[contains(@id, "CONTACT_OFFICE")]/text()').extract()
            item["company"] = company[0].strip() if company else "n/a"
            item["location"] = job.xpath('.//p/span[contains(@id, "FREE_LOCATION")]/text()').extract()[0].strip()
            item["url"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/@href').extract()[0]
            item["date_posted"] = job.xpath('.//p/span[contains(@id, "POSTED_DATE")]/text()').extract()[0].strip()
            salary = job.xpath('.//p/span[contains(@id, "SALARY")]/text()').extract()
            item["salary"] = salary[0].strip() if salary else "n/a"
            item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d") 
            item["job_board"] = "dice"
            items.append(item)
        return items
Project: job_scraper    Author: wlabatey
def parse(self, response):
        hxs = Selector(response)
        jobs = hxs.xpath('//div[contains(@class, "-job-item")]')
        items = []
        for job in jobs:
            item = Job()
            item["title"] = job.xpath('.//a[@class="job-link"]/text()').extract()[0]
            item["company"] = job.xpath('.//div[@class="-name"]/text()').extract()[0].strip()
            item["location"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-location"]/text()').extract()[0].strip())
            item["url"] = job.xpath('.//a[@class="job-link"]/@href').extract()[0]
            item["date_posted"] = job.xpath('.//p[contains(@class, "-posted-date")]/text()').extract()[0].strip()
            item["salary"] = job.xpath('.//span[@class="-salary"]/text()').extract_first(default='n/a').strip()
            item["tags"] = job.css('.-tags p a.post-tag::text').extract()
            item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d") 
            item["job_board"] = "stackOverflow"
            items.append(item)
        return items
Project: rental    Author: meihuanyu
def parse_page(self, response):
        sel = Selector(text = response.body)
        infos = sel.xpath('//tr[@class="odd"]').extract()
        for info in infos:
            val = Selector(text = info)
            ip = val.xpath('//td[2]/text()').extract_first()
            port = val.xpath('//td[3]/text()').extract_first()
            country = val.xpath('//td[4]/a/text()').extract_first()
            anonymity = val.xpath('//td[5]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: rental    Author: meihuanyu
def parse_page(self, response):
        super(ProxyRoxSpider, self).parse_page(response)

        data = response.xpath('//tr[@class="fat"]').extract()
        for i, d in enumerate(data):
            sel = Selector(text = d)

            ip_port = sel.xpath('//td/a/text()').extract_first()
            ip = ip_port.split(':')[0]
            port = ip_port.split(':')[1]
            country = sel.xpath('//td/span[@class="region"]/text()').extract_first()
            anonymity = sel.xpath('//td/span/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name
            )

            self.add_proxy(proxy = proxy)
Project: rental    Author: meihuanyu
def parse_page(self, response):
        super(ProxyDBSpider, self).parse_page(response)

        data = response.xpath('//tbody/tr').extract()
        for i, d in enumerate(data):
            sel = Selector(text = d)

            ip_port = sel.xpath('//td/a/text()').extract_first()
            ip = ip_port.split(':')[0]
            port = ip_port.split(':')[1]
            country = sel.xpath('//td/img/@title').extract_first()
            anonymity = sel.xpath('//td/span[@class="text-success"]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name
            )

            self.add_proxy(proxy = proxy)
Project: cmc-transparencia-spider    Author: CodeForCuritiba
def parse_salaries(self, response):
        """
        The values for a person's salary live in a separate table
        on another page; this function grabs all the table headers
        and values and assigns them to the entity[entity_id].
        The id was passed in via response.meta.
        """

        item = VereadorItem()
        item['name'] = response.meta['name']
        item['id'] = response.meta['entity_id']
        item['mesano'] = response.meta['mesano']

        for salary in response.xpath('//*[@id="holerite"]').extract():
            selector = Selector(text=salary)
            table = selector.xpath('//tr[@class="holerite_valor"]/td/text()').extract()
            item["salary_gross"] = table[0]
            item["salary_liquid"] = selector.xpath('//tr[@class="holerite_valor"]/td/strong/text()').extract_first()
            return item
Project: makinami    Author: Coderhypo
def parse(self, response):
        sel = Selector(response)

        self.item = AccountItem()
        self.item['oj'] = 'poj'
        self.item['username'] = self.username
        if self.is_login:
            try:
                self.item['rank'] = sel.xpath('//center/table/tr')[1].\
                        xpath('.//td/font/text()').extract()[0]
                self.item['accept'] = sel.xpath('//center/table/tr')[2].\
                        xpath('.//td/a/text()').extract()[0]
                self.item['submit'] = sel.xpath('//center/table/tr')[3].\
                        xpath('.//td/a/text()').extract()[0]
                yield Request(self.accepted_url % self.username,
                              callback = self.accepted
                             )
                self.item['status'] = 'Authentication Success'
            except:
                self.item['status'] = 'Unknown Error'
        else:
            self.item['status'] = 'Authentication Failed'

        yield self.item
Project: Spider_Hub    Author: WiseDoge
def parse_item(self, response):
        item = DoubanmovieItem()
        sel = Selector(response)

        title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0]
        year = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract()[0]
        commit_num = sel.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()').extract()[0]
        star = sel.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()[0]
        director = sel.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()').extract()[0]
        screenwriter = sel.xpath(
            '//*[@id="info"]/span[2]/span[2]/a/text()').extract()[0]

        item['title'] = title
        item['date'] = year
        item['star'] = star
        item['commit_num'] = commit_num
        item['director'] = director
        item['screenwriter'] = screenwriter

        return item
Project: JianShu-Donate    Author: whatbeg
def parse(self, response):
        selector = Selector(response)
        articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')

        for article in articles:
            item = Jianshu2Item()
            url = article.xpath('div/h4/a/@href').extract()
            likeNum = article.xpath('div/div/span[2]/text()').extract()
            posturl = 'http://www.jianshu.com'+url[0]

            if len(likeNum) == 0:
                item['likeNum'] = 0
            else:
                item['likeNum'] = int(likeNum[0].split(' ')[-1])

            request = Request(posturl,callback=self.parse_donate)
            request.meta['item'] = item
            yield request

        next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract()[0]
        if next_link:
            next_link = self.url + str(next_link)
            yield Request(next_link,callback=self.parse)
Project: DoubanSpyder    Author: muyeby
def parse_article(self,response):
        hxs = Selector(response)
        keyword = response.meta['keyword']
        movie_name = hxs.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        movie_roles_paths = hxs.xpath('//*[@id="info"]/span[3]/span[2]')
        movie_roles = []
        for movie_roles_path in movie_roles_paths:
            movie_roles = movie_roles_path.select('.//*[@rel="v:starring"]/text()').extract()
        movie_classification= hxs.xpath('//span[@property="v:genre"]/text()').extract()
        douban_item = DoubanItem()
        douban_item['movie_keyword'] = keyword
        douban_item['movie_name'] = ''.join(movie_name).strip().replace(',',';').replace('\'','\\\'').replace('\"','\\\"').replace(':',';').replace(' ','')
        douban_item['movie_roles'] = ';'.join(movie_roles).strip().replace(',',';').replace('\'','\\\'').replace('\"','\\\"').replace(':',';')
        douban_item['movie_classification'] = ';'.join(movie_classification).strip().replace(',',';').replace('\'','\\\'').replace('\"','\\\"').replace(':',';')
        article_link = hxs.xpath('//*[@id="review_section"]/div/div/div/h3/a/@href').extract()
        tmp = "https://movie.douban.com/review/"
        for item in article_link:
            if tmp in item:
                yield Request(item,meta={'item': douban_item},callback=self.parse_item,cookies=[{'name': 'COOKIE_NAME','value': 'VALUE','domain': '.douban.com','path': '/'},])
Project: PythonCrawler-Scrapy-Mysql-File-Template    Author: lawlite19
def parse(self, response):
        se = Selector(response)  # build a Selector from the response (the modern replacement for HtmlXPathSelector)
        if(re.match("http://desk.zol.com.cn/fengjing/\d+x\d+/\d+.html", response.url)):  # only handle wallpaper list pages whose URL matches this pattern
            src = se.xpath("//ul[@class='pic-list2  clearfix']/li")  # every <li> under the picture-list <ul>

            for i in range(1, len(src) + 1):  # iterate over the <li> elements (XPath positions start at 1)
                imgURLs = se.xpath("//ul[@class='pic-list2  clearfix']/li[%d]/a/img/@src" % i).extract()  # thumbnail image URL
                titles = se.xpath("//ul[@class='pic-list2  clearfix']/li[%d]/a/img/@title" % i).extract()

                if imgURLs:
                    realUrl = imgURLs[0].replace("t_s208x130c5", "t_s2560x1600c5")  # swap the thumbnail size token for the full-resolution one
                    file_name = u"%s.jpg" % titles[0]  # build the file name from the picture title

                    path = os.path.join(r"D:\pics", file_name)  # save into the pics folder on the D: drive

                    type = sys.getfilesystemencoding()
                    print file_name.encode(type)

                    item = WebcrawlerScrapyItem()  # fill the item with the extracted name and url
                    item['name'] = file_name
                    item['url'] = realUrl
                    print item["name"], item["url"]

                    yield item  # hand the item over to the pipelines

                    urllib.urlretrieve(realUrl, path)  # also download the image straight to the local path

            all_urls = se.xpath("//a/@href").extract()  # every link on the page
            for url in all_urls:
                if url.startswith("/fengjing/1920x1080/"):  # only follow links to 1920x1080 wallpaper pages
                    yield Request("http://desk.zol.com.cn" + url, callback=self.parse)
Project: FIFA-Player-Ratings    Author: HashirZahir
def parse(self, response):
        #obtains links from page to page and passes links to parse_playerURL
        sel = Selector(response)    #define selector based on response object (points to urls in start_urls by default) 
        url_list = sel.xpath('//tbody/tr/td[@class="player"]/a/@href')   #obtain a list of href links that contain relative links of players

        for i in url_list:
            relative_url = self.clean_str(i.extract())    #i is a selector and hence need to extract it to obtain unicode object
            print urljoin(response.url, relative_url)   #urljoin is able to merge absolute and relative paths to form 1 coherent link
            req = Request(urljoin(response.url, relative_url),callback=self.parse_playerURL)   #pass on request with new urls to parse_playerURL
            req.headers["User-Agent"] = self.random_ua()    
            yield req

        next_url=sel.xpath('//div[@class="right-nav pull-right"]/a[@rel="next"]/@href').extract_first()  
        if(next_url):                                                                       #checks if next page exists
            clean_next_url = self.clean_str(next_url)
            reqNext = Request(urljoin(response.url, clean_next_url),callback=self.parse)    #calls back this function to repeat process on new list of links
            yield reqNext
Project: MonkeyKing_crawler_recommender    Author: BitTigerInst
def parse(self, response):
    page = Selector(response)

    hrefs = page.xpath('//h4[@class="title"]/a/@href')

    for href in hrefs:
      url = href.extract()
      yield scrapy.Request(url, callback=self.parse_item)

    div = page.xpath('//div[@class="page-ctrl ctrl-app"]')
    hrefs = div.xpath('.//a/@href').extract()

    for href in hrefs:
      url = response.urljoin(href)
      print url
      # yield scrapy.Request(url, self.parse, meta={
      #   'splash': {
      #     'endpoint': 'render.html',
      #     'args': {'wait': 0.5}
      #   }
      # })
Project: MonkeyKing_crawler_recommender    Author: BitTigerInst
def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()

    item['title'] = page.xpath('//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()').extract_first().encode('utf-8')
    item['url'] = response.url
    item['appid'] = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['intro'] = page.xpath('//meta[@name="description"]/@content').extract_first().encode('utf-8')

    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
      url = div.xpath('./p[@class="name"]/a/@href').extract_first()
      recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
      name = div.xpath('./p[@class="name"]/a/text()').extract_first().encode('utf-8')
      recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item
Project: MonkeyKing_crawler_recommender    Author: BitTigerInst
def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()

    item['title'] = page.xpath('//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()').extract_first().encode('utf-8')
    item['url'] = response.url
    item['appid'] = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['intro'] = page.xpath('//meta[@name="description"]/@content').extract_first().encode('utf-8')

    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
      url = div.xpath('./p[@class="name"]/a/@href').extract_first()
      recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
      name = div.xpath('./p[@class="name"]/a/text()').extract_first().encode('utf-8')
      recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item
Project: MonkeyKing_crawler_recommender    Author: BitTigerInst
def parse_page(self, response):
        page = Selector(response)
        lis = page.xpath('//ul[@class="applist"]/li')
        if not lis:
            return

        url_common = 'http://app.mi.com'

        for li in lis:
            item = XiaomiAppstoreCrawlerItem()
            item['title'] = li.xpath('./h5/a/text()').extract_first().encode('utf-8')
            url = li.xpath('./h5/a/@href').extract_first()
            appid = re.match(r'/detail/(.*)', url).group(1)
            item['appid'] = appid
            # import pudb; pu.db
            req = scrapy.Request(url_common + url, callback=self.parse_details)
            req.meta["item"] = item
            yield req
Project: first-crawler    Author: Xinghaoz
def parse_item(self, response):
        url_trim = response.url.split('?')[0]


        page = Selector(response)
        title = page.xpath('//span[@itemprop="name"]/text()').extract_first()
        images = page.xpath('//img[@id="J_BigImg"]/@src').extract_first()
        availability = page.xpath('//dd[@class="num clearfix"]/div[@class="J_GoodsStock goods-stock fl"]/text()').extract_first()
        status = response.status

        item = FashionItem()
        item['url'] = url_trim
        item['title'] = title.encode('utf-8')
        item['images'] = images
        item['availability'] = availability.encode('utf-8')
        item['status'] = status
        return item
Project: QQMusicSpider    Author: FanhuaandLuomu
def getMusListToFile(qqid, line, browser, filename):
    m_url = 'http://g.gogoqq.com/music.htm?uin=%s' % qqid
    browser.get(m_url)
    #time.sleep(2)
    WebDriverWait(browser, 2, 0.5).until(lambda item:item.find_element_by_xpath('//*[@id="list"]').is_displayed())
    time.sleep(1)
    liList = Selector(text = browser.page_source).xpath(u'//*[@id="list"]/li/a')
    mList = []
    for m in liList:
        mus = m.xpath('text()')[0].extract()
        print mus  
        mList.append(mus)
    f = open(filename, 'a')
    string = line + '  #music#:' + '##m##'.join(mList)
    f.write(string + '\n')
    f.close()
Project: TvLive    Author: Rano1
def parse(self, response):
        sel = Selector(response)
        movie_name = sel.xpath("//div[@class='pl2']/a/text()[1]").extract()
        movie_url = sel.xpath("//div[@class='pl2']/a/@href").extract()
        movie_star = sel.xpath("//div[@class='pl2']/div/span[@class='rating_nums']/text()").extract()

        # item = DoubanNewMovieItem()
        item = {}
        # item['movie_name'] = [n.encode('utf-8') for n in movie_name]
        item['movie_name'] = movie_name
        item['movie_star'] = [n for n in movie_star]
        item['movie_url'] = [n for n in movie_url]

        yield item

        print(item['movie_name'], item['movie_star'], item['movie_url'])
Project: spider_scrapy_lianjia    Author: stamhe
def parse_category(self, response):
        self.log("=================================================")
        sel = Selector(response)
        shop_type = response.meta['shop_type']
        city_id = response.meta['city_id']

        cat_url = response.url
        http_status = response.status
        self.log("http_url = %s" % cat_url)
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        self.log("shop_type = %s" % shop_type)
        items = []
        shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
    self.log("shop_list_len = %d" % len(shop_list))
        for shop in shop_list:
            uri = shop.xpath('a/@href').extract()[0]
            self.log("page_uri = %s" % uri)
            yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
Project: spider_scrapy_lianjia    Author: stamhe
def parse_category(self, response):
        self.log("=================================================")
        sel = Selector(response)
        shop_type = response.meta['shop_type']
        city_id = response.meta['city_id']

        cat_url = response.url
        http_status = response.status
        self.log("http_url = %s" % cat_url)
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        self.log("shop_type = %s" % shop_type)
        items = []
        #shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
        region_list = sel.xpath('//div[@id="region-nav"]/a')
    self.log("region_list_len = %d" % len(region_list))
        for region in region_list:
            uri = region.xpath('@href').extract()[0]
            self.log("page_uri = %s" % uri)
            #yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
            yield scrapy.Request(uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
Project: spider_scrapy_lianjia    Author: stamhe
def parse(self, response):
        sel = Selector(response)

        cat_url = response.url
        http_status = response.status
        self.log("http_url = %s" % cat_url)
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        item = SpiderDianpingXmtItem()
        item['chenshi_name']    = "" 
        item['shop_type']       = 0
        item['shop_url']        = ""
        item['shop_name']       = ""
        item['shop_addr']       = ""
        item['shop_mobile']     = ""
        item['shop_intro']      = ""

        return item
Project: spider_scrapy_lianjia    Author: stamhe
def parse(self, response):
        sel = Selector(response)
        if response.meta.has_key("shop_type"):
            shop_type = response.meta['shop_type']
        else:
            shop_type = self.shop_type_map[response.url]['shop_type']

        if response.meta.has_key("city_id"):
            city_id = response.meta['city_id']
        else:
            city_id = self.shop_type_map[response.url]['city_id']

        cat_url = response.url
        http_status = response.status
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        self.log("shop_type = %s" % shop_type)
        items = []
        shop_list = sel.xpath('//div[@id="region-nav"]/a')
        for shop in shop_list:
            uri = shop.xpath('@href').extract()[0]
            self.log("page_uri = %s" % uri)
            yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
Project: spider_scrapy_lianjia    Author: stamhe
def parse(self, response):
        sel = Selector(response)
        xiaoqu_uri = sel.xpath('//span[@class="title"]/a/@href').extract()[0]
        xiaoqu_list = xiaoqu_uri.split('/')
        xiaoqu_id   = xiaoqu_list[2]
        items = []
        house_lists = sel.xpath('//div[@class="list-wrap"]/ul[@class="house-lst"]/li')
        for house in house_lists:
            item = SpiderScrapyLianjiaItem()
            item['xiaoqu_id']   = xiaoqu_id
            item['house_id']    = house.xpath('@data-id').extract()[0]
            item['title']       = house.xpath('div[@class="info-panel"]/h2/a/text()').extract()[0]
            item['price']       = house.xpath('div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]/span/text()').extract()[0]
            item['view_count']  = house.xpath('div[@class="info-panel"]/div[@class="col-2"]/div[@class="square"]/div/span/text()').extract()[0]
            #item['size']        = house.xpath('div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/span/text()').extract()
            items.append(item)

        return items
Project: cl1024    Author: wuchujiecode
def parse_item(self, response):
        item = Cl1024Item()
        item['cl_title'] = response.meta['cl_title']
        item['cl_url'] = response.meta['cl_url']
        item['cl_bankuai'] = response.meta['cl_bankuai']
        item['posted'] = response.meta['posted']
        # redownloaded = re.search('downloaded:(.+?)<BR>', response.body)
        # downloaded = redownloaded[12:-4]
        sel = Selector(response)
        downloaded = sel.xpath('//td/table/tr/td/text()').extract()[2]
        item['torrent_downloaded'] = downloaded[17:]
        item['torrent_url'] = response.url
        ref = sel.xpath('//input[@name="ref"]/@value').extract_first()
        reff = sel.xpath('//input[@name="reff"]/@value').extract_first()

        dl = ('http://www.rmdown.com/download.php?ref=%s&&reff=%s&submit=download' % (ref, reff)).encode('utf-8')
        item['torrent_download_urls'] = dl

        yield item
Project: cl1024    Author: wuchujiecode
def get_torrent(self, response):
        sel = Selector(response)
        cl_title = sel.xpath('//td[@class="h"]/text()[2]').extract_first()
        cl_bankuai = sel.xpath('//div[@class="t3"]/table/tr/td/b/a[2]/text()').extract_first()
        cl_url = response.url
        torrent = re.search('rmdown\.com(.+?)</a>', response.body)
        torrent_url = 'http://www.' + torrent.group()[:-4]
        posted = sel.xpath('//div[@class="tipad"]/text()').extract()[1]
        posted = posted.encode('utf-8')[9:-7]
        yield Request(
            url=torrent_url,
            meta={
                'cl_title': cl_title,
                'cl_bankuai': cl_bankuai,
                'cl_url': cl_url,
                'posted': posted,
            },
            callback=self.parse_item,
            dont_filter=True)
Project: Malicious_Domain_Whois    Author: h-j-13
def get_first_page(self, response):
        request_state = self.if_too_many_request(response.body, 'first_page')
        registrant = response.meta['registrant']
        if (request_state == False):
            s = Selector(text=response.body)
            content = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr/td[@class = "lf"]/a/img[@alt="..."]/../@href'
            domain_url_list = s.xpath(content).extract()
            content2 = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr'
            s_list = s.xpath(content2)
            domain_url_list2 = []
            for s in s_list:
                url2 = s.xpath('td[@class = "lf"]/a/img[@alt="..."]/../@href').extract()[0]
                domain_url_list2.append(url2)
            for url in domain_url_list2:
                cookie = get_cookie()
                url = "https://www.benmi.com" + url
                item = RwhoisRegistrantItem()
                item['registrant'] = registrant
                yield scrapy.Request(url, headers=self.head, meta={'cookie': cookie, 'item': item},
                                     cookies={"__cfduid": cookie[1], "cf_clearance": cookie[2],
                                              "BenmiUserInfo2": "Benmi-UN=hahaha321",
                                              "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "},
                                     callback=self.get_domain_name, dont_filter=True)
Project: Malicious_Domain_Whois    Author: h-j-13
def get_first_page(self, response):
        request_state = self.if_too_many_request(response.body, 'first_page')
        registrant = response.meta['registrant']
        if (request_state == False):
            s = Selector(text=response.body)
            content = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr/td[@class = "lf"]/a/img[@alt="..."]/../@href'
            domain_url_list = s.xpath(content).extract()
            content2 = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr'
            s_list = s.xpath(content2)
            domain_url_list2 = []
            for s in s_list:
                url2 = s.xpath('td[@class = "lf"]/a/img[@alt="..."]/../@href').extract()[0]
                domain_url_list2.append(url2)
            for url in domain_url_list2:
                cookie = get_cookie()
                url = "https://www.benmi.com" + url
                item = RwhoisRegistrantItem()
                item['registrant'] = registrant
                yield scrapy.Request(url, headers=self.head, meta={'cookie': cookie, 'item': item},
                                     cookies={"__cfduid": cookie[1], "cf_clearance": cookie[2],
                                              "BenmiUserInfo2": "Benmi-UN=hahaha321",
                                              "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "},
                                     callback=self.get_domain_name, dont_filter=True)
Project: Malicious_Domain_Whois    Author: h-j-13
def get_first_page(self, response):
        request_state = self.if_too_many_request(response.body, 'first_page')
        registrant = response.meta['registrant']
        if (request_state == False):
            s = Selector(text=response.body)
            content = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr/td[@class = "lf"]/a/img[@alt="..."]/../@href'
            domain_url_list = s.xpath(content).extract()
            content2 = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr'
            s_list = s.xpath(content2)
            domain_url_list2 = []
            for s in s_list:
                url2 = s.xpath('td[@class = "lf"]/a/img[@alt="..."]/../@href').extract()[0]
                domain_url_list2.append(url2)
            for url in domain_url_list2:
                cookie = get_cookie()
                url = "https://www.benmi.com" + url
                item = RwhoisRegistrantItem()
                item['registrant'] = registrant
                yield scrapy.Request(url, headers=self.head, meta={'cookie': cookie, 'item': item},
                                     cookies={"__cfduid": cookie[1], "cf_clearance": cookie[2],
                                              "BenmiUserInfo2": "Benmi-UN=hahaha321",
                                              "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "},
                                     callback=self.get_domain_name, dont_filter=True)
Project: Spider    Author: iamyaojie
def parse(self, response):

        item = DoubanspiderItem()
        selector = Selector(response)
        Movies = selector.xpath('//div[@class="info"]')
        for eachMovie in Movies:
            title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
            star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title
            item['movieInfo'] = ';'.join(movieInfo)
            item['star'] = star
            item['quote'] = quote
            # yield the item for this movie
            yield item
        nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextLink:
            nextLink = nextLink[0]
            print(nextLink)
            yield Request(self.url + nextLink,callback=self.parse)
Project: scrapy-streaming    Author: scrapy-plugins
def response_parse(response):
    global pending_requests
    # using scrapy selector to extract data from the html
    selector = Selector(text=response['body'])
    # get the url of repositories
    for href in selector.css("#subcategories-div > section > div > div.cat-item > a::attr('href')"):
        # we count the number of requests using this var
        pending_requests += 1
        # open a new request
        write_line('''
            {
                "type": "request",
                "id": "category",
                "url": "http://www.dmoz.org%s"
            }
        ''' % href.extract())
Project: scrapy-streaming    Author: scrapy-plugins
def response_category(response):
    global pending_requests
    # this response is no longer pending
    pending_requests -= 1

    # using scrapy selector
    selector = Selector(text=response['body'])
    # get div with link and title
    divs = selector.css('div.title-and-desc')

    for div in divs:
        url = div.css("a::attr('href')").extract_first();
        title = div.css("a > div.site-title::text").extract_first();
        result[title] = url

    # if finished all requests, we can close the spider
    if pending_requests == 0:
        # serialize the extracted data and close the spider
        open('outputs/dmoz_data.json', 'w').write(json.dumps(result))
        write_line('{"type": "close"}')
Project: fuli    Author: nixir
def parse(self, response):
        selector = Selector(response=response)
        articles = selector.xpath('//*[@id="main"]/*/div[@class="post-box"]')
        timeline = db.get_collection('timeline')
        for item in articles:
            try:
                title = item.xpath('div[@class="post-header"]/p/a/text()').extract()[0]
                # link URL
                url = item.xpath('div[@class="post-header"]/p/a/@href').extract()[0]
                description = item.xpath('*/div[@class="post-expert"]/text()').extract()[0]
                description = self._join_text(description)
                # image URL
                img = item.xpath('*/div[@class="post-info"]/a/img/@data-original').extract()[0]
                # YYYY-MM-DD
                #date = item.xpath('*/div[@class="post-date"]/text()').extract()[0].strip()
                date = item.xpath('div[@class="post-content"]/div[@class="post-footer"]/div[@class="post-date"]/text()').extract()[0]
                date = datetime.strptime(date, '%Y-%m-%d')
                self.save(title=title, url=url, description=description,
                          img=img, date=date)
            except IndexError:
                continue

        next_page = selector.xpath(u'//*/div[@class="page-navigator"]/li/a[text()="下一页 »"]/@href').extract()[0]
        yield Request(response.urljoin(next_page), self.parse)
Project: WebHubBot    Author: xiyouMc
def parse_ph_key(self, response):
        selector = Selector(response)
        logging.debug('request url:------>' + response.url)
        # logging.info(selector)
        divs = selector.xpath('//div[@class="phimage"]')
        for div in divs:
            viewkey = re.findall('viewkey=(.*?)"', div.extract())
            # logging.debug(viewkey)
            yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
                          callback=self.parse_ph_info)
        url_next = selector.xpath(
            '//a[@class="orangeButton" and text()="Next "]/@href').extract()
        logging.debug(url_next)
        if url_next:
            # if self.test:
            logging.debug(' next page:---------->' + self.host + url_next[0])
            yield Request(url=self.host + url_next[0],
                          callback=self.parse_ph_key)
            # self.test = False
Project: WebHubBot    Author: xiyouMc
def parse_ph_info(self, response):
        phItem = PornVideoItem()
        selector = Selector(response)
        _ph_info = re.findall('flashvars_.*?=(.*?);\n', selector.extract())
        logging.debug('PH video info JSON:')
        logging.debug(_ph_info)
        _ph_info_json = json.loads(_ph_info[0])
        duration = _ph_info_json.get('video_duration')
        phItem['video_duration'] = duration
        title = _ph_info_json.get('video_title')
        phItem['video_title'] = title
        image_url = _ph_info_json.get('image_url')
        phItem['image_url'] = image_url
        link_url = _ph_info_json.get('link_url')
        phItem['link_url'] = link_url
        quality_480p = _ph_info_json.get('quality_480p')
        phItem['quality_480p'] = quality_480p
        logging.info('duration:' + duration + ' title:' + title + ' image_url:'
                     + image_url + ' link_url:' + link_url)
        yield phItem
Project: Malwr    Author: ydc1992
def parse_downurl(self,response):
        try:
            antivirus1 =response.css("#static_antivirus").extract()[0]
            antivirus = Selector(response).css("#static_antivirus").extract()[0]
            # grab the "Static Analysis -> Antivirus" results table
            antiresult  = re.findall("((Microsoft|Kaspersky|ESET\-NOD32)</td>\n\s*<td>\n\s*<span class=\"text\-error\")",antivirus.encode("utf-8"),re.S)
            # if none of Microsoft / Kaspersky / ESET-NOD32 flagged the sample as malicious, skip it
            if antiresult == []:
                return
            # download link for the sample file
            url = response.xpath("//a[contains(@class,'btn-primary')]/@href").extract()[0].encode('utf-8')
            url = urlparse.urljoin("https://malwr.com",url)

            item = MalwrItem()
            item['file_urls'] = [url]
            return item
        except Exception,e:
            pass
        return
Project: Android-Repackaged-App-Detection-System    Author: M157q
def parse_xpath(self, response, xpath):
        appItemList = []
        sel = Selector(response)
        for url in sel.xpath(xpath).extract():
            url = urljoin(response.url, url)
            log.msg("Catch an application: %s" % url, level=log.INFO)
            appItem = AppItem()
            appItem['url'] = url
            appItemList.append(appItem)
        return appItemList

    #def parse_anzhi(self, response, xpath):
    #    appItemList = []
    #    hxs = HtmlXPathSelector(response)
    #    for script in hxs.select(xpath).extract():
    #        id = re.search(r"\d+", script).group()
    #        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
    #        appItem = AppItem()
    #        appItem['url'] = url
    #        appItemList.append(appItem)
    #    return appItemList
Project: Hanhan_NLP    Author: hanhanwu
def parse_articles(self, response):
        article_ptn = "http://www.theglobeandmail.com/opinion/(.*?)/article(\d+)/"
        resp_url = response.url
        article_m = re.match(article_ptn, resp_url)
        article_id = ''
        if article_m != None:
            article_id = article_m.group(2)

        if article_id == '32753320':
            print('***URL***', resp_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = Selector(text=response.text).xpath('//*[@id="content"]/div[1]/article/div/div[3]/div[2]').extract()


            if text:
                print("*****in Spider text*****", soup.title.string)
                yield {article_id: {"title": soup.title.string, "link": resp_url, "article_text": text}}
                comments_link = response.url + r'comments/'
                if comments_link == 'http://www.theglobeandmail.com/opinion/a-fascists-win-americas-moral-loss/article32753320/comments/':
                    yield Request(comments_link, callback=self.parse_comments)
Project: RealSpider    Author: RealSanqian
def parse(self, response):
        sel = Selector(response)

        #items = []
        # URL and title of the current article
        item = CSDNBlogItem()

        article_url = str(response.url)
        article_name = sel.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()

        item['article_name'] = [n.encode('utf-8') for n in article_name]
        item['article_url'] = article_url.encode('utf-8')

        yield item

        # URL of the next article to crawl
        urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()
        for url in urls:
            print url
            url = "http://blog.csdn.net" + url
            print url
            yield Request(url, callback=self.parse)
Project: Sneaker-Notify    Author: YuLin12345
def parse(self, response):
        while True:
            try:
                products = Selector(response).xpath('//div[@class="grid-uniform grid--center wide--grid--middle"]//div[contains(@class,"grid__item")]')

                for product in products:
                    item = KithItem()
                    item['name'] = product.xpath('div/div/a[1]/img/@alt').extract()[0]
                    item['link'] = "https://kith.com" + product.xpath('div/div/a[1]/@href').extract()[0]
                    # item['image'] = "https:" + product.xpath('div/div/a[1]/img/@src').extract()[0]
                    item['size'] = "https://kith.com/cart/add.js?id=" + product.xpath('div/div/a[2]/div/*/div[1]/@data-value').extract()[0] + "&quantity=1"
                    yield item

                yield Request(KithURL, callback=self.parse, dont_filter=True, priority=0)   

            except:
                pass
Project: ArticleSpider    Author: mtianyan
def crawl_ips():
    # crawl the free proxy IP list from xicidaili.com
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    for i in range(1568):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)

        selector = Selector(text=re.text)
        all_trs = selector.css("#ip_list tr")


        ip_list = []
        for tr in all_trs[1:]:
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            all_texts = tr.css("td::text").extract()

            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]

            ip_list.append((ip, port, proxy_type, speed))

        for ip_info in ip_list:
            cursor.execute(
                "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format(
                    ip_info[0], ip_info[1], ip_info[3]
                )
            )

            conn.commit()
Project: tipi-engine    Author: CIECODE-Madrid
def finishtext(self,response):
        finishitem = response.meta['fisnishitem']
        finishitem['contenido'] = []

        text = Selector(response).xpath('//div[@class="texto_completo"]').extract()[0]
        text= self.extractbyref(text=text,ref=finishitem['ref'])
        if text=="":
            try:
                text += Selector(response).xpath('//div[@class="texto_completo"]').extract()[0]
            except:
                CheckSystem.systemlog("No tiene texto para 'TEXTOFINAL' " + response.url + "ITEM URL "+finishitem['url'])

        finishitem['contenido'].append(Utils.removeHTMLtags(text))
        yield finishitem
Project: tipi-engine    Author: CIECODE-Madrid
def searchDS(self,  response , number = None ,ref = None , name = None):
        try:
            text = Selector(response).xpath('//div[@class="texto_completo"]').extract()
            return Utils.removeForDS(text[0])
        except:
            return "URL rota"
Project: tipi-engine    Author: CIECODE-Madrid
def extracttext(self, response, number, ref):
        textfragment = self.fragmenttxt(response,number)
        res = ""
        # it is already the whole text, no need to split it
        if not Utils.checkownRef(textfragment,ref):
            return Utils.removeHTMLtags(textfragment)

        texto = self.extractbyref(textfragment,ref,number)
        pages = Selector(response).xpath('//a/@name').extract()

        # start from the index
        # search for more text
        hasfirsttext = False
        if Utils.isDiferentFirstTime(textfragment,ref):
            hasfirsttext=True
        if not hasfirsttext:
            pages = Utils.convertPagToNum(pages)
            try:
                index = pages.index(number)
            except:
                index=0
            for page in pages[index:]:
                if int(page) > int(number):
                    textfragment = self.fragmenttxt(response, page)
                    texto += self.extractother(textfragment, ref)
                    # if the other reference is found, break out of the loop
                    if Utils.checkotherRefandnotOwn(textfragment,ref):
                        break
        res = Utils.removeHTMLtags(texto)

        return res
Project: tipi-engine    Author: CIECODE-Madrid
def fragmenttxt(self, response,number):
        pages = Selector(response).xpath('//p/a/@name').extract()
        text = Selector(response).xpath('//div[@class="texto_completo"]').extract()
        result = []
        control = False


        try:
            firstopage = Utils.getnumber(pages[0])
        except:
            firstopage= "1"
            control = True

        # keep only the page of the text that is useful to us
        splittext = text[0].split("<br><br>")
        for i in splittext:
            if Utils.checkPage(i,number):
                control = True
                continue
            elif int(number) < int(firstopage):
                control = True
            if control  and Utils.checkPage(i,str(int(number)+1)):
                break
            if control:
                result.append(i)


        return Utils.concatlist(result)