The following 50 code examples, extracted from open-source Python projects, illustrate how to use lxml.html.fromstring().
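All of the examples below follow the same basic pattern: fetch an HTML document, parse it with lxml.html.fromstring(), and query the resulting element tree with XPath or CSS selectors. As a minimal sketch of that pattern before the project examples (the function name, URL, and XPath expression here are placeholders for illustration, not taken from any of the projects below):

import requests
from lxml import html

def fetch_headings(url='https://example.com'):
    # Download the page; .content returns bytes, which lxml can decode itself
    response = requests.get(url, timeout=10)
    # Parse the HTML into an lxml HtmlElement tree
    tree = html.fromstring(response.content)
    # XPath queries return lists of elements, attribute values, or text nodes
    return tree.xpath('//h1/text()')
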
def main():
    for url in url_list:
        try:
            r = requests.get(url)
        except:
            continue
        tree = html.fromstring(r.text)
        script = tree.xpath('//script[@language="javascript"]/text()')[0]
        json_string = regex.findall(script)[0]
        json_data = json.loads(json_string)
        next_page_url = tree.xpath('//footer/a/@href')
        links = [domain + x['nodeRef'] for x in json_data]
        for link in links:
            extract(link)

def slack(text: hug.types.text):
    """Returns JSON containing an attachment with an image url for the Slack integration"""
    title = text
    if text == 'top250':
        top250_res = requests.get(IMDB_URL + '/chart/toptv', headers={'Accept-Language': 'en'})
        top250_page = html.fromstring(top250_res.text)
        candidates = top250_page.xpath('//*[@data-caller-name="chart-top250tv"]//tr/td[2]/a')
        title = random.choice(candidates).text
    return dict(
        response_type='in_channel',
        attachments=[
            dict(image_url=GRAPH_URL + f'/graph?title={quote(title)}&uuid={uuid.uuid4()}')
        ]
    )

def sns_notification(body):
    json_body = body.decode('utf8')
    js = json.loads(json_body.replace('\n', ''))
    if js["Type"] == "Notification":
        arg_info = js["Message"]
        arg_info = json.loads(arg_info)
        content = arg_info['content']
        subject = arg_info['mail']['commonHeaders']['subject']
        html_content = content.partition('Content-Type: text/html; charset=UTF-8')[2]
        if 'Content-Transfer-Encoding' in html_content:
            html_content = html_content.partition('Content-Transfer-Encoding: quoted-printable')[2]
        text = html_content.replace('\r\n', '')
        table = html.fromstring(text)
        content = ''
        for item in table:
            if item.text:
                content += item.text.strip()
        mail_content = str(content)
        from_mail = arg_info['mail']['source']
        to_mail = arg_info['mail']['destination'][0]
        hash_code = arg_info['mail']['destination'][0].split('@')[0]
        return subject, from_mail, to_mail, hash_code, mail_content

def scrape_mtgs_images(url='http://www.mtgsalvation.com/spoilers/183-hour-of-devastation',
                       mtgscardurl='http://www.mtgsalvation.com/cards/hour-of-devastation/',
                       exemptlist=[]):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = {}
    cardstree = tree.xpath('//*[contains(@class, "log-card")]')
    for child in cardstree:
        if child.text in exemptlist:
            continue
        childurl = mtgscardurl + child.attrib['data-card-id'] + '-' + child.text.replace(
            ' ', '-').replace("'", "").replace(',', '').replace('-//', '')
        cardpage = requests.get(childurl)
        tree = html.fromstring(cardpage.content)
        cardtree = tree.xpath('//img[contains(@class, "card-spoiler-image")]')
        try:
            cardurl = cardtree[0].attrib['src']
        except:
            cardurl = ''
        cards[child.text] = {
            "url": cardurl
        }
        time.sleep(.2)
    return cards

def scrape_masterpieces(url='http://www.mtgsalvation.com/spoilers/181-amonkhet-invocations',
                        mtgscardurl='http://www.mtgsalvation.com/cards/amonkhet-invocations/'):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardstree = tree.xpath('//*[contains(@class, "log-card")]')
    for child in cardstree:
        childurl = mtgscardurl + \
            child.attrib['data-card-id'] + '-' + child.text.replace(' ', '-')
        cardpage = requests.get(childurl)
        tree = html.fromstring(cardpage.content)
        cardtree = tree.xpath('//img[contains(@class, "card-spoiler-image")]')
        try:
            cardurl = cardtree[0].attrib['src']
        except:
            cardurl = ''
        card = {
            "name": child.text,
            "url": cardurl
        }
        cards.append(card)
    return cards

def parse_lista_diputados(response):
    tree = fromstring(response.content)
    # list of deputies
    diputados = tree.xpath('//div[@class="listado_1"]/ul/li/a/@href')
    for diputado in diputados:
        diputado_url = urljoin(response.url, diputado)
        response = requests.get(diputado_url)
        parse_diputado(response)
    # next page
    pagina_siguiente = tree.xpath('//a[contains(., "Página Siguiente")]/@href')
    if pagina_siguiente:
        pagina_siguiente_url = pagina_siguiente[0]
        response = requests.get(pagina_siguiente_url)
        parse_lista_diputados(response)

def set_vokrugsveta_wallpaper():
    try:
        r = requests.get(URL04)
        if r.status_code == 200:
            doc = fromstring(r.text)
            results = doc.cssselect('a.article__pic')
            url = 'http://www.vokrugsveta.ru/' + results[0].get('href')
            print(url)
            r = requests.get(url, stream=True)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('img')
                for index, result in enumerate(results):
                    print(index, result.get('src'))
                i_url = 'http://www.vokrugsveta.ru/' + results[2].get('src')
                if download(i_url) is True:
                    set_background(comun.POTD)
                    print(url)
    except Exception as e:
        print(e)

def get_searx_version(response_container):
    response_html = response_container.content.decode()
    try:
        dom = html.fromstring(response_html)
    except etree.XMLSyntaxError:
        # not a valid HTML document
        # TODO workaround with regex ?
        return ''

    searx_full_version = extract_text_from_dom(dom, "/html/head/meta[@name='generator']/@content")
    if searx_full_version is None:
        searx_version = ''
    else:
        s = searx_full_version.split('/')
        if len(s) == 2:
            searx_version = s[1]
        else:
            searx_version = searx_full_version
    return searx_version

def extract_news(news_url):
    # Fetch html
    session_requests = requests.session()
    response = session_requests.get(news_url, headers=getHeaders())
    news = {}
    try:
        # Parse html
        tree = html.fromstring(response.content)
        # Extract information
        news = tree.xpath(GET_CNN_NEWS_XPATH)
        news = ''.join(news)
    except Exception as e:
        print(e)
        return {}
    return news

def get(self, user_id):
    r = requests.get('http://grouple.co/user/%s/bookmarks' % user_id)
    tree = html.fromstring(r.text)
    tds = tree.xpath('//table')[0].xpath('//tr')[1:]
    mangas = list()
    for o in tds:
        item = o.xpath('.//a')[0]
        manga = {
            'name': item.xpath('./text()')[0],
            'path': item.xpath('./@href')[0],
            'summary': item.xpath('./@title')[0].split(': ', 1)[-1]
        }
        item = item.xpath('../a')[1]
        manga.update({
            'preview': item.xpath('./@rel')[0],
            'id': java_hash_code(manga['path']),
            'provider': provider_name(manga['path'])
        })
        if manga['provider'] is not None:
            mangas.append(manga)
    return {'all': mangas}

def procura_emprego():
    busca = raw_input("[+] - Digite o nome da vaga ou uma palavra-chave: ").replace(' ', '+').lower()
    url = "http://empregacampinas.com.br/page/1/?s=" + busca
    #prox_pagina = 0
    while True:
        try:
            r = requests.get(url, timeout=2)
            tree = html.fromstring(r.content)
            vagas = tree.xpath('//*[@id="article"]/div/div/div/div/a/h2/text()')
            link = tree.xpath('//*[@id="article"]/div/div/div/div/a[@title]/@href')
            if len(vagas) > 1:
                qtd_vagas = len(vagas) - 1
            else:
                qtd_vagas = len(vagas)
            pagina = url.split('/')[4]
            info_vaga(qtd_vagas, pagina, vagas, link)
            # grab the next page URL
            url = tree.xpath('//*[@class="nextpostslink"]/@href')[0]
        except:
            menu()

def scrap_twitlonger(twitlonger):
    '''
    Takes a twitlonger post ID, scrapes the body of the post and then
    returns a string depending on the contents of the post.
    If the hour is stated in said post, it's added.
    If it's not, then it's implied it's current time.
    Note to self: Implement GMT - whatever our president decides to change it to.
    '''
    page = requests.get('http://www.twitlonger.com/show/%s' % twitlonger)
    tree = html.fromstring(page.content)
    texto = tree.xpath('/html/body/div[2]/div[1]/div[3]/div/p[1]/text()')
    hora = re.search('[0-9]+:[0-9]+', texto[0])
    circuitos = texto[0].split(str('detallados a continuación: ').decode('utf-8'))[1].split(str(' #ElNiñoNoEsJuego').decode('utf-8'))[0]
    if hora:
        return "La luz se ira a las " + hora.group(0) + " en " + circuitos
    else:
        hora = re.search('En momentos', texto[0])
        if hora:
            return "La luz se ira a las " + str(datetime.datetime.now().time()) + " en " + circuitos

def lxml_test():
    url = "http://www.caixunzz.com"
    req = urllib2.Request(url=url)
    resp = urllib2.urlopen(req)
    #print resp.read()
    '''
    parse_body=html.fromstring(resp.read())
    href=parse_body.xpath('//a[@class="label"]/@href')
    print href
    #not working from above
    '''
    tree = etree.HTML(resp.read())
    href = tree.xpath('//a[@class="label"]/@href')
    #print href.tag
    for i in href:
        #print html.tostring(i)
        #print type(i)
        print i
    print type(href)
    #not working yet

def scrape_url(url):
    #url = 'api-ref-compute-v2.1.html'
    page = requests.get('http://developer.openstack.org/' + url)
    tree = html.fromstring(page.content)
    #Create a list of HTTP verbs
    verbs = tree.xpath('//a[@class="operation-anchor"]/following::span[1]/text()')
    operations = tree.xpath('//a[@class="operation-anchor"]/following::div[1]/text()')
    #Match up Verbs and Operations and output a printed list
    methods = zip(verbs, operations)
    print len(verbs)
    print len(operations)
    if len(verbs) == len(operations):
        for verbs, operations in methods:
            print verbs + ' ' + operations
    else:
        print "Number of verbs doesn't match number of operations for ", page.url

def __init__(self):
    self.name = SOURCE_NAME
    _file, r = conf.getFeedData(SOURCE_NAME, SOURCE_FILE, unpack=False)
    zipobj = zipfile.ZipFile(BytesIO(_file))
    self.cves = defaultdict(dict)
    for filename in zipobj.namelist():
        with zipobj.open(filename) as infile:
            page = fromstring(infile.read().decode("utf-8"))
            vendor = page.xpath("//table[1]//tr[1]//td[2]")
            if vendor:
                vendor = vendor[0].text.lower()
            rows = page.xpath("//table[2]//tr//td")
            # CVE - Source ID
            IDs = [[rows[i].text, [x.text for x in rows[i+1].iterchildren()]]
                   for i in range(0, len(rows), 2)]
            for e in IDs:
                vendorID = e[0] if not e[0].startswith(vendor.upper()+':') else e[0][len(vendor)+1:]
                for cve in e[1]:
                    if vendor not in self.cves[cve]:
                        self.cves[cve][vendor] = []
                    if vendorID not in self.cves[cve][vendor]:
                        self.cves[cve][vendor].append(vendorID)

def get_html_text(url):
    response = requests.get(url)
    origin_text = response.text
    origin_text = re.sub(r'<script.*?>.*?</script>', '', origin_text, flags=re.I | re.M | re.DOTALL)
    origin_text = re.sub(r'<style.*?>.*?</style>', '', origin_text, flags=re.I | re.M | re.DOTALL)

    doc = html.fromstring(origin_text)
    text = doc.xpath('//body//text()')
    text = [i.strip() for i in text if i.strip()]
    text = ' '.join(text)

    seg = jieba.cut(text)
    stopwords = read_stopwords('./utils/stopwords.txt')  # callable read_stopwords()
    seg = [i.strip() for i in seg if i.strip() and not i.strip().isdigit() and i.strip() not in stopwords]
    seg = ' '.join(seg)
    return seg

def _get_quotes(self):
    '''Gets book's quote data'''
    if self._page_source is None:
        return

    quotes_page = self._page_source.xpath('//a[@class="actionLink" and contains(., "More quotes")]')
    quotes = []
    if len(quotes_page) > 0:
        resp = open_url(self._connection, quotes_page[0].get('href'))
        if not resp:
            return
        quotes_page = html.fromstring(resp)
        if quotes_page is None:
            return
        for quote in quotes_page.xpath('//div[@class="quoteText"]'):
            quotes.append(re.sub(r'\s+', ' ', quote.text).strip().decode('ascii', 'ignore'))
    else:
        for quote in self._page_source.xpath('//div[@class=" clearFloats bigBox" and contains(., "Quotes from")]//div[@class="bigBoxContent containerWithHeaderContent"]//span[@class="readable"]'):
            quotes.append(re.sub(r'\s+', ' ', quote.text).strip().decode('ascii', 'ignore'))
    return quotes

def _get_book_info_from_tooltips(self, book_info):
    '''Gets books ASIN, title, authors, image url, description, and rating information'''
    if isinstance(book_info, tuple):
        book_info = [book_info]
    books_data = []
    link_pattern = 'resources[Book.{0}][type]=Book&resources[Book.{0}][id]={0}'
    tooltips_page_url = '/tooltips?' + "&".join([link_pattern.format(book_id) for book_id, image_url in book_info])
    tooltips_page_info = json.loads(open_url(self._connection, tooltips_page_url))['tooltips']

    for book_id, image_url in book_info:
        book_data = tooltips_page_info['Book.{0}'.format(book_id)]
        if not book_data:
            continue
        book_data = html.fromstring(book_data)
        parsed_data = self._parse_tooltip_info(book_data, book_id, image_url)
        if not parsed_data:
            continue
        books_data.append(parsed_data)
    return books_data

def __call__(self, doc, encoding='UTF-8'):
    if isinstance(doc, (str, bytes)):
        doc = fromstring(bytes(bytearray(doc, encoding=encoding)),
                         parser=HTMLParser(encoding=encoding))
    if not isinstance(doc, HtmlElement):
        return None

    for cls in self.EXTRACTORS:
        extract = cls()
        tags_ = extract(doc)
        if tags_:
            tags = []
            for idx, tag in enumerate(tags_):
                if idx < 2 and len(tag) > 16:
                    break
                elif len(tag) < 16:
                    tags.append(tag)
            else:
                if tags:
                    logger.info('TagExtractor got tags %s', tags)
                    return tags

def walkListItems(sess, url):
    try:
        global visited

        def replacewhite(text):
            return re.sub(r'(\ |\r|\n|\t)+', ' ', text)

        resp = sess.get(url=url)
        root = html.fromstring(resp.text)
        tds = root.xpath(".//*[@class='kboard-list']//tr/td[2]")
        for td in tds:
            href = td.xpath(".//a")[0].attrib['href']
            href = urljoin(url, href)
            href = re.sub(r'pageid=\d+', '', href)
            if href in visited:
                continue
            text = re.sub(r'(\ |\r|\n|\t)+', ' ', td.text_content())
            if '???' not in text:
                continue
            print(text)
            visited[href] = (text)
            walkPageItem(sess, href, text)
    except BaseException as ex:
        traceback.print_exc()
        print(ex)

def walkNextPages(sess, url="https://iptime.com/iptime/?page_id=126&dffid=1&dfsid=11"):
    try:
        from os.path import basename

        def get_pageid(url):
            from urllib.parse import parse_qsl, urlsplit
            qs = dict(parse_qsl(urlsplit(url).query))
            return int(qs.get("pageid", "1"))

        while True:
            pageid = get_pageid(url)
            print("pageid=%d" % pageid)
            walkListItems(sess, url)
            root = html.fromstring(sess.get(url=url).text)
            arrows = [basename(_) for _ in root.xpath(".//ul[@class='pages']//img/@src")]
            if 'next_1.gif' not in arrows:
                break
            nexturl = next(_ for _ in root.xpath(".//ul[@class='pages']//img")
                           if basename(_.attrib['src']) == 'next_1.gif')
            url = urljoin(url, nexturl.xpath("../../a/@href")[0])
            nextpageid = get_pageid(url)
            assert nextpageid == pageid + 1
    except BaseException as ex:
        traceback.print_exc()
        print(ex)

def main():
    global executor
    try:
        session = requests.Session()
        executor = ThreadPoolExecutor()
        os.makedirs(dlDir, exist_ok=True)
        url = 'http://www.zyxel.com/us/en/support/download_landing.shtml'
        with open('zyxel_us_filelist.csv', 'w') as fout:
            cw = csv.writer(fout)
            cw.writerow(['model', 'fver', 'fname', 'furl', 'fdate', 'fsize', 'sha1', 'md5'])
            resp = session.get(url=url)
            root = html.fromstring(resp.text)
            models = get_all_models(root)
            for modelName in sorted(models.keys()):
                kbid = models[modelName]
                resp2 = session.get(url='http://www.zyxel.com/us/en/support/DownloadLandingSR.shtml',
                                    params=dict(c="us", l="en", kbid=kbid, md=modelName))
                walkFiles(modelName, session, resp2)
    except BaseException as ex:
        traceback.print_exc()
    finally:
        print('Wait for executor shutdown')
        executor.shutdown(True)

def main():
    global executor
    try:
        session = requests.Session()
        executor = ThreadPoolExecutor()
        os.makedirs(dlDir, exist_ok=True)
        url = 'http://downloadcenter.netgear.com'
        with open('netgear_filelist.csv', 'w') as fout:
            cw = csv.writer(fout)
            cw.writerow(['model', 'fw_ver', 'fileName', 'fw_url', 'fw_date', 'fileSize', 'sha1', 'md5'])
            response = session.get(url=url)
            root = html.fromstring(response.text)
            href = root.xpath(".//a[@id='ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_BasicSearchPanel_btnAdvancedSearch']/@href")
            href = strip_js(href[0])
            formdata = {"__EVENTTARGET": href}
            resp2 = form_submit(session, root, url, "aspnetForm", formdata, {"Referer": url})
            walkCategories(session, resp2)
    except BaseException as ex:
        traceback.print_exc()
    finally:
        executor.shutdown(True)

def walkCategories(session, response):
    try:
        root = html.fromstring(response.text)
        url = response.url
        categories = root.xpath(".//select[@name='ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$lbProductCategory']/option")
        global startCat
        for iCat, category in enumerate(categories[startCat:], startCat):
            startCat = 0
            rsrc = category.xpath("./@value")[0]
            text = category.xpath(".//text()")[0]
            print('Category="%s", iCat=%d' % (text, iCat))
            formdata = {"__EVENTTARGET": "ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$lbProductCategory",
                        "ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$lbProductCategory": rsrc,
                        "__ASYNCPOST:": "true"}
            resp2 = form_submit(session, root, url, "aspnetForm", formdata, {"Referer": url})
            if not resp2:
                continue
            walkFamilies(session, resp2)
    except BaseException as ex:
        print('iCat=%d, cat="%s"' % (iCat, text))
        traceback.print_exc()

def walkProducts(session, response):
    try:
        root = html.fromstring(response.text)
        products = root.xpath("//select[@name='ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$lbProduct']/option")
        url = response.url
        global startProd
        for iProd, product in enumerate(products[startProd:], startProd):
            startProd = 0
            rsrc = product.xpath("./@value")[0]
            text = product.xpath(".//text()")[0]
            print('Product="%s", iProd=%d' % (text, iProd))
            formdata = {"__EVENTTARGET": "ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$lbProduct",
                        "ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$lbProduct": rsrc,
                        "__ASYNCPOST:": "true"}
            resp2 = form_submit(session, root, url, "aspnetForm", formdata, {"Referer": url})
            if not resp2:
                print('Ignored iProd=%d, product="%s"' % (iProd, text))
                continue
            walkFirmwares(resp2, product)
    except BaseException as ex:
        print('Error iProd=%d, product="%s"' % (iProd, text))
        traceback.print_exc()

def walkFirmwares(response, product):
    try:
        root = html.fromstring(response.text)
        firmwares = root.xpath("//div[@id='LargeFirmware']//a")
        for iFirm, firmware in enumerate(firmwares):
            text = firmware.xpath(".//text()")
            if "firmware" in " ".join(text).lower():
                # print('Firmware="%s", iFirmware=%d' % (text, iFirm))
                desc = text[0]
                href = firmware.xpath("./@data-durl")
                if not href:
                    href = firmware.xpath("./@href")
                url = href[0]
                model = product.xpath(".//text()")[0]
                print('model="%s", desc="%s", url=%s' % (model, desc, url))
                global executor, visited
                if url in visited:
                    continue
                visited[url] = (model, desc)
                executor.submit(download_file, model, desc, url)
    except BaseException as ex:
        traceback.print_exc()

def main():
    global executor
    try:
        session = requests.Session()
        executor = ThreadPoolExecutor()
        os.makedirs(dlDir, exist_ok=True)
        url = 'http://support.netgear.cn/'
        with open('netgear_cn_filelist.csv', 'w') as fout:
            cw = csv.writer(fout)
            cw.writerow(['model', 'fver', 'fname', 'furl', 'fdate', 'fsize', 'sha1', 'md5'])
            resp = session.get(url=url)
            root = html.fromstring(resp.text)
            startProd = 1
            prods = root.xpath(".//select[@name='select']/option")
            for iProd, prod in enumerate(prods[startProd:], startProd):
                # prodText = prod.xpath("./text()")[0].strip()
                prodUrl = prod.xpath("./@value")[0].strip()
                walkProd(session, urljoin(resp.url, prodUrl))
    except BaseException as ex:
        traceback.print_exc()
    finally:
        print('Wait for executor shutdown')
        executor.shutdown(True)

def get_list():
    os.system('clear')
    print "Liste aliniyor..."
    worst_response = requests.get(worst_page)
    worst_tree = LH.fromstring(worst_response.content)
    for atag in worst_tree.xpath(worst_list):
        details_response = requests.get(worst_page + atag.attrib['href'])
        details_tree = LH.fromstring(details_response.content)
        for vuln in details_tree.xpath(heartbleed):
            if vuln.text_content().startswith('Yes'):
                print WARNING + worst_page + atag.attrib['href'] + ENDC
            elif vuln.text_content().startswith('No'):
                print worst_page + atag.attrib['href']
            else:
                print FAIL + worst_page + atag.attrib['href'] + ENDC

def get_corresponding_author_info(self):
    """Try to get corresponding author information.

    Returns (scopus-id, name, email).
    """
    resp = requests.get(self.scopus_link)
    from lxml import html

    parsed_doc = html.fromstring(resp.content)
    for div in parsed_doc.body.xpath('.//div'):
        for a in div.xpath('a'):
            if '/cdn-cgi/l/email-protection' in a.get('href', ''):
                encoded_text = a.attrib['href'].replace('/cdn-cgi/l/email-protection#', '')
                key = int(encoded_text[0:2], 16)
                email = ''.join([chr(int('0x{}'.format(x), 16) ^ key)
                                 for x in map(''.join, zip(*[iter(encoded_text[2:])]*2))])
                for aa in div.xpath('a'):
                    if 'http://www.scopus.com/authid/detail.url' in aa.get('href', ''):
                        scopus_url = aa.attrib['href']
                        name = aa.text
                    else:
                        scopus_url, name = None, None
                return (scopus_url, name, email)

async def osu(cmd, message, args):
    if args:
        osu_input = '%20'.join(args)
        try:
            profile_url = 'https://osu.ppy.sh/u/' + osu_input
            async with aiohttp.ClientSession() as session:
                async with session.get(profile_url) as data:
                    page = await data.text()
            root = html.fromstring(page)
            username = root.cssselect('.profile-username')[0].text[:-1]
            user_color = str(message.author.color)[1:]
            sig_url = f'https://lemmmy.pw/osusig/sig.php?colour=hex{user_color}&uname={osu_input}'
            response = discord.Embed(color=message.author.color)
            response.set_image(url=sig_url)
            response.set_author(name=f'{username}\'s osu! Profile', url=profile_url, icon_url=osu_logo)
        except IndexError:
            response = discord.Embed(color=0xBE1931, title='? Unable to retrieve profile.')
    else:
        response = discord.Embed(color=0xBE1931, title='? Nothing inputted.')
    await message.channel.send(None, embed=response)

async def bash(cmd, message, args):
    if len(cache) == 0:
        async with aiohttp.ClientSession() as session:
            async with session.get('http://bash.org/?random1') as page:
                page = await page.text()
        quotes = html.fromstring(page).cssselect('body center table tr td[valign="top"]')[0]
        for index in range(1, len(quotes), 2):
            qid = quotes[index - 1][0][0].text
            score = quotes[index - 1][2].text
            quote = quotes[index].text_content()
            quote = {
                'id': qid[1:],
                'score': score,
                'quote': quote
            }
            cache.append(quote)
    quote = cache.pop()
    # skip quotes that are not fitting into message character limit
    while len(quote['quote']) > 2037:
        quote = cache.pop()
    text = quote['quote']
    highlight = 'xml' if text.strip()[0] == '<' else 'yaml'
    embed = Embed(type='rich', color=0xf7d7c4, description=f'```{highlight}\n{text}\n```')
    embed.set_author(name=f"?? #{quote['id']} | Score: {quote['score']}", url=f"http://bash.org/?{quote['id']}")
    await message.channel.send(None, embed=embed)

async def joke(cmd, message, args):
    randomizer = secrets.randbelow(6644)
    joke_url = f'http://jokes.cc.com/feeds/random/{randomizer}'
    async with aiohttp.ClientSession() as session:
        async with session.get(joke_url) as data:
            joke_json = await data.read()
    joke_json = json.loads(joke_json)
    joke_page_url = joke_json['0']['url']
    async with aiohttp.ClientSession() as session:
        async with session.get(joke_page_url) as data:
            page_data = await data.text()
    root = html.fromstring(page_data)
    content = root.cssselect('.content_wrap')[0]
    joke_text = ''
    for element in content.cssselect('p'):
        if element.text != '' and element.text != '\n':
            joke_text += f'\n{element.text}'
    while '  ' in joke_text:
        joke_text = joke_text.replace('  ', ' ')
    joke_text = ftfy.fix_text(joke_text)
    embed = discord.Embed(color=0xFFDC5D)
    embed.add_field(name='?? Have A Random Joke', value=joke_text)
    await message.channel.send(None, embed=embed)

async def cyanideandhappiness(cmd, message, args):
    comic_img_url = None
    comic_url = None
    while not comic_img_url:
        comic_number = secrets.randbelow(4665) + 1
        comic_url = f'http://explosm.net/comics/{comic_number}/'
        async with aiohttp.ClientSession() as session:
            async with session.get(comic_url) as data:
                page = await data.text()
        root = html.fromstring(page)
        comic_element = root.cssselect('#main-comic')
        comic_img_url = comic_element[0].attrib['src']
        if comic_img_url.startswith('//'):
            comic_img_url = 'https:' + comic_img_url
    embed = discord.Embed(color=0xFF6600)
    embed.set_image(url=comic_img_url)
    cnh_image = 'https://i.imgur.com/jJl7FoT.jpg'
    embed.set_author(name='Cyanide and Happiness', icon_url=cnh_image, url=comic_url)
    await message.channel.send(None, embed=embed)

async def grab_post_list(tags):
    links = []
    for x in range(0, 20):
        resource = f'http://safebooru.org/index.php?page=dapi&s=post&q=index&tags={tags}&pid={x}'
        async with aiohttp.ClientSession() as session:
            async with session.get(resource) as data:
                data = await data.read()
        posts = html.fromstring(data)
        for post in posts:
            if 'file_url' in post.attrib:
                file_url = post.attrib['file_url']
                extention = file_url.split('.')[-1]
                if extention in ['png', 'jpg', 'jpeg', 'gif']:
                    height = int(post.attrib['height'])
                    width = int(post.attrib['width'])
                    if width < 2000 and height < 2000:
                        links.append(post)
    return links

def replace_arch_section(self, cr, uid, view_id, section_xpath, replacement, context=None):
    # the root of the arch section shouldn't actually be replaced as it's
    # not really editable itself, only the content truly is editable.
    [view] = self.browse(cr, uid, [view_id], context=context)
    arch = etree.fromstring(view.arch.encode('utf-8'))
    # => get the replacement root
    if not section_xpath:
        root = arch
    else:
        # ensure there's only one match
        [root] = arch.xpath(section_xpath)

    root.text = replacement.text
    root.tail = replacement.tail
    # replace all children
    del root[:]
    for child in replacement:
        root.append(copy.deepcopy(child))

    return arch

def scrape_subraces_old(href):
    url = settings.betfair_url + href + settings.betfair_url2_end
    r = requests.get(url)
    data = []
    if r.status_code == 200:
        datajson = r.json()
        domtree = html.fromstring(datajson['children'])
        ul = domtree.xpath('//ul[@class="children"]')[0]
        lis = ul.xpath('li')
        for li in lis:
            item = {}
            item['title'] = li.xpath('a/@market-name')[0]
            try:
                item['identifier'] = li.xpath('a/@market-id')[0]
                t = time.localtime(int(li.xpath('a/@market-time')[0]) / 1000)
                item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', t)
                data.append(item)
            except:
                data = data + scrape_subraces(li.xpath('a/@href')[0])
    return(data)

def extract(url):
    global img_no
    try:
        img_no += 1
        r = requests.get(url)
        tree = html.fromstring(r.text)
        div = tree.xpath('//table[@class="masterresultstable"]\
//div[@class="meshtext-wrapper-left"]')
    except:
        div = []

    if div != []:
        div = div[0]
    else:
        return

    typ = div.xpath('.//strong/text()')[0]
    items = div.xpath('.//li/text()')
    img = tree.xpath('//img[@id="theImage"]/@src')[0]

    final_data[img_no] = {}
    final_data[img_no]['type'] = typ
    final_data[img_no]['items'] = items
    final_data[img_no]['img'] = domain + img
    try:
        urllib.urlretrieve(domain + img, path + str(img_no) + ".png")
        with open('data_new.json', 'w') as f:
            json.dump(final_data, f)
        output = "Downloading Images : {}".format(img_no)
        sys.stdout.write("\r\x1b[K" + output)
        sys.stdout.flush()
    except:
        return

def get_links(query):
    urlencoded_query = urllib.parse.quote_plus(query)
    r = requests.get("https://duckduckgo.com/html/?q=" + urlencoded_query, headers={'User-Agent': USER_AGENT})
    tree = html.fromstring(r.content)
    return tree.xpath('//h2[@class="result__title"]/a[@class="result__a"]/@href')

def scrape_web(website):
    r = requests.get(website, timeout=5)
    tree = html.fromstring(r.content)
    rss_links = tree.xpath('//link[@rel="alternate" and @type="application/atom+xml"]/@href')
    if len(rss_links) == 0:
        raise NoLinkError(website)
    else:
        return urllib.parse.urljoin(website, rss_links[0])

def getHearthpwnIdAndUrl(name, set, type, isToken, session):
    log.debug("getHearthpwnIdAndUrl() getting for %s", name)
    # hearthpwn is also weird
    hpname_hacked = name.replace('-', ' ').replace('!', '')
    premium = 0 if isToken else 1

    # filter-name={}&filter-premium={}&filter-type={}&filter-set={}
    r = session.get(setUrlTempl.format(hpname_hacked, premium, hsTypeId[type], setNameIds[set]))
    r.raise_for_status()
    html = fromstring(r.text)

    images = html.xpath('//td[@class="visual-image-cell"]/a/img')
    descs = html.xpath('//td[@class="visual-details-cell"]/h3/a')

    for i in range(len(images)):
        title = descs[i].text
        if title.lower() == name.lower():
            image = images[i].get('src')
            if not image:
                image = 'http://media-hearth.cursecdn.com/avatars/148/738/687.png'
            # /cards/31128-annoy-o-tron-fanclub
            hpid = hpIdRegex.match(images[i].get('data-href')).group(1)
            return int(hpid), image.replace('http://', 'https://').lower()

    log.debug("getHearthpwnIdAndUrl() card not found at hearthpwn '%s' '%s'", set, name)
    raise Exception("getHearthpwnIdAndUrl() card " + name + " not found at hearthpwn")

def loadTokens(tokens={}, wantedTokens={}):
    resultCards = {}
    with requests.Session() as session:
        for name, ids in wantedTokens.items():
            card = None

            if 'id' in ids:
                card = tokens[ids['id']]
                if name != card['name']:
                    log.warning('loadTokens() names do not match: %s - %s', name, tokens[ids['id']]['name'])

            if 'id' not in ids:
                for token in tokens.values():
                    if name == token['name']:
                        if card:
                            log.warning('loadTokens() found token again: %s', name)
                        card = token

            if not card:
                log.warning('loadTokens() could not find: %s', name)
                exit()

            r = session.get('http://www.hearthpwn.com/cards/{}'.format(ids['hpwn']))
            r.raise_for_status()
            image = fromstring(r.text).xpath('//img[@class="hscard-static"]')[0].get('src')
            if not image:
                image = 'https://media-hearth.cursecdn.com/avatars/148/738/687.png'

            card['cdn'] = image.replace('http://', 'https://').lower()
            card['hpwn'] = ids['hpwn']
            card['head'] = getHearthHeadId(card['name'], "ignored", "ignored")

            # since jade golem: overwrite scraped stats with prepared ones
            card['atk'] = ids.get('atk', card['atk'])
            card['cost'] = ids.get('cost', card['cost'])
            card['hp'] = ids.get('hp', card['hp'])

            resultCards[card['name']] = card
            print('.', end='')

    return resultCards

def lxml(self):
    """Get an lxml etree if possible."""
    if ('html' not in self.mimetype and 'xml' not in self.mimetype):
        raise AttributeError('Not an HTML/XML response')
    from lxml import etree
    try:
        from lxml.html import fromstring
    except ImportError:
        fromstring = etree.HTML
    if self.mimetype == 'text/html':
        return fromstring(self.data)
    return etree.XML(self.data)

def analy_following_profile(self, html_text):
    tree = html.fromstring(html_text)
    url_list = tree.xpath("//h2[@class='ContentItem-title']//span[@class='UserLink UserItem-name']//a[@class='UserLink-link']/@href")
    for target_url in url_list:
        target_url = "https://www.zhihu.com" + target_url
        target_url = target_url.replace("https", "http")
        if red.sadd('red_had_spider', target_url):
            red.lpush('red_to_spider', target_url)

def list_mtgs_gallery(url=''):
    if url == '':
        return ''
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardstree = tree.xpath('//*[contains(@class, "log-card")]')
    for child in cardstree:
        cards.append(child.text)
    return cards

def scrape_fullspoil(url="http://magic.wizards.com/en/articles/archive/card-image-gallery/hour-devastation",
                     setinfo={"code": "HOU"}, showRarityColors=False, showFrameColors=False,
                     manual_cards=[], delete_cards=[]):
    if 'name' in setinfo:
        url = 'http://magic.wizards.com/en/articles/archive/card-image-gallery/' + setinfo['name'].lower().replace('of', '').replace(
            '  ', ' ').replace(' ', '-')
    page = requests.get(url)
    tree = html.fromstring(page.content)
    cards = []
    cardtree = tree.xpath('//*[@id="content-detail-page-of-an-article"]')
    for child in cardtree:
        cardElements = child.xpath('//*/p/img')
        cardcount = 0
        for cardElement in cardElements:
            card = {
                "name": cardElement.attrib['alt'].replace(u"\u2019", '\'').split(' /// ')[0],
                "img": cardElement.attrib['src']
            }
            card["url"] = card["img"]
            #card["cmc"] = 0
            #card["manaCost"] = ""
            #card["type"] = ""
            #card["types"] = []
            #card["text"] = ""
            #card["colorIdentity"] = [""]
            # if card['name'] in split_cards:
            #     card["names"] = [card['name'], split_cards[card['name']]]
            #     card["layout"] = "split"
            #notSplit = True
            # for backsplit in split_cards:
            #     if card['name'] == split_cards[backsplit]:
            #         notSplit = False
            # if not card['name'] in delete_cards:
            cards.append(card)
            cardcount += 1
    fullspoil = {"cards": cards}
    print "Spoil Gallery has " + str(cardcount) + " cards."
    download_images(fullspoil['cards'], setinfo['code'])
    fullspoil = get_rarities_by_symbol(fullspoil, setinfo['code'])
    fullspoil = get_mana_symbols(fullspoil, setinfo['code'])
    #fullspoil = get_colors_by_frame(fullspoil, setinfo['code'])
    return fullspoil

def get_html_tree():
    """Gets and converts the management interface page into a parsable tree."""
    try:
        with requests.Session() as s:
            s.get(_config['base_url'] + _config['welcome_page'], data=_config['welcome_credentials'])
            s.post(_config['base_url'] + _config['login_page'], data=_config['login_credentials'])
            r = s.get(_config['base_url'] + _config['management_page'])
    except Exception as e:
        logging.error(str(e))
        raise e
    return html.fromstring(r.content)

def main():
    """Command line entry point."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description=sys.modules[__name__].__doc__)
    parser.add_argument(
        'article_file', metavar='ARTICLE', type=argparse.FileType(),
        help='path to Wiktionary article file')
    parser.add_argument(
        '-z', '--zim-file', action='store_true',
        help='treat the article file as a ZIM archive, instead of HTML '
             'source')
    parser.add_argument(
        '-d', '--debug', action='store_true',
        help='enable debugging output')
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO)
    if args.zim_file:
        article_tuples = ZimFile(args.article_file).article_tuples()
    else:
        article_tuples = [(None, None, args.article_file.read())]
    for article_tuple in article_tuples:
        context = {'edition': article_tuple[0], 'pagename': article_tuple[1]}
        doc = html.fromstring(article_tuple[2])
        for translation in parse_document(doc):
            translation.update(context)
            print json.dumps(translation)

def get_tree(page):
    page = page.replace(u"\xa0", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print u"not parsing, because etree error in get_tree: {}".format(e)
        tree = None
    return tree