Python lxml.html 模块,parse() 实例源码


项目:SPBU-DBMS-Project    作者:Betekhtin    | 项目源码 | 文件源码
def myopen_http(method, url, values):
   """Submit ``values`` to ``url`` with urllib and return the opened response.

   For ``GET`` the values are URL-encoded and appended to the query string
   and no request body is sent; for any other method the URL-encoded values
   are sent as the UTF-8 encoded request body.

   Raises:
      ValueError: if ``url`` is empty.
   """
   if not url:
      raise ValueError("cannot submit, no URL provided")
   ## FIXME: should test that it's not a relative URL or something
   try:
      from urllib import urlencode, urlopen
   except ImportError: # Python 3
      from urllib.request import urlopen
      from urllib.parse import urlencode
   if method == 'GET':
      # Extend an existing query string with '&', otherwise start one.
      if '?' in url:
         url += '&'
      else:
         url += '?'
      url += urlencode(values)
      data = None
   else:
      data = urlencode(values).encode('utf-8')

   return urlopen(url, data)
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def items(self):
        """
        Request URL and parse response. Yield a ``Torrent`` for every torrent
        on page. If in multipage mode, Torrents from next pages are
        automatically chained.
        """
        if self._multipage:
            while True:
                # Poll for more torrents
                items = super(Paginated, self).items()
                # Stop if no more torrents
                first = next(items, None)
                if first is None:
                    # A plain return ends the generator. Raising StopIteration
                    # inside a generator becomes a RuntimeError (PEP 479).
                    return
                # Yield them if not
                yield first
                for item in items:
                    yield item
                # Go to the next page
                # NOTE(review): the page-advance statement is missing from the
                # excerpt; presumably the class exposes next() -- confirm.
                self.next()
        else:
            for item in super(Paginated, self).items():
                yield item
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def created(self):
        """
        Attempt to parse the human readable torrent creation datetime.

        ``self._created`` is unpacked as ``(timestamp, current)``: a text
        label and a numeric epoch value. Labels ending in ``ago`` are
        subtracted from ``current``; anything else is handed to
        ``dateutil.parser.parse`` with ``current`` as the fallback result.
        """
        timestamp, current = self._created
        if timestamp.endswith('ago'):
            quantity, kind, ago = timestamp.split()
            quantity = int(quantity)
            if 'sec' in kind:
                current -= quantity
            elif 'min' in kind:
                current -= quantity * 60
            elif 'hour' in kind:
                current -= quantity * 60 * 60
            return datetime.datetime.fromtimestamp(current)
        current = datetime.datetime.fromtimestamp(current)
        # NOTE(review): the date operands were missing from the excerpt;
        # reconstructed as dates relative to `current` -- confirm.
        yesterday = current.date() - datetime.timedelta(days=1)
        timestamp = timestamp.replace('Y-day', str(yesterday))
        timestamp = timestamp.replace('Today', current.date().isoformat())
        try:
            return dateutil.parser.parse(timestamp)
        except Exception:
            # Best effort: an unparseable label falls back to the reference time.
            return current
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def items(self):
        """
        Request URL and parse response. Yield a ``Torrent`` for every torrent
        on page. If in multipage mode, Torrents from next pages are
        automatically chained.
        """
        if self._multipage:
            while True:
                # Poll for more torrents
                items = super(Paginated, self).items()
                # Stop if no more torrents
                first = next(items, None)
                if first is None:
                    # A plain return ends the generator. Raising StopIteration
                    # inside a generator becomes a RuntimeError (PEP 479).
                    return
                # Yield them if not
                yield first
                for item in items:
                    yield item
                # Go to the next page
                # NOTE(review): the page-advance statement is missing from the
                # excerpt; presumably the class exposes next() -- confirm.
                self.next()
        else:
            for item in super(Paginated, self).items():
                yield item
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def created(self):
        """
        Attempt to parse the human readable torrent creation datetime.

        ``self._created`` is unpacked as ``(timestamp, current)``: a text
        label and a numeric epoch value. Labels ending in ``ago`` are
        subtracted from ``current``; anything else is handed to
        ``dateutil.parser.parse`` with ``current`` as the fallback result.
        """
        timestamp, current = self._created
        if timestamp.endswith('ago'):
            quantity, kind, ago = timestamp.split()
            quantity = int(quantity)
            if 'sec' in kind:
                current -= quantity
            elif 'min' in kind:
                current -= quantity * 60
            elif 'hour' in kind:
                current -= quantity * 60 * 60
            return datetime.datetime.fromtimestamp(current)
        current = datetime.datetime.fromtimestamp(current)
        # NOTE(review): the date operands were missing from the excerpt;
        # reconstructed as dates relative to `current` -- confirm.
        yesterday = current.date() - datetime.timedelta(days=1)
        timestamp = timestamp.replace('Y-day', str(yesterday))
        timestamp = timestamp.replace('Today', current.date().isoformat())
        try:
            return dateutil.parser.parse(timestamp)
        except Exception:
            # Best effort: an unparseable label falls back to the reference time.
            return current
项目:qal    作者:OptimalBPM    | 项目源码 | 文件源码
def _file_to_tree(_data_format, _reference):
        """Reads a file and chooses the right parser to make it an lxml element tree"""
        print("format_to_tree : " + _data_format)
        if _data_format == 'HTML':
            from lxml import html

            return html.parse(_reference)
        if _data_format == 'XML':
            from lxml import etree

            return etree.parse(_reference)
        if _data_format == 'JSON':
            from lxml import etree
            from json_lxml import element
            with open(_reference, "r") as _f:
                _top_element = json.load(_f)
                return etree.ElementTree(element("top",_top_element))
            raise Exception("_file_to_tree: " + _data_format + " is not supported")
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def _parse_url(self, url):
        """
        Downloads and parses a URL, returns xml root.

        Raises ImportError when lxml is not installed and RemoteDataError when
        parsing hits a network error or the parsed document has no root.
        """
        try:
            from lxml.html import parse
        except ImportError:
            raise ImportError("Please install lxml if you want to use the "
                              "{0!r} class".format(self.__class__.__name__))
        try:
            doc = parse(url)
        except _network_error_classes:
            raise RemoteDataError("Unable to parse URL "
                                  "{0!r}".format(url))
        else:
            root = doc.getroot()
            if root is None:
                raise RemoteDataError("Parsed URL {0!r} has no root"
                                      " element".format(url))
        return root
项目:LSTM-GA-StockTrader    作者:MartinLidy    | 项目源码 | 文件源码
def _parse_url(self, url):
        """
        Downloads and parses a URL, returns xml root.

        The URL is read through ``self._read_url_as_StringIO`` before being
        handed to lxml. Raises ImportError when lxml is not installed and
        RemoteDataError when the parsed document has no root.
        """
        try:
            from lxml.html import parse
        except ImportError: # pragma: no cover
            raise ImportError("Please install lxml if you want to use the "
                              "{0!r} class".format(self.__class__.__name__))
        doc = parse(self._read_url_as_StringIO(url))
        root = doc.getroot()
        if root is None: # pragma: no cover
            raise RemoteDataError("Parsed URL {0!r} has no root"
                                  " element".format(url))
        return root
项目:LSTM-GA-StockTrader    作者:MartinLidy    | 项目源码 | 文件源码
def get_available_datasets():
    """
    Get the list of datasets available from the Fama/French data library.

    Returns
    -------
    A list of valid inputs for get_data_famafrench.
    """
    try:
        from lxml.html import parse
    except ImportError:
        raise ImportError("Please install lxml if you want to use the "
                          "get_datasets_famafrench function")

    root = parse(_URL + 'data_library.html')

    hrefs = [e.attrib['href'] for e in root.findall('.//a') if 'href' in e.attrib]

    # Keep links wrapped in the dataset prefix/suffix and strip both off.
    return [href[len(_URL_PREFIX):-len(_URL_SUFFIX)]
            for href in hrefs
            if href.startswith(_URL_PREFIX) and href.endswith(_URL_SUFFIX)]
项目:qcri    作者:douville    | 项目源码 | 文件源码
def parse(filename, options=None):
    """
    Parse Selenium IDE - Test Results Plugin output files.

    Returns one ``_parse_test`` result per element with class ``test_case``.
    Raises ``importer.ParserError`` when the file cannot be parsed, the suite
    summary cell is missing, or its text does not start with ``_SUITE_HEADER``.
    """
    options = options or {}
    try:
        parsed_html = html.parse(filename)
    except html.HTMLSyntaxError:
        raise importer.ParserError('TEST invalid XML syntax')

    suite = parsed_html.find("//table[@id='suiteSummaryTable']/thead/tr/td")
    if suite is None:
        raise importer.ParserError('Test Suite not found')
    suite = suite.text
    if not suite.startswith(_SUITE_HEADER):
        raise importer.ParserError('invalid test results')
    # get suite name from 'Test Suite: <testname>'
    suitename = suite[len(_SUITE_HEADER) + 1:].strip()
    root = parsed_html.getroot()
    suitetbls = root.find_class('test_case')
    # NOTE(review): an empty result (as opposed to None) passes this check and
    # yields an empty list -- confirm that is intended.
    if suitetbls is None:
        raise importer.ParserError('no test cases found')

    return [_parse_test(tbl, suitename) for tbl in suitetbls]
项目:el_pais_editoriales    作者:rinze    | 项目源码 | 文件源码
def get_article_info(url):
    """
    Returns a dictionary with the article info.
    The dictionary contains the following fields:
    - date
    - title
    - tags (list of tags at the end of the article)
    - url
    """
    content = urllib2.urlopen(url)
    try:
        tree = html.parse(content)
    finally:
        # The response is fully consumed by the parser; release the socket.
        content.close()
    title = tree.xpath('//h1[@id="articulo-titulo"]/text()')[0]
    date = tree.xpath('//time//a/text()')[0].strip()
    tags = tree.xpath('//li[@itemprop="keywords"]/a/text()')

    result = {'date': date, 'title': title, 'tags': tags, 'url': url}
    return result
项目:riko    作者:nerevu    | 项目源码 | 文件源码
def parse_rss(url=None, **kwargs):
    """Fetch ``url`` and parse it with ``rssparser``.

    When fetching raises ValueError or URLError, ``url`` itself is handed to
    ``rssparser.parse`` instead of the fetched content.
    """
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        parsed = rssparser.parse(url)
    else:
        # NOTE(review): the true branch of this expression and the cleanup
        # below were missing from the excerpt; reconstructed as reading the
        # whole body when speedparser is in use -- confirm.
        content = f.read() if speedparser else f

        try:
            parsed = rssparser.parse(content)
        finally:
            f.close()

    return parsed
项目:riko    作者:nerevu    | 项目源码 | 文件源码
def xml2etree(f, xml=True, html5=False):
    """Parse ``f`` into an element tree using the parser chosen by the flags.

    ``xml`` selects ``etree.parse``; otherwise ``html5`` together with a
    truthy ``html5parser`` selects ``html5parser.parse``; otherwise
    ``html.parse`` is used, wrapped in ``ElementTree`` when ``html5parser``
    is falsy.
    """
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree
项目:SPBU-DBMS-Project    作者:Betekhtin    | 项目源码 | 文件源码
def myopen_http(method, url, values):
   """Submit ``values`` to ``url`` with urllib and return the opened response.

   For ``GET`` the values are URL-encoded and appended to the query string
   and no request body is sent; for any other method the URL-encoded values
   are sent as the UTF-8 encoded request body.

   Raises:
      ValueError: if ``url`` is empty.
   """
   if not url:
      raise ValueError("cannot submit, no URL provided")
   ## FIXME: should test that it's not a relative URL or something
   try:
      from urllib import urlencode, urlopen
   except ImportError: # Python 3
      from urllib.request import urlopen
      from urllib.parse import urlencode
   if method == 'GET':
      # Extend an existing query string with '&', otherwise start one.
      if '?' in url:
         url += '&'
      else:
         url += '?'
      url += urlencode(values)
      data = None
   else:
      data = urlencode(values).encode('utf-8')

   return urlopen(url, data)

# main_domain_stat='file:///Users/Zharkov/Downloads/test2.htm'
# page=html.parse(main_domain_stat)
# e = page.getroot().\
#         find_class('cl_hr').\
#         pop()
# t=e.getchildren().pop()
# print(e, t)
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def items(self):
        """
        Request URL and parse response. Yield a ``Torrent`` for every torrent
        on page.
        """
        import subprocess

        # Download with curl into /tmp.html. An argument list (no shell) keeps
        # '&', '?' and other shell metacharacters in the URL literal.
        subprocess.call(["curl", str(self.url), "-o", "/tmp.html", "-s"])
        request = urlopen("file:///tmp.html")
        document = html.parse(request)
        root = document.getroot()
        # NOTE(review): the iterable of this comprehension was missing from
        # the excerpt; presumably a row-extraction helper on the class --
        # confirm the name.
        items = [self._build_torrent(row) for row in
                 self._get_torrent_rows(root)]
        for item in items:
            yield item
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def info(self):
        """Return the text of the ``#details > .nfo > pre`` element, or None.

        The page at ``self.url`` is downloaded with curl on first use and the
        result is cached in ``self._info``.
        """
        if self._info is None:
            import subprocess

            # An argument list (no shell) keeps shell metacharacters in the
            # URL literal.
            subprocess.call(["curl", str(self.url), "-o", "/tmp.html", "-s"])
            request = urlopen("file:///tmp.html")
            document = html.parse(request)
            root = document.getroot()
            matches = root.cssselect('#details > .nfo > pre')
            if matches != []:
                info = matches[0].text_content()
            else:
                info = None
            self._info = info
        return self._info
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def files(self):
        """Return ``self._files``, filling it on first use.

        Each table row of the downloaded page contributes one
        ``name -> size`` entry; non-breaking spaces in the size are replaced
        with plain spaces.
        """
        if not self._files:
            import subprocess

            # NOTE(review): the format argument was missing from the excerpt;
            # presumably the object's own id -- confirm.
            path = '/ajax_details_filelist.php?id={id}'.format(id=self.id)
            # NOTE(review): `url` is built but the download below uses
            # self.url, exactly as in the excerpt -- confirm which is intended.
            url = self.url.path(path)
            # An argument list (no shell) keeps shell metacharacters in the
            # URL literal.
            subprocess.call(["curl", str(self.url), "-o", "/tmp.html", "-s"])
            request = urlopen("file:///tmp.html")
            document = html.parse(request)
            root = document.getroot()
            rows = root.findall('.//tr')
            for row in rows:
                name, size = [unicode(v.text_content())
                              for v in row.findall('.//td')]
                self._files[name] = size.replace('\xa0', ' ')
        return self._files
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def info(self):
        """Return the text of the ``#details > .nfo > pre`` element, or None.

        The page at ``self.url`` is fetched on first use and the result is
        cached in ``self._info``.
        """
        if self._info is None:
            request = urlopen(str(self.url))
            document = html.parse(request)
            root = document.getroot()
            matches = root.cssselect('#details > .nfo > pre')
            if matches != []:
                info = matches[0].text_content()
            else:
                info = None
            self._info = info
        return self._info
项目:autotorrent    作者:anthonyheddings    | 项目源码 | 文件源码
def files(self):
        """Return ``self._files``, filling it on first use.

        Each table row of the fetched page contributes one ``name -> size``
        entry; non-breaking spaces in the size are replaced with plain spaces.
        """
        if not self._files:
            # NOTE(review): the format argument was missing from the excerpt;
            # presumably the object's own id -- confirm.
            path = '/ajax_details_filelist.php?id={id}'.format(id=self.id)
            # NOTE(review): `url` is built but the request below uses
            # self.url, exactly as in the excerpt -- confirm which is intended.
            url = self.url.path(path)
            request = urlopen(str(self.url))
            document = html.parse(request)
            root = document.getroot()
            rows = root.findall('.//tr')
            for row in rows:
                name, size = [unicode(v.text_content())
                              for v in row.findall('.//td')]
                self._files[name] = size.replace('\xa0', ' ')
        return self._files
项目:leo    作者:tomschr    | 项目源码 | 文件源码
def parse():
   """Parse the command line and return the argparse namespace.

   Defines three boolean switches (-D, -E, -P) and one positional query
   string, then parses ``sys.argv``.
   """
   parser = argparse.ArgumentParser(description='Query Leo',
                                    usage='%(prog)s [OPTIONS] QUERYSTRING')
   # NOTE(review): action/default were missing from the excerpt; restored
   # from the pattern of the commented-out --with-forums option -- confirm.
   parser.add_argument( '-D', '--with-defs',
      action="store_true",
      default=False,
      help="Include any definitions in the result (default: %(default)s)",
      )
   parser.add_argument( '-E', '--with-examples',
      action="store_true",
      default=False,
      help="Include examples in the result (default: %(default)s)",
      )
   parser.add_argument( '-P', '--with-phrases',
      action="store_true",
      default=False,
      help="Include phrases in the result (default: %(default)s)",
      )
   #parser.add_argument( '-F', '--with-forums',
   #   action="store_true",
   #   default=False,
   #   help="Include forums in the result (default: %(default)s)",
   #   )
   # NOTE(review): the positional's name was missing from the excerpt;
   # 'query' is an assumption -- confirm against the callers.
   parser.add_argument( 'query',
      metavar="QUERYSTRING",
      help="Query string",
      )
   return parser.parse_args()
项目:leo    作者:tomschr    | 项目源码 | 文件源码
def getLeoPage(url):
   """Return root node of Leo's result HTML page
   """
   # NOTE(review): the statements that fetch and parse `url` and bind `html`
   # are missing from this excerpt; restore them from the original project.
   return html
项目:europarl    作者:chozelinek    | 项目源码 | 文件源码
def read_html(self, infile):
        """Open ``infile`` as UTF-8 text and return the result of ``html.parse`` on it."""
        with open(infile, mode='r', encoding='utf-8') as handle:
            document = html.parse(handle)
            return document
项目:europarl    作者:chozelinek    | 项目源码 | 文件源码
def add_root_attributes(self, root, tree, infile):
        # Sets the id, lang, date, place and edition attributes on `root`.
        # id is the base name of `infile` without its extension.
        root.attrib['id'] = os.path.splitext(os.path.basename(infile))[0]
        root.attrib['lang'] = self.language.lower()
        # Match the left-aligned title cell against "<group 1> - <group 2>".
        date_string = re.match(
            r'^(.+?,? \d.+?) - (.+)$',
            tree.xpath('//td[@class="doc_title" and @align="left" and @valign="top"]')[0].text)
        # NOTE(review): the arguments of the call below and the right-hand
        # side of `place` are missing from this excerpt; presumably they use
        # the two groups of `date_string` -- restore from the original project.
        date = dateparser.parse(
        place =
        root.attrib['date'] = str(date)
        root.attrib['place'] = place
        # edition is the text of the right-aligned title cell.
        root.attrib['edition'] = tree.xpath('//td[@class="doc_title" and @align="right" and @valign="top"]')[0].text
项目:europarl    作者:chozelinek    | 项目源码 | 文件源码
def read_html(self, infile):
        """Open ``infile`` as UTF-8 text and return the result of ``html.parse`` on it."""
        with open(infile, mode='r', encoding='utf-8') as handle:
            document = html.parse(handle)
            return document
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def get_lxml_elements(url, element):
    """Parse ``url`` with lxml.html and return the XPath matches for ``.//<element>``."""
    from lxml.html import parse

    query = './/{0}'.format(element)
    return parse(url).xpath(query)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        # This implementation always raises; the contract above is presumably
        # for overriding implementations.
        raise AbstractMethodError(self)
项目:KDDCUP2016    作者:hugochan    | 项目源码 | 文件源码
def mark_contribs(html_file, marked_html_file) :
    # Parses `html_file`, scores the sentences of every <p> element and writes
    # the modified tree to `marked_html_file`.
    # NOTE(review): this excerpt appears to be missing statements: `texts` is
    # never appended to, and the <font> elements created below are never
    # attached to the paragraph. Restore from the original project.

    h = html.parse(html_file)
#   text = "".join([ p.text_content() for p in h.xpath("//p") ])

    pars = h.xpath("//p")

    for par in pars :

        # Get the paragraph's text fixing the hyphenation
        text = par.text_content().replace("-\n", "")

        sentences = tokenizer.tokenize(text.strip())
        # NOTE(review): `scores` is indexed below, which needs a list; on
        # Python 3 map() returns an iterator -- confirm the target version.
        scores = map(calc_score, sentences)

        intervals = max_subarray(scores, 1.0)
        mask = positive_ones(len(sentences), intervals)


        texts = []
#       text = ''
#       marked_sentences = []
        for i, s in enumerate(sentences) :
            if mask[i] :
                # Selected sentence: wrap it in a yellow-background <font>
                # element carrying its score.
                marked = etree.Element("font", style="background-color:yellow", score=str(scores[i]))
                marked.text = s
                marked.tail = ''

            else :
                # Unselected sentence: append it to the tail of the
                # paragraph's last child, if there is one.
                if len(par):
                    marked = par[-1]
                    marked.tail += ' ' + s

        par.text = ' '.join(texts)

    h.write(marked_html_file, pretty_print=True, method="html")
项目:KDDCUP2016    作者:hugochan    | 项目源码 | 文件源码
def get_section(html_file, section_name, possible_next_sections):
    """Return the concatenated text of the paragraphs of one section.

    A paragraph starts the section when ``section_name`` occurs within the
    first 20 characters of its lower-cased text; the section ends at a
    paragraph where one of ``possible_next_sections`` occurs in the same
    window.

    Raises:
        SectionNotFound: if either boundary was not found.
    """
    h = html.parse(html_file)
    pars = h.xpath("//p")

    begin = end = -1
    for i, par in enumerate(pars) :

        # Both boundaries located: stop scanning.
        # NOTE(review): a section starting at paragraph 0 never satisfies
        # `begin>0`, so scanning continues to the end -- confirm intent.
        if (begin>0) and (end>0) :
            break

        par_text = par.text_content().lower()
        if begin<0 and (par_text.find(section_name, 0, 20) >= 0) :
            begin = i

        if begin>=0 :
            for next_section in possible_next_sections :
                if (par_text.find(next_section, 0, 20) >= 0) :
                    end = i

    if (begin<0) or (end<0) :
        raise SectionNotFound("Section %s not found."%section_name)

    text = "".join([par.text_content() for par in pars[begin:end]])

    return text
项目:KDDCUP2016    作者:hugochan    | 项目源码 | 文件源码
def totxt(self, paperid):
        # NOTE(review): the next line is docstring text whose quote lines were
        # lost in this excerpt.
        Converts HTML to pure text by extracting all text elements from the the HTML.  
        infile  = config.HTML_PATH % paperid
        outfile = config.TXT_PATH % paperid

        h = html.parse(infile)
        pars = h.xpath("//p")
        # Concatenate the text of every <p> and drop hyphen + newline pairs.
        text = ''.join([par.text_content() for par in pars])
        text = text.replace("-\n", "")

        # NOTE(review): the body of this `with` block is missing from this
        # excerpt; presumably it writes `text` to `f` -- restore from the
        # original project.
        with open(outfile, 'w') as f :
项目:KDDCUP2016    作者:hugochan    | 项目源码 | 文件源码
def get_section(self, html_file, possible_section_names, possible_next_sections):
        """Return the text of the last section matching ``possible_section_names``.

        The section runs from the latest paragraph matching one of
        ``possible_section_names`` up to, but not including, the first later
        paragraph matching one of ``possible_next_sections``. Returns an empty
        string when either boundary is not found.
        """

        # Open and parse HTML, then extract all textual content from each paragraph
        h = html.parse(html_file) #, parser=etree.XMLParser(encoding="utf-8"))
        pars = [paragraph.text_content().lower().encode("UTF-8") for paragraph in h.xpath("//p")]   # .encode("utf-8")

        # First we go backwards trying to find the latest occurrence of
        # one of the possible names of the section of interest
        begin = None
        for i in reversed(xrange(len(pars))) :
            if match_any(pars[i], possible_section_names) :
                begin = i
                # Stop at the first hit from the end, i.e. the latest
                # occurrence; without this the earliest one would win.
                break

        # If the start wasn't found, just halt right away
        if (begin is None) :
            return ""

        # Otherwise we can look for the end of the section starting from the start
        # of the found section.
        end = None
        for j in xrange(begin+1, len(pars)) :
            if match_any(pars[j], possible_next_sections) :
                end = j
                # The section ends at the first following heading.
                break

        # End of section not found, so it's not safe to keep this content,
        # so we return an empty string.
        if (end is None) :
            return ""

        # Otherwise join all paragraphs inside the section found
        return unicode("".join([fix_hyphens(p) for p in pars[begin:end]]), "UTF-8")
项目:ovcurriculum    作者:etandel    | 项目源码 | 文件源码
def get_sections(curriculum_code):
    # Fetches BASE_URL formatted with `curriculum_code` and parses the
    # response body as HTML.
    r = requests.get(BASE_URL.format(curriculum_code))
    tree = parse_html(BytesIO(r.content))
    # NOTE(review): the second argument of map() and the closing parentheses
    # are missing from this excerpt -- restore from the original project.
    return list(map(build_section,
项目:deb-python-jsonschema    作者:openstack    | 项目源码 | 文件源码
def fetch_or_load(spec_path):
    """
    Fetch a new specification or use the cache if it's current.

    :argument spec_path: the path to a cached specification

    """

    headers = {}

    try:
        modified = datetime.utcfromtimestamp(os.path.getmtime(spec_path))
        # HTTP dates use the 24-hour clock and the literal zone "GMT".
        date = modified.strftime("%a, %d %b %Y %H:%M:%S GMT")
        headers["If-Modified-Since"] = date
    except OSError as error:
        # A missing cache file just means an unconditional fetch.
        if error.errno != errno.ENOENT:
            raise

    request = urllib.Request(VALIDATION_SPEC, headers=headers)
    response = urllib.urlopen(request)

    if response.code == 200:
        with open(spec_path, "w+b") as spec:
            # NOTE(review): the cache write was missing from the excerpt;
            # reconstructed as copying the response and rewinding -- confirm.
            spec.writelines(response)
            spec.seek(0)
            return html.parse(spec)

    with open(spec_path) as spec:
        return html.parse(spec)
项目:partytime    作者:sunlightlabs    | 项目源码 | 文件源码
def get_latest_url(list_url):
    # Parses `list_url` and reads the first attribute value of the first <a>
    # inside the element that follows the first <h1>.
    doc = parse(list_url).getroot()
    # NOTE(review): the format string is empty in this excerpt (presumably a
    # URL template was stripped); '' % value cannot format a value -- restore
    # the template from the original project.
    return '' % doc.cssselect('h1')[0].getnext().cssselect('a')[0].values()[0]
项目:partytime    作者:sunlightlabs    | 项目源码 | 文件源码
def get_senate_ratings():
    # NOTE(review): this excerpt is truncated: the URL literal is empty, the
    # bodies of the two `if` statements are missing, the backslash chain and
    # the `state` expression are cut, and the get_or_create call is
    # unfinished. Restore from the original project before use.
    url = get_latest_url('')
    doc = parse(url).getroot()

    good_tds = []

    for td in doc.cssselect('td'):
        d = dict(td.items())
        # NOTE(review): dict.has_key exists only on Python 2.
        if not d.has_key('width') or not d['width'] == '92':
        data = [x for x in list(td.itertext()) if x.strip()]
        if len(data) == 1:

        # Normalise the first text item into a rating key.
        rating = re.sub(r' \(.*$', '', data[0]) \
                .lower() \
                .replace(' ', '_') \
                .replace('toss_up', 'tossup') \

        data = data[1:]

        for race in data:
            state ='[A-Z]{2}', race).group()
            district = ''
            body = 'S'

            cr, created = CookRating.objects.get_or_create(body=body,
项目:partytime    作者:sunlightlabs    | 项目源码 | 文件源码
def get_house_ratings():
    # NOTE(review): this excerpt is truncated: the URL literal is empty, the
    # unpacking of `tables` into `data` is cut, and the get_or_create call is
    # unfinished. Restore from the original project before use.
    url = get_latest_url('')
    doc = parse(url).getroot()

    tables = doc.cssselect('table.nestedTable')

    data = {}

     data['likely_gop']) = tables

    candidate_data = {}

    for key in data.keys():
        # Skip the first row of each table.
        rows = data[key].cssselect('tr')[1:]
        for row in rows:
            # Every second text node of the row is used.
            district, incumbent, score = list(row.itertext())[::2]
            rating = key
            state, district = district.split('-')
            body = 'H'

            cr, created = CookRating.objects.get_or_create(body=body,
项目:el_pais_editoriales    作者:rinze    | 项目源码 | 文件源码
def process_editorial_list(url):
    """
    Process a page that contains a list of editorials.
    Returns:
        - A list of URLs to individual editorial articles.
        - The URL to the next editorial list.
    """
    content = urllib2.urlopen(url)
    try:
        tree = html.parse(content)
    finally:
        # The response is fully consumed by the parser; release the socket.
        content.close()
    next_edlist = get_next_edlist(tree)
    artlist = get_edarticles(tree)

    return (artlist, next_edlist)
项目:qal    作者:OptimalBPM    | 项目源码 | 文件源码
def _structure_init(self, _dataset):
        """Initializes the XML structure that data is to be applied to."""
        # NOTE(review): this excerpt is truncated: the `try:` and its body
        # before the `except` below, and the `else:` lines of the following
        # if-statements, are missing. Restore from the original project.
        super(XpathDataset, self)._structure_init(_dataset)

        # Parse important information data from XPath
        _root_node_name, self._structure_row_node_name, _parent_xpath = self._structure_parse_root_path(self.rows_xpath)

        # Is the structure already loaded?
        if self._structure_row_node_parent is None:

            # If not try to load, or create file.
            import os

            if os.path.exists(make_path_absolute(self.filename, self._base_path)):

                except Exception as e:
                    raise Exception(" - error parsing " + self.xpath_data_format + " file : " + str(e))
                # Create a tree with root node based on the first

                if _root_node_name != "":
                    # noinspection PyUnusedLocal
                    if self.encoding:
                        _encoding = self.encoding
                        _encoding = "UTF-8"
                    # TODO: Check why this is done, _tree isn't used
                    # noinspection PyUnusedLocal
                    _tree = etree.parse(io.StringIO("<?xml version='1.0' ?>\n<" + _root_node_name + "/>"))
                    raise Exception(" - rows_xpath(" + str(
                        self.rows_xpath) + ") must be absolute and have at least the name of the root node. " +
                                           "Example: \"/root_node\" ")

        # Is the structure there yet? It could be an XML file with only a top node.
        if self._structure_row_node_parent is None:
            # If not existing, create a node structure up to the parent or the row nodes
            # from the information in the xpath.
            self._structure_top_node = self._structure_create_xpath_nodes(self._structure_top_node, self.rows_xpath)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def _build_doc(self):
        """
        Parse ``self.io`` with lxml and return the resulting root node.

        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.
        """
        # NOTE(review): the `self.io` arguments, the `.scheme` access and the
        # try/else/pass/raise lines were missing from the excerpt and are
        # reconstructed here -- confirm against the original project.
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        parser = HTMLParser(recover=False, encoding=self.encoding)

        try:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)

            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                # not a url
                scheme = parse_url(self.io).scheme
                if scheme not in _valid_schemes:
                    # lxml can't parse it
                    msg = ('%r is not a valid url scheme, valid schemes are '
                           '%s') % (scheme, _valid_schemes)
                    raise ValueError(msg)
                else:
                    # something else happened: maybe a faulty connection
                    raise
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r