The following 8 code examples, extracted from open-source Python projects, illustrate how to use lxml.html.HtmlElement().
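Before the extracted examples, a brief orientation: in practice an html.HtmlElement is usually obtained by parsing markup (html.fromstring, html.fragments_fromstring) rather than by calling the class directly, and the class itself most often appears in isinstance() checks. A minimal, self-contained sketch of that pattern (the sample markup is illustrative only):

    from lxml import html

    fragments = html.fragments_fromstring('leading text<p>a paragraph</p>')
    for fragment in fragments:
        if isinstance(fragment, html.HtmlElement):
            # a parsed element node: has .tag, .text, .attrib, .xpath(), ...
            print(fragment.tag, html.tostring(fragment, encoding='unicode'))
        else:
            # a plain string is returned for text that sits outside any element
            print('text node:', fragment)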
def xml(self, url, method='get', params=None, data=None):
    """
    Fetch the URL and parse the response as an XML tree.

    :type url: str
    :param url: API
    :type method: str
    :param method: HTTP METHOD
    :type params: dict
    :param params: query
    :type data: dict
    :param data: body
    :rtype: html.HtmlElement
    :return:
    """
    r = self.req(url, method, params, data)
    # this is required to avoid utf8-mb4 leading to an encoding error
    return self.to_xml(r.content, base_url=r.url)
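The self.req and self.to_xml helpers referenced above are not part of this excerpt. One plausible shape for to_xml, assuming it simply hands the raw response bytes to lxml (which is what the utf8-mb4 comment suggests: parsing bytes lets lxml pick the encoding instead of relying on a guessed text decoding), would be:

    from lxml import html

    def to_xml(content: bytes, base_url=None) -> html.HtmlElement:
        # hypothetical helper: parse raw bytes so lxml handles the encoding
        return html.fromstring(content, base_url=base_url)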
def _fragments_from_string(html_string):
    fragments = html.fragments_fromstring(html_string)
    if not len(fragments):
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        if len(fragments[0].strip()) > 0:
            if len(fragments) == 1:
                return html.fragments_fromstring('<p>%s</p>' % fragments[0])
            else:
                paragraph = _create_element('p')
                paragraph.text = fragments[0]
                fragments[1].addprevious(paragraph)
                fragments.insert(1, paragraph)

        fragments.pop(0)
        if not len(fragments):
            return []

    # remove xml instructions (if cleaning is disabled)
    for instruction in fragments[0].xpath('//processing-instruction()'):
        instruction.drop_tag()

    return fragments
def _sanitize_html_frags(html_value, valid_tags, valid_attributes):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            _sanitize_html_rec(f, valid_tags, valid_attributes)
            if f.tag in valid_tags:
                _clean_attributes(f, valid_attributes)
                yield html.tostring(f, encoding="unicode")
            else:
                if f.text:
                    yield f.text
                for sub in f:
                    yield html.tostring(sub, encoding="unicode")
                if f.tail:
                    yield f.tail
            if f.tag in ('p', 'br'):
                yield '\n'
        else:
            yield f
def get_role(self, intervention):
    roles = intervention.xpath(
        './/span[@class="italic"][text()[re:test(.,"^[\s\xad\-–?—\.]*(?:{})[\s\xad\-–?\.]*(?:\([A-Z][A-Z]\))?[\s\xad\-–?—\.]*$", "m")]]'.format('|'.join(self.loc['roles'])),
        namespaces=self.ns)
    if len(roles) > 0:
        output = []
        for role in roles:
            if type(role) is str:
                output.append(role)
            elif type(role) is html.HtmlElement:
                output.append(role.text)
        for role in roles:
            lang = re.match(
                r'.*({}).*'.format('|'.join(self.langs)), role.text)
            if lang is not None:
                i_lang = lang.group(1)
            else:
                i_lang = None
            role.drop_tree()
    else:
        output = None
        i_lang = None
    if output is not None:
        output = " ".join(output)
        output = re.sub(r'\n', r' ', output)
        output = re.sub(r' +', r' ', output)
        output = re.sub(r'\([\p{Lu}\&/\-–]+\)', r'', output)
        output = re.sub(r'(\p{Ll})[\s\.\xad–\-?—,\)]+\Z', r'\1', output)
        output = re.sub(r'\A[\xad\s\.—–\-?,\)\(]+', r'', output)
        output = re.sub(r'[\xad\s\.—–\-?,\)]+\Z', r'', output)
    return output, i_lang
def _create_element(element, text=None):
    # creates lxml element without document tree (no body, no parents)
    new_element = html.HtmlElement()
    new_element.tag = element
    if text:
        new_element.text = text
    return new_element
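A hedged usage note for the helper above: instantiating html.HtmlElement() directly and then assigning .tag yields a detached element with no parent, which the other examples later attach via addprevious(). The call below is illustrative (the variable name is not from the original project); the commented-out lines show the more common lxml factory idiom that produces an equivalent element:

    paragraph = _create_element('p', text='hello')
    print(paragraph.tag, paragraph.text)  # p hello
    # equivalent with the factory function:
    # from lxml import html
    # paragraph = html.Element('p')
    # paragraph.text = 'hello'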
def preprocess_media_tags(element):
    if isinstance(element, html.HtmlElement):
        if element.tag in ['ol', 'ul']:
            # ignore any spaces between <ul> and <li>
            element.text = ''
        elif element.tag == 'li':
            # ignore spaces after </li>
            element.tail = ''
        elif element.tag == 'iframe':
            iframe_src = element.get('src')
            youtube = re.match(youtube_re, iframe_src)
            vimeo = re.match(vimeo_re, iframe_src)
            if youtube or vimeo:
                element.text = ''  # ignore any legacy text
                if youtube:
                    yt_id = urlparse(iframe_src).path.replace('/embed/', '')
                    element.set('src', '/embed/youtube?url=' + quote_plus('https://www.youtube.com/watch?v=' + yt_id))
                elif vimeo:
                    element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2)))
                if not len(element.xpath('./ancestor::figure')):
                    _wrap_figure(element)
            else:
                element.drop_tag()
        elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet':
            twitter_links = element.xpath('.//a[@href]')
            for tw_link in twitter_links:
                if twitter_re.match(tw_link.get('href')):
                    twitter_frame = html.HtmlElement()
                    twitter_frame.tag = 'iframe'
                    twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href')))
                    element.addprevious(twitter_frame)
                    _wrap_figure(twitter_frame)
                    element.drop_tree()
def _get_html_tree(self) -> html.HtmlElement:
    """
    Get the HTML of the page as a tree; this can be used to extract data
    using XPath expressions.

    :return: Returns the root of the html tree.
    """
    return html.fromstring(self._get_html())
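The tree returned above is typically queried with XPath expressions; a minimal, self-contained sketch of that pattern (the markup and expressions are illustrative, not taken from the original scraper):

    from lxml import html

    tree = html.fromstring('<html><body><h1>Title</h1><a href="/next">more</a></body></html>')
    print(tree.xpath('//h1/text()'))  # ['Title']
    print(tree.xpath('//a/@href'))    # ['/next']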
def _clean_html(html_value, cleaner):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            cleaner(f)
            yield html.tostring(f, encoding="unicode")
        else:
            yield f
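The generator above only yields pieces; a caller has to supply a cleaner callable and join the results back into a string. A minimal sketch of such a caller, assuming lxml's Cleaner as the callable (note that in recent lxml releases lxml.html.clean is provided by the separate lxml_html_clean package; the wrapper name sanitize is hypothetical):

    from lxml.html.clean import Cleaner

    def sanitize(html_value):
        # a Cleaner instance is callable and mutates the element in place,
        # which matches the cleaner(f) call inside _clean_html
        cleaner = Cleaner(scripts=True, javascript=True, style=True)
        return ''.join(_clean_html(html_value, cleaner))

    print(sanitize('<p onclick="steal()">hi<script>alert(1)</script></p>'))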