我们从Python开源项目中,提取了以下5个代码示例,用于说明如何使用html.entities.html5()。
def parse_html_declaration(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == '<!', ('unexpected call to ' 'parse_html_declaration()') if rawdata[i:i+4] == '<!--': # this case is actually already handled in goahead() return self.parse_comment(i) elif rawdata[i:i+3] == '<![': return self.parse_marked_section(i) elif rawdata[i:i+9].lower() == '<!doctype': # find the closing > gtpos = rawdata.find('>', i+9) if gtpos == -1: return -1 self.handle_decl(rawdata[i+2:gtpos]) return gtpos+1 else: return self.parse_bogus_comment(i) # Internal -- parse bogus comment, return length or -1 if not terminated # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def _replace_charref(s): s = s.group(1) if s[0] == '#': # numeric charref if s[1] in 'xX': num = int(s[2:].rstrip(';'), 16) else: num = int(s[1:].rstrip(';')) if num in _invalid_charrefs: return _invalid_charrefs[num] if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF: return '\uFFFD' if num in _invalid_codepoints: return '' return chr(num) else: # named charref if s in _html5: return _html5[s] # find the longest matching name (as defined by the standard) for x in range(len(s)-1, 1, -1): if s[:x] in _html5: return _html5[s[:x]] + s[x:] else: return '&' + s
def unescape(self, s): if '&' not in s: return s def replaceEntities(s): s = s.groups()[0] try: if s[0] == "#": s = s[1:] if s[0] in ['x','X']: c = int(s[1:].rstrip(';'), 16) else: c = int(s.rstrip(';')) return chr(c) except ValueError: return '&#' + s else: from html.entities import html5 if s in html5: return html5[s] elif s.endswith(';'): return '&' + s for x in range(2, len(s)): if s[:x] in html5: return html5[s[:x]] + s[x:] else: return '&' + s return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))", replaceEntities, s, flags=re.ASCII)
def parse_endtag(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" match = endendtag.search(rawdata, i+1) # > if not match: return -1 gtpos = match.end() match = endtagfind.match(rawdata, i) # </ + tag + > if not match: if self.cdata_elem is not None: self.handle_data(rawdata[i:gtpos]) return gtpos if self.strict: self.error("bad end tag: %r" % (rawdata[i:gtpos],)) # find the name: w3.org/TR/html5/tokenization.html#tag-name-state namematch = tagfind_tolerant.match(rawdata, i+2) if not namematch: # w3.org/TR/html5/tokenization.html#end-tag-open-state if rawdata[i:i+3] == '</>': return i+3 else: return self.parse_bogus_comment(i) tagname = namematch.group().lower() # consume and ignore other stuff between the name and the > # Note: this is not 100% correct, since we might have things like # </tag attr=">">, but looking for > after tha name should cover # most of the cases and is much simpler gtpos = rawdata.find('>', namematch.end()) self.handle_endtag(tagname) return gtpos+1 elem = match.group(1).lower() # script or style if self.cdata_elem is not None: if elem != self.cdata_elem: self.handle_data(rawdata[i:gtpos]) return gtpos self.handle_endtag(elem.lower()) self.clear_cdata_mode() return gtpos # Overridable -- finish processing of start+end tag: <tag.../>