我们从 Python 开源项目中提取了以下 38 个代码示例,用于说明如何使用 html.entities.name2codepoint(注意:name2codepoint 是一个将 HTML 实体名称映射到 Unicode 码位的字典,而不是可调用的函数)。
def handle_entityref(self, ref):
    """Append the text for a named entity reference to the open element.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.

    Nothing happens when ``self.elementstack`` is empty.  Otherwise one item
    is appended to the piece list of the innermost element
    (``self.elementstack[-1][2]``):

    * ``lt``, ``gt``, ``quot``, ``amp`` and ``apos`` are kept as entity
      references (``'&lt;'`` etc.).
    * a name found in ``self.entities`` is replaced by its mapped text; when
      that text itself looks like ``&#...;`` it is passed back through
      ``self.handle_entityref``.
    * a name found in ``name2codepoint`` becomes the UTF-8 encoded bytes of
      the character.
    * any other name is kept as the literal ``'&name;'``.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        text = '&%s;' % ref
    elif ref in self.entities:
        text = self.entities[ref]
        if text.startswith('&#') and text.endswith(';'):
            return self.handle_entityref(text)
    else:
        # Single lookup: the code point is reused instead of probing the
        # table once and indexing it a second time.
        try:
            codepoint = name2codepoint[ref]
        except KeyError:
            text = '&%s;' % ref
        else:
            text = chr(codepoint).encode('utf-8')
    self.elementstack[-1][2].append(text)
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Decimal references (``&#65;``), hexadecimal references (``&#x41;``) and
    names present in ``name2codepoint`` (``&amp;``) are replaced by the
    corresponding character.  Anything that cannot be converted is left
    unchanged.
    """
    def fix_up(m):
        text_ = m.group(0)
        code = m.group(1)
        try:
            if text_[1] == "#":  # character reference
                if text_[2] == "x":
                    return chr(int(code[1:], 16))
                return chr(int(code))
            # named entity
            return chr(name2codepoint[code])
        except (KeyError, ValueError, OverflowError):
            # OverflowError: chr() raises it for numbers that do not fit a
            # C int, e.g. "&#99999999999999999999;".
            return text_  # leave as is

    # Raw string: "\w" is not a valid Python string escape.
    return re.sub(r"&#?(\w+);", fix_up, text)
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Numeric references (``&#65;``, ``&#x41;``) and names present in
    ``htmlentitydefs.name2codepoint`` are replaced by the corresponding
    character; references that cannot be converted are left unchanged.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Malformed or out-of-range number: fall through and keep
                # the reference text.
                pass
        else:
            # named entity
            try:
                ref = unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    # Raw string: "\w" is not a valid Python string escape.
    return re.sub(r"&#?\w+;", fixup, text)
def htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    *entity* is the text between ``&`` and ``;`` (for example ``'amp'``,
    ``'#65'`` or ``'#x41'``).  Returns the decoded character, or the literal
    ``'&<entity>;'`` when the entity is unknown or its number cannot be
    converted.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x?[0-9A-Fa-f]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # int() accepts the "0x" prefix when base is 16.
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Invalid digits for the base, or a code point out of range.
            # printExc is defined elsewhere in the project; presumably it
            # reports the active exception - TODO confirm.
            printExc()

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                return chr(int(code))
            # named entity
            return chr(name2codepoint[code])
        except (KeyError, ValueError, OverflowError):
            # Only conversion failures are handled here; a bare ``except``
            # would also swallow KeyboardInterrupt and SystemExit.
            return text  # leave as is

    # Raw string: "\w" is not a valid Python string escape.
    return re.sub(r"&#?(\w+);", fixup, text)
def name2cp(k):
    """Return the code point (an int) for the HTML entity name *k*.

    ``'apos'`` is answered directly.  Other names are looked up in
    ``htmlentitydefs.name2codepoint`` when that table exists, otherwise the
    code point is derived from ``htmlentitydefs.entitydefs``.  Unknown names
    raise ``KeyError``.
    """
    if k == 'apos':
        return ord("'")

    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; fall back to entitydefs.
        definition = htmlentitydefs.entitydefs[k]
        if definition.startswith("&#") and definition.endswith(";"):
            # not in latin-1: the definition is a numeric reference
            return int(definition[2:-1])
        decoded = codecs.latin_1_decode(definition)
        return ord(decoded[0])

    return htmlentitydefs.name2codepoint[k]
def handle_entityref(self, name):
    """Push a known named entity as a decimal character reference.

    Does nothing when *name* is not in ``name2codepoint`` or when
    ``self.hide_output`` is true; otherwise passes ``'&#<codepoint>;'`` to
    ``self.push_text``.
    """
    if name not in name2codepoint or self.hide_output:
        return
    reference = "".join(("&#", str(name2codepoint[name]), ";"))
    self.push_text(reference)
def entity2text(entitydef):
    """Convert an HTML entity reference into unicode.

    http://stackoverflow.com/a/58125/408556

    *entitydef* is the complete reference including the leading ``&`` and
    trailing ``;`` (``'&amp;'``, ``'&#65;'``, ``'&#x41;'``).  The decoded
    character is returned.  The input is returned unchanged when it does not
    start with ``&`` (it is also logged at debug level), when the entity name
    is unknown, when the number is malformed or out of range, or when the
    code point is 0.
    """
    if not entitydef.startswith('&'):
        logger.debug(entitydef)
        return entitydef

    try:
        if entitydef.startswith('&#x'):
            cp = int(entitydef[3:-1], 16)
        elif entitydef.startswith('&#'):
            cp = int(entitydef[2:-1])
        else:
            cp = name2codepoint[entitydef[1:-1]]
        return chr(cp) if cp else entitydef
    except (KeyError, ValueError, OverflowError):
        # Unknown name, bad digits, or a number chr() cannot represent:
        # keep the text as it was instead of raising.
        return entitydef
def handle_entityref(self, ref):
    """Re-create the source text of an entity reference in ``self.pieces``.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    Names found in ``name2codepoint`` (and 'apos') are written back with the
    terminating semicolon; any other name is written back without it.
    """
    recognised = ref in name2codepoint or ref == 'apos'
    if recognised:
        piece = '&%s;' % ref
    else:
        piece = '&%s' % ref
    self.pieces.append(piece)
def un_escape(self, text):
    """Remove HTML or XML character references and entities from a string.

    source: http://effbot.org/zone/re-sub.htm#unescape-html

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string.  Values that are not
        ``str``/``unicode`` instances are returned untouched.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Malformed or out-of-range number: keep the reference text.
                pass
        else:
            # named entity
            try:
                ref = unichr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    if not isinstance(text, (str, unicode)):
        return text
    # Raw string: "\w" is not a valid Python string escape.
    return unicode(re.sub(r"&#?\w+;", fixup, text))
def handle_entityref(self, name):
    """Append the character for the named entity *name* to ``self.text``.

    Names that are not in ``name2codepoint`` are ignored.
    """
    try:
        codepoint = name2codepoint[name]
    except KeyError:
        # Unknown entity: skipped.  Only the lookup is guarded, so unrelated
        # errors (for example a missing ``self.text``) are no longer hidden.
        return
    self.text += unichr(codepoint)
def htmlentitydecode(s):
    """Replace each ``&name;`` in *s* whose name is in ``name2codepoint``.

    Numeric references and unknown names are left as they are.
    """
    def to_char(match):
        return unichr(name2codepoint[match.group(1)])

    # Alternation of every known entity name, so only those can match.
    known_names = '|'.join(name2codepoint)
    return re.sub('&(%s);' % known_names, to_char, s)
def html_entity_decode(s):
    """Decode named entities and 2-3 digit decimal references in *s*.

    ``&name;`` is decoded for names in ``name2codepoint`` and ``&#NN;`` /
    ``&#NNN;`` for decimal numbers.  Both kinds are handled in a single pass
    so that text produced by one replacement is never decoded a second time
    (``'&amp;#65;'`` yields ``'&#65;'``, not ``'A'``).  Everything else is
    left unchanged.
    """
    pattern = r'&(?:(%s)|#(\d{2,3}));' % '|'.join(name2codepoint)

    def decode(m):
        name, number = m.groups()
        if name is not None:
            return str(unichr(name2codepoint[name]))
        return chr(int(number))

    return re.sub(pattern, decode, s)
def unescape_html(text):
    """
    Removes HTML or XML character references and entities from a text string.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3] == "&#x":
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Malformed or out-of-range number: keep the reference text.
                pass
        else:
            # named entity
            try:
                ref = chr(htmlentities.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    # Raw string: "\w" is not a valid Python string escape.
    return re.sub(r"&#?\w+;", fixup, text)
def handle_entityref(self, name):
    """Append the character for the named entity *name* to ``self.result``.

    A name that is not in ``htmlentitydefs.name2codepoint`` is appended as
    the literal text ``'&name;'`` instead of raising ``KeyError``.
    """
    try:
        codepoint = htmlentitydefs.name2codepoint[name]
    except KeyError:
        # Unknown entity: keep it visible rather than aborting the parse.
        self.result.append('&%s;' % name)
        return
    self.result.append(chr(codepoint))
def strip(html):
    """Remove tags, collapse whitespace runs and decode entities in *html*.

    Steps, in order:

    1. everything matching ``<...>`` is removed;
    2. runs of two or more space/tab/CR/FF/VT characters become one space,
       and a space directly after a newline is dropped;
    3. named entities, two-digit hexadecimal references (``&#xHH;``) and
       decimal references of up to five digits are replaced by their
       character.  Text that merely looks like a named entity (for example
       ``"& y;"``) but is not in ``htmlentitydefs.name2codepoint`` is left
       untouched;
    4. leading and trailing whitespace is stripped from the result.
    """
    # Strip remaining enclosed tags
    html = sub(r'<.*?>', '', html)
    # Multiple whitespaces are rendered as a single one
    html = sub(r'[ \t\r\f\v]{2,}', ' ', html)
    html = html.replace('\n ', '\n')

    # Maps the literal reference text to its code point (int or digit string).
    entitydict = {}

    # Named entities.
    for x in finditer(r'&([^#]\D{1,5}?);', html):
        key = x.group(0)
        if key in entitydict:
            continue
        try:
            entitydict[key] = htmlentitydefs.name2codepoint[x.group(1)]
        except KeyError:
            # The pattern also matches ordinary text between '&' and ';';
            # anything that is not a known entity name is skipped.
            continue

    # Two-digit hexadecimal references.
    for x in finditer(r'&#x([0-9A-Fa-f]{2,2}?);', html):
        key = x.group(0)
        if key not in entitydict:
            entitydict[key] = "%d" % int(key[3:5], 16)

    # Decimal references.
    for x in finditer(r'&#(\d{1,5}?);', html):
        key = x.group(0)
        if key not in entitydict:
            entitydict[key] = x.group(1)

    for key, codepoint in entitydict.items():
        html = html.replace(key, unichr(int(codepoint)))

    # Return result with leading/trailing whitespaces removed
    return html.strip()