我们从Python开源项目中，提取了以下16个代码示例，用于说明如何使用html.parser模块。
def unescape_html(html_):
    """
    Replace HTML entities (e.g. `&pound;`) in a string.

    The implementation is picked by probing for what the running
    interpreter actually provides rather than by comparing version numbers,
    so every interpreter gets a function that exists there.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """
    # http://stackoverflow.com/a/2360639
    try:
        # 3.4+
        # noinspection PyCompatibility
        from html import unescape
    except ImportError:
        try:
            # 3.0 - 3.3: html.unescape does not exist yet
            # noinspection PyCompatibility
            from html.parser import HTMLParser
        except ImportError:
            # 2.7
            # noinspection PyUnresolvedReferences,PyCompatibility
            from HTMLParser import HTMLParser
        # noinspection PyDeprecation
        return HTMLParser().unescape(html_)
    return unescape(html_)
def processIncomingTweet(tweet):
    """
    Check a tweet that has come in via the filter stream for a word request.

    If the tweet carries the "NixieBotShowMe" tag and either a word can be
    extracted from its (entity-unescaped) text or it contains a command, the
    tweet is put on the word queue, the high-water mark ``maxWordQ`` is
    updated, and the tweet is appended to ``recentReqs``. Once ``recentReqs``
    grows beyond ``reqPickleFrequency`` entries it is handed to ``pickleMe``
    and emptied if that call reports success.

    :param tweet: mapping with at least ``'text'`` and
        ``'user' -> 'screen_name'`` entries.
    """
    # print(tweet)
    global maxWordQ
    if scanTags(tweet, "NixieBotShowMe"):
        # html.unescape replaces HTMLParser().unescape(), which was removed
        # in Python 3.9.
        theWord = extractWord(html.unescape(tweet['text']))
        if theWord is not None or hasCommand(tweet):
            wordqPut(tweet, priority=prioritise(tweet))
            size = wordq.qsize()
            if size > maxWordQ:
                maxWordQ = size
            print("word request from", tweet['user']['screen_name'], "word = ", theWord, " Word queue at:", size, "maxqueue was ", maxWordQ)
            recentReqs.append(tweet)  # store for sending to hard storage every now and then
            if len(recentReqs) > reqPickleFrequency and pickleMe(recentReqs, "Requests", dateStamp=True):
                recentReqs.clear()
            #userCounter.update(tweet['user']['screen_name'])
    # DMreceipt bad idea as it still counts against rate limit
    #for ht in tweet['entities']['hashtags']:
    #    if ht['text']=="NBreceipt" and not rct:
    #        sendReceipt(tweet,theWord,tt)
    #        rct=True
def html2tele(html):
    """
    Run ``html`` through ``_HTMLToText`` and tidy the whitespace of the result.

    After conversion, a newline followed by whitespace containing at least
    one more newline is reduced to exactly two newlines, and spaces directly
    before ``<pre>`` or directly after ``</pre>`` are removed.

    :param html: markup to convert.
    :return: the converted, whitespace-normalised text.
    """
    converter = _HTMLToText()
    converter.feed(html)
    converter.close()

    # (pattern, replacement) pairs, applied in this order.
    cleanups = (
        (r'\n(\s*\n+)', '\n\n'),
        (r' +<pre>', '<pre>'),
        (r'</pre> +', '</pre>'),
    )
    text = converter.get_text()
    for pattern, replacement in cleanups:
        text = re.sub(pattern, replacement, text)
    return text

#----------
def simple_parse_to_segments(html, debug=False, **kwargs):
    """
    Feed ``html`` to a ``simpleHTMLParser`` and return what ``feed`` returns.

    The markup is first passed through ``fix_urls`` and then wrapped in an
    ``<html>`` element.

    :param html: markup fragment to parse.
    :param debug: forwarded to the ``simpleHTMLParser`` constructor.
    :param kwargs: accepted but not used here.
    """
    # html.parser seems to ignore the final entityref without html closure
    wrapped = '<html>' + fix_urls(html) + '</html>'
    segment_parser = simpleHTMLParser(debug)
    return segment_parser.feed(wrapped)
def on_success(self, tweet):
    """
    Handle one item delivered by the stream.

    Items without a ``'text'`` entry and retweets (items that have a
    ``'retweeted_status'`` entry) are ignored. Anything else is logged and,
    unless its ``'id_str'`` is already in ``recentIDDeque``, passed to
    ``processIncomingTweet`` and remembered in ``recentIDDeque``.

    :param tweet: mapping describing the incoming item.
    """
    global recentIDDeque
    if 'text' in tweet and 'retweeted_status' not in tweet:
        # html.unescape replaces HTMLParser().unescape(), which was removed
        # in Python 3.9.
        print("<<<<<<<<<<<<<<<<<<< Incoming!<<<<<<<<<<<<<<<<<< " + html.unescape(tweet['text']) + tweet['id_str'])
        if tweet['id_str'] not in recentIDDeque:
            processIncomingTweet(tweet)
            recentIDDeque.appendleft(tweet['id_str'])
        else:
            print("!!!! duplicate! Ignored ")
    # NOTE(review): without a `global backOffTime` declaration this only binds
    # a local that is never read - confirm whether a module-level back-off
    # timer was meant to be reset here.
    backOffTime = 60
def processIncomingTweet(tweet):
    """
    Route a tweet that has come in via the filter stream.

    * Tagged "NixieBotShowMe": if a word can be extracted from the
      (entity-unescaped) text, or the tweet contains a command, it is put on
      the word queue, ``botState['maxWordQ']`` is raised if the queue is now
      longer than before, and the tweet is appended to ``recentReqs``. Once
      ``recentReqs`` grows beyond ``reqPickleFrequency`` entries it is handed
      to ``pickleMe`` and emptied if that call reports success.
    * Tagged "NixieBotRollMe": the tweet is put on ``rollq``.
    * Anything else is forwarded to ``randstream.on_success``.

    :param tweet: mapping with at least ``'text'`` and
        ``'user' -> 'screen_name'`` entries.
    """
    # print(tweet)
    if scanTags(tweet, "NixieBotShowMe"):
        # html.unescape replaces HTMLParser().unescape(), which was removed
        # in Python 3.9.
        theWord = extractWord(html.unescape(tweet['text']))
        if theWord is not None or hasCommand(tweet):
            wordqPut(tweet, priority=prioritise(tweet))
            size = wordq.qsize()
            if size > botState['maxWordQ']:
                botState['maxWordQ'] = size
            print("word request from", tweet['user']['screen_name'], "word = ", theWord, " Word queue at:", size, "maxqueue was ", botState['maxWordQ'])
            recentReqs.append(tweet)  # store for sending to hard storage every now and then
            if len(recentReqs) > reqPickleFrequency and pickleMe(recentReqs, "Requests", dateStamp=True):
                recentReqs.clear()
            #userCounter.update(tweet['user']['screen_name'])
    elif scanTags(tweet, "NixieBotRollMe"):
        rollq.put(tweet)
        print("roll request incoming! Word queue at:", rollq.qsize())
    else:
        #must be a trump tweet so submit to random for now
        randstream.on_success(tweet)
    # DMreceipt bad idea as it still counts against rate limit
    #for ht in tweet['entities']['hashtags']:
    #    if ht['text']=="NBreceipt" and not rct:
    #        sendReceipt(tweet,theWord,tt)
    #        rct=True
def on_success(self, tweet):
    """
    Handle one item delivered by the stream.

    Items without a ``'text'`` entry and retweets (items that have a
    ``'retweeted_status'`` entry) are ignored; anything else is logged and
    passed to ``processIncomingTweet``.

    :param tweet: mapping describing the incoming item.
    """
    if 'text' in tweet and 'retweeted_status' not in tweet:
        # html.unescape replaces HTMLParser().unescape(), which was removed
        # in Python 3.9.
        print("<<<<<<<<<<<<<<<<<<< Incoming!<<<<<<<<<<<<<<<<<< " + html.unescape(tweet['text']))
        processIncomingTweet(tweet)
    # NOTE(review): without a `global backOffTime` declaration this only binds
    # a local that is never read - confirm whether a module-level back-off
    # timer was meant to be reset here.
    backOffTime = 60
def test_html_import(self):
    """Smoke test: ``html`` and its ``entities``/``parser`` submodules import."""
    # The imports themselves are the test; any failure raises ImportError
    # before the trivial assertion below is reached.
    import html.parser
    import html.entities
    import html
    self.assertTrue(True)
def parse_args():
    """
    Parse the command line and make sure a ToC-making tool is available.

    When ``--toc-maker`` is not given, ``./gh-md-toc`` is used. If that file
    does not exist, the 0.6.0 release archive for the platform reported by
    ``uname -s`` is fetched with ``wget`` (unless the archive is already
    present), unpacked with ``tar``, the archive is deleted, and the user
    execute bit is added to the tool.

    :return: ``(known_args, filepath)`` - the parsed namespace and the first
        unrecognised argument, taken as the path of the file to work with.
    :raises EnvironmentError: if downloading or unpacking the tool fails.
    :raises SystemExit: with code 1 when no file path was supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--toc-maker", help="path to ToC making tool")
    parser.add_argument("--twitter-poster", default="t update", help="twitter poster command")
    parser.add_argument("-t", "--use-twitter", action="store_true")
    known_args, unknown_args = parser.parse_known_args()

    if not known_args.toc_maker:
        known_args.toc_maker = "./gh-md-toc"
        if not os.path.isfile(known_args.toc_maker):
            system = cmd.getoutput("uname -s").lower()
            archive = "gh-md-toc.%s.amd64.tgz" % system
            url = "https://github.com/ekalinin/github-markdown-toc.go/releases/download/0.6.0/%s" % archive
            if not os.path.isfile(archive):
                if cmd.getstatusoutput("wget %s" % url)[0] != 0:
                    raise EnvironmentError("Cannot download toc maker from URL: %s" % url)
            if cmd.getstatusoutput("tar xzf %s" % archive)[0] != 0:
                raise EnvironmentError("Cannot untar toc maker from file %s" % archive)
            os.remove(archive)

            current_permissions = stat.S_IMODE(os.lstat(known_args.toc_maker).st_mode)
            # OR the execute bit in; AND-ing would clear every other
            # permission bit (and leave the mode at 0 if exec was not set).
            os.chmod(known_args.toc_maker, current_permissions | stat.S_IXUSR)

    if not unknown_args:
        print("You should specify the path for file to work with!")
        # quit() is an interactive helper installed by the site module;
        # raising SystemExit directly has the same effect everywhere.
        raise SystemExit(1)
    filepath = unknown_args[0]

    return known_args, filepath
def test_with_deleted_parent(self):
    """Reloading ``html.parser`` after ``html`` was dropped from
    ``sys.modules`` must raise an ImportError that mentions ``html``."""
    # see #18681
    from html import parser
    parent_pkg = sys.modules.pop('html')
    # Put the parent package back once the test is over.
    self.addCleanup(sys.modules.__setitem__, 'html', parent_pkg)
    with self.assertRaisesRegex(ImportError, 'html'):
        imp.reload(parser)
def __init__(self):
    '''Create a parser for the HTMLNode class.'''
    # Initialise the base class before adding our own state.
    super().__init__()
    # No root yet and an empty stack.
    self.root, self.stack = None, []
def parse(html):
    '''Parse the HTML text `html` and return the root of the tree.'''
    tree_builder = _MyHTMLParser()
    tree_builder.feed(html)
    return tree_builder.root
def loadUserFont(fontfile) :
    """
    Read a font definition file and send every glyph to the device on ``com``.

    Each non-blank line of ``fontfile`` is split on ","; the first field is
    parsed as a hexadecimal bit pattern and the character is taken from the
    text after the first "-" in the second field. The entries for "-", ","
    and "~" are then overwritten with fixed patterns because the splitting
    above cannot represent them.

    While holding ``comLock`` the function writes one ``$B7F`` command, then
    for each glyph a ``$B7W`` command carrying the glyph and its pattern as
    15 "0"/"1" characters (most significant bit first), followed after a
    0.3 s pause by a ``$B7M`` command with the glyph repeated ``tubes``
    times. A blank glyph is sent last. Finally the effect settings saved on
    entry are restored and the collected characters are written to
    ``uCharSet.txt``.

    NOTE(review): the block nesting below (in particular where the
    ``with comLock`` body ends) was inferred from the statement sequence -
    confirm against the original file.

    :param fontfile: path of the font definition file.
    """
    #load in font file generated from online font designer at http://b7971.lucsmall.com/
    #lines should look like: 0x7622, // 0 - A
    #and the bit order should be reversed using the button at the top of that page
    global comLock
    global userProperChars
    font = {}
    # remember the current effect settings so they can be restored at the end
    stashfx = effx
    stashspeed = fxspeed
    setEffex(0,0)
    userProperChars = ""
    print("loading font")
    with open(fontfile) as ff :
        for line in ff :
            if line == '\n' : continue # cope with blank at end of file
            parts = line.split(",")
            print("parts = ",parts)
            bits = parts[0]
            # the glyph is whatever follows the first "-" in the comment field
            letter = parts[1].split("-")[1].strip()
            bitval = int(bits,16)
            print(bitval,letter)
            font[letter] = bitval
    font['-'] = 0x0022 #nasty hack as hyphen entry is broken by the split("-")
    font[','] = 0x0004 # ditto for comma
    font['~'] = 0x1310 # and tilde
    print(len(font)," characters loaded, now sending")
    with comLock :
        print("loadfont got comlock")
        cmd = "$B7F" + "U" * tubes
        print(cmd)
        com.write(bytes(cmd+"\r","utf-8"))
        for glyph in font:
            userProperChars = userProperChars + glyph
            cmd="$B7W"+glyph
            # walk a single set bit from 0x4000 down to 0x0001: 15 bits, MSB first
            mask =int('0b0100000000000000',2)
            while mask > 0 :
                if int(font[glyph]) & int(mask) > 0 :
                    cmd = cmd + "1"
                else :
                    cmd = cmd + "0"
                mask = mask >> 1
            print(cmd)
            com.write(bytes(cmd+"\r","utf-8"))
            time.sleep(0.3)
            cmd="$B7M"+ glyph * tubes
            print(cmd)
            com.write(bytes(cmd+"\r","utf-8"))
        # special case (ok, bodge!) for space as the strip command in the font file parser above will remove it, and all fonts need a space
        cmd="$B7W 000000000000000"
        print(cmd)
        com.write(bytes(cmd+"\r","utf-8"))
        # NOTE(review): unlike the loop above, this sends a single space
        # rather than the character repeated `tubes` times - confirm intended.
        cmd="$B7M "
        print(cmd)
        com.write(bytes(cmd+"\r","utf-8"))
        userProperChars = userProperChars + " "
    setEffex(stashfx,stashspeed)
    # now write out character set file ( used by proper() )
    with open("uCharSet.txt",'w' ) as cf :
        cf.write(userProperChars)
    print("loadfont rel comlock")
def test_future_moves(self):
    """
    Check that every name we claim to provide can be imported through the
    ``future.moves`` interface. (Issue #104).
    """
    # collections: Counter and OrderedDict are backported to Py2.6
    from future.moves.collections import (
        Counter,
        OrderedDict,
        UserDict,
        UserList,
        UserString,
    )
    from future.moves import configparser
    from future.moves import copyreg
    from future.moves.itertools import filterfalse, zip_longest

    from future.moves import html
    import future.moves.html.entities
    import future.moves.html.parser

    from future.moves import http
    import future.moves.http.client
    import future.moves.http.cookies
    import future.moves.http.cookiejar
    import future.moves.http.server

    from future.moves import queue
    from future.moves import socketserver

    # check_output is available even on Py2.6
    from future.moves.subprocess import (
        check_output,
        getoutput,
        getstatusoutput,
    )
    from future.moves.sys import intern

    from future.moves import urllib
    import future.moves.urllib.error
    import future.moves.urllib.parse
    import future.moves.urllib.request
    import future.moves.urllib.response
    import future.moves.urllib.robotparser

    # Only when Py2's _winreg exists must future.moves.winreg exist as well.
    try:
        import _winreg
    except ImportError:
        pass
    else:
        from future.moves import winreg

    from future.moves import xmlrpc
    import future.moves.xmlrpc.client
    import future.moves.xmlrpc.server

    from future.moves import _dummy_thread
    from future.moves import _markupbase
    from future.moves import _thread