我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用bs4.dammit.contains_replacement_characters()。
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet try: bs4.dammit.chardet = None with warnings.catch_warnings(record=True) as w: dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) msg = w[0].message self.assertTrue(isinstance(msg, UnicodeWarning)) self.assertTrue("Some characters could not be decoded" in str(msg)) finally: bs4.dammit.chardet = chardet
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet