我们从Python开源项目中,提取了以下47个代码示例,用于说明如何使用bs4.dammit.EncodingDetector()。
def prepare_markup(self, markup, user_specified_encoding=None, exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for parsing the document. """ # Instead of using UnicodeDammit to convert the bytestring to # Unicode using different encodings, use EncodingDetector to # iterate over the encodings, and tell lxml to try to parse # the document as each one in turn. is_html = not self.is_xml if is_html: self.processing_instruction_class = ProcessingInstruction else: self.processing_instruction_class = XMLProcessingInstruction if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False)
def prepare_markup(self, markup, user_specified_encoding=None, exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for parsing the document. """ if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False if isinstance(markup, unicode): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) # Instead of using UnicodeDammit to convert the bytestring to # Unicode using different encodings, use EncodingDetector to # iterate over the encodings, and tell lxml to try to parse # the document as each one in turn. is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False)
def prepare_markup(self, markup, user_specified_encoding=None, exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for parsing the document. """ # Instead of using UnicodeDammit to convert the bytestring to # Unicode using different encodings, use EncodingDetector to # iterate over the encodings, and tell lxml to try to parse # the document as each one in turn. is_html = not self.is_xml if is_html: self.processing_instruction_class = ProcessingInstruction else: self.processing_instruction_class = XMLProcessingInstruction if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False if isinstance(markup, unicode): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False)
def prepare_markup(self, markup, user_specified_encoding=None, exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for parsing the document. """ if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) # Instead of using UnicodeDammit to convert the bytestring to # Unicode using different encodings, use EncodingDetector to # iterate over the encodings, and tell lxml to try to parse # the document as each one in turn. is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False)
def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for parsing the document. """ if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False if isinstance(markup, unicode): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) # Instead of using UnicodeDammit to convert the bytestring to # Unicode using different encodings, use EncodingDetector to # iterate over the encodings, and tell lxml to try to parse # the document as each one in turn. is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector(markup, try_encodings, is_html) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False)
def prepare_markup(self, markup, user_specified_encoding=None, exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. (markup, encoding, declared encoding, has undergone character replacement) Each 4-tuple represents a strategy for parsing the document. """ if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False if isinstance(markup, unicode): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) # Instead of using UnicodeDammit to convert the bytestring to # Unicode using different encodings, use EncodingDetector to # iterate over the encodings, and tell lxml to try to parse # the document as each one in turn. is_html = not self.is_xml if is_html: self.processing_instruction_class = ProcessingInstruction else: self.processing_instruction_class = XMLProcessingInstruction try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False)