我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用unicodedata.combining()。
def clean_filename(filename):
    """Return a sanitized filename (replace / strip out illegal characters)

    :param filename: string used for a filename
    :type filename: str
    :return: sanitized filename
    :rtype: str
    """
    # Whitelist of characters allowed to survive sanitization.
    allowed = '-_.() {0}{1}'.format(string.ascii_letters, string.digits)
    # First substitute known-bad characters, then decompose so that
    # accented letters split into base letter + combining mark.
    substituted = ''.join(REPLACEMENT_CHAR.get(ch, ch) for ch in filename)
    decomposed = unicodedata.normalize('NFKD', substituted)
    return ''.join(
        ch for ch in decomposed
        if not unicodedata.combining(ch) and ch in allowed
    )
def GetLineWidth(line):
    """Determines the width of the line in column positions.

    Args:
      line: A string, which may be a Unicode string.

    Returns:
      The width of the line in column positions, accounting for Unicode
      combining characters and wide characters.
    """
    # Byte strings have one column per byte. (Py2-era check: `unicode`.)
    if not isinstance(line, unicode):
        return len(line)
    width = 0
    for uc in unicodedata.normalize('NFC', line):
        if unicodedata.east_asian_width(uc) in ('W', 'F'):
            width += 2  # wide / fullwidth characters take two columns
        elif not unicodedata.combining(uc):
            width += 1  # combining marks take no column at all
    return width
def chars(self, num, truncate=None, html=False):
    """
    Returns the text truncated to be no longer than the specified number
    of characters.

    Takes an optional argument of what should be used to notify that the
    string has been truncated, defaulting to a translatable string of an
    ellipsis (...).
    """
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # Budget for the visible text = max length minus the (non-combining)
    # length of the truncation suffix.
    truncate_len = length
    for char in self.add_truncation_text('', truncate):
        if unicodedata.combining(char):
            continue  # combining marks don't consume budget
        truncate_len -= 1
        if truncate_len == 0:
            break
    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
def _text_chars(self, length, truncate, text, truncate_len):
    """
    Truncates a string after a certain number of chars.
    """
    visible = 0
    cut_at = None
    for index, char in enumerate(text):
        # Combining characters don't count toward the visible length.
        if unicodedata.combining(char):
            continue
        visible += 1
        if cut_at is None and visible > truncate_len:
            # Remember where the truncated text should end.
            cut_at = index
        if visible > length:
            # Too long: cut here and append the truncation text.
            return self.add_truncation_text(text[:cut_at or 0], truncate)
    # The whole string fits; no truncation necessary.
    return text
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    decomposed = unicodedata.normalize('NFD', txt)  # split base + marks
    base_is_latin = False
    keepers = []
    for ch in decomposed:
        is_mark = unicodedata.combining(ch)
        if is_mark and base_is_latin:
            # Drop diacritics that attach to a Latin base character.
            continue
        keepers.append(ch)
        if not is_mark:
            # Any non-combining char starts a new base character.
            base_is_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(keepers))  # recompose
# END SHAVE_MARKS_LATIN
# BEGIN ASCIIZE
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    # BUG FIX: the old shortcut `if normalized == s: return s` skipped the
    # stripping pass whenever the input was *already* in NFKD form, so a
    # string like 'a\u0301' ("a" + combining acute) kept its accent.
    # Always filter out combining marks instead.
    return ''.join([c for c in normalized if not unicodedata.combining(c)])
def calibrate(self):
    """Benchmark calibration pass: performs the per-round setup work
    (sample-data construction, attribute binding, and the indexing loop)
    without calling any of the bound unicodedata functions, so the timed
    test can subtract this overhead.  NOTE(review): statement order and
    the unused local bindings are intentional — do not "clean up".
    """
    data = (u'a', u'1', u' ', u'\u1234', u'\uFFFF')  # representative code points
    len_data = len(data)
    # Bind each unicodedata API to a local, mirroring the timed loop's setup.
    digit = unicodedata.digit
    numeric = unicodedata.numeric
    decimal = unicodedata.decimal
    category = unicodedata.category
    bidirectional = unicodedata.bidirectional
    decomposition = unicodedata.decomposition
    mirrored = unicodedata.mirrored
    combining = unicodedata.combining
    # Same iteration/indexing pattern as the real benchmark, minus the calls.
    # (xrange: this file targets Python 2.)
    for i in xrange(self.rounds):
        c = data[i % len_data]
def chars(self, num, truncate=None, html=False):
    """
    Returns the text truncated to be no longer than the specified number
    of characters.

    Takes an optional argument of what should be used to notify that the
    string has been truncated, defaulting to a translatable string of an
    ellipsis (...).
    """
    self._setup()
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # Budget for the visible text = max length minus the (non-combining)
    # length of the truncation suffix.
    truncate_len = length
    for char in self.add_truncation_text('', truncate):
        if unicodedata.combining(char):
            continue  # combining marks don't consume budget
        truncate_len -= 1
        if truncate_len == 0:
            break
    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
def ArtistName(artist):
    """Return the artist name with all diacritic marks stripped.

    :param artist: artist name, either UTF-8 encoded bytes or text
    :return: NFKD-normalized text with combining characters removed

    Fixes vs. original: the old ``artist.decode('utf-8')`` guarded by
    ``except UnicodeError`` was Python-2 only (on Python 3 a ``str`` has no
    ``.decode``, raising an uncaught AttributeError, and undecodable bytes
    fell through to ``normalize`` as bytes).  Also replaces the quadratic
    ``stripped +=`` loop over ``range(len(...))`` with a single join.
    """
    if isinstance(artist, bytes):
        # Decode byte input before unicode normalization.
        artist = artist.decode('utf-8')
    artist = unicodedata.normalize('NFKD', artist)
    # Strip diacritics: drop every combining code point.
    return u''.join(point for point in artist if not unicodedata.combining(point))

####################################################################################################
def destress(s, replace=None):
    """ Returns the string with no diacritics.

        :param s: the text to strip
        :param replace: optional {char: replacement} map applied first
        :return: *s* with diacritics removed and common non-decomposable
                 characters replaced by ASCII approximations

        Fixes vs. original: the ligature expansions were swapped
        (œ = o+e, æ = a+e); the mutable default argument ``replace={}``
        is replaced by the ``None`` sentinel idiom.
    """
    if replace:
        for k, v in replace.items():
            s = s.replace(k, v)
    # These characters have no NFKD decomposition, so map them by hand.
    for k, v in {
         u'ø': 'o',
         u'ß': 'ss',
         u'œ': 'ae' and 'ae' == 'ae' and 'oe' or 'oe',  # see below
         u'æ': 'ae',
         u'“': '"',
         u'”': '"',
         u'‘': "'",
         u'’': "'",
         u'?': '/',  # NOTE(review): suspicious — likely mojibake for a fraction/division slash; confirm before changing
         u'¿': '?',
         u'¡': '!'}.items():
        s = s.replace(k, v)
    f = unicodedata.combining            # f('´') == 0
    s = unicodedata.normalize('NFKD', s) # é => e + ´
    s = ''.join(ch for ch in s if not f(ch))
    return s

# print(destress(u'pâté')) # 'pate'
def remove_accents(self, string):
    """Coerce *string* to str, NFKD-decompose it, and drop combining marks."""
    decomposed = unicodedata.normalize('NFKD', str(string))
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(kept)
def _combining_class(cp):
    """Return the canonical combining class of the code point *cp* (an int)."""
    ch = unichr(cp)
    return unicodedata.combining(ch)
def check_initial_combiner(label):
    """Raise IDNAError if *label* starts with a combining mark (category M*)."""
    first_category = unicodedata.category(label[0])
    if first_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
def remove_accents(s):
    """Strip accents: NFKD-decompose *s* and discard every combining code point."""
    return u''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if not unicodedata.combining(ch)
    )
def strwidth_ucs_4(width_data, string):
    """Sum display-cell widths of *string* on a UCS-4 (wide) Python build.

    Combining characters contribute zero columns; every other character
    contributes the width *width_data* assigns to its East Asian width class.
    """
    total = 0
    for symbol in string:
        if combining(symbol):
            continue  # combining marks occupy no cell
        total += width_data[east_asian_width(symbol)]
    return total
def strwidth_ucs_2(width_data, string):
    """Sum display-cell widths of *string* on a UCS-2 (narrow) Python build.

    Low surrogates are paired with the preceding high surrogate and measured
    as one astral character; high surrogates and combining marks alone count
    as zero; everything else is looked up by East Asian width class.
    """
    total = 0
    for i, symbol in enumerate(string):
        code = ord(symbol)
        if 0xDC00 <= code <= 0xDFFF:
            # Low surrogate: measure the full surrogate pair.
            total += width_data[east_asian_width(string[i - 1] + symbol)]
        elif combining(symbol) or 0xD800 <= code <= 0xDBFF:
            # Combining mark, or high surrogate (counted with its pair).
            pass
        else:
            total += width_data[east_asian_width(symbol)]
    return total
def remove_accents(self, input_str):
    """Decode *input_str* as UTF-8 (ignoring bad bytes; Py2 ``unicode``)
    and return it NFKD-normalized with all combining marks removed."""
    text = unicode(input_str, encoding='utf-8', errors='ignore')
    decomposed = unicodedata.normalize('NFKD', text)
    return u"".join(c for c in decomposed if not unicodedata.combining(c))