Python unicodedata 模块,combining() 实例源码


项目:aiodownload    作者:jelloslinger    | 项目源码 | 文件源码
def clean_filename(filename):
    """Return a sanitized filename (replace / strip out illegal characters)

    Each character is first mapped through ``REPLACEMENT_CHAR``, the result is
    decomposed with NFKD so accents become separate combining marks, and only
    ASCII letters, digits and the characters ``-_.() `` are kept.

    :param filename: string used for a filename
    :type filename: str

    :return: sanitized filename
    :rtype: str
    """
    # Build the whitelist once instead of re-formatting it for every character.
    allowed = frozenset('-_.() {0}{1}'.format(string.ascii_letters, string.digits))

    replaced = ''.join([REPLACEMENT_CHAR.get(c, c) for c in filename])
    return ''.join([
        c for c in unicodedata.normalize('NFKD', replaced)
        if not unicodedata.combining(c) and c in allowed
    ])
项目:linter    作者:ethz-asl    | 项目源码 | 文件源码
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.
  """
  try:
    text_type = unicode  # Python 2 builtin, or a module-level alias
  except NameError:
    text_type = str  # Python 3: every str is Unicode text
  if not isinstance(line, text_type):
    # Byte strings are measured by their raw length.
    return len(line)
  width = 0
  for uc in unicodedata.normalize('NFC', line):
    if unicodedata.east_asian_width(uc) in ('W', 'F'):
      width += 2  # wide / fullwidth characters take two columns
    elif not unicodedata.combining(uc):
      width += 1  # combining marks add no column of their own
  return width
项目:CodingDojo    作者:ComputerSocietyUNB    | 项目源码 | 文件源码
def chars(self, num, truncate=None, html=False):
        """
        Return the text truncated to be no longer than the specified number
        of characters.

        Takes an optional argument of what should be used to notify that the
        string has been truncated, defaulting to a translatable string of an
        ellipsis (...).

        :param num: maximum length; coerced with ``int()``.
        :param truncate: truncation marker, passed through to
            ``self.add_truncation_text``.
        :param html: when true, delegate to ``self._truncate_html`` instead
            of ``self._text_chars``.
        """
        length = int(num)
        text = unicodedata.normalize('NFC', self._wrapped)

        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in self.add_truncation_text('', truncate):
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    # The marker alone uses up the budget; stop before the
                    # count goes negative.
                    break
        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len)
项目:CodingDojo    作者:ComputerSocietyUNB    | 项目源码 | 文件源码
def _text_chars(self, length, truncate, text, truncate_len):
        Truncates a string after a certain number of chars.
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
            s_len += 1
            if end_index is None and s_len > truncate_len:
                end_index = i
            if s_len > length:
                # Return the truncated string
                return self.add_truncation_text(text[:end_index or 0],

        # Return the original string since no truncation was necessary
        return text
项目:notebooks    作者:fluentpython    | 项目源码 | 文件源码
def shave_marks_latin(txt):
    """Strip diacritics from Latin letters, leaving other scripts untouched.

    The text is decomposed (NFD), combining marks that follow an ASCII
    letter are dropped, and the remainder is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    after_latin = False
    for ch in decomposed:
        if unicodedata.combining(ch):
            # A mark survives only when its base character is not Latin.
            if not after_latin:
                kept.append(ch)
            continue
        # Any non-combining character starts a new base.
        kept.append(ch)
        after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))

项目:2016CCF_BDCI_Sougou    作者:coderSkyChen    | 项目源码 | 文件源码
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    The input is decomposed with NFKD and every combining mark is dropped.
    This is done unconditionally: a string that is already in decomposed
    form equals its own NFKD normalization, so short-circuiting on that
    equality would return it with its accents still attached.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join([c for c in normalized if not unicodedata.combining(c)])
项目:2016CCF-sougou    作者:prozhuchen    | 项目源码 | 文件源码
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    The input is decomposed with NFKD and every combining mark is dropped.
    This is done unconditionally: a string that is already in decomposed
    form equals its own NFKD normalization, so short-circuiting on that
    equality would return it with its accents still attached.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join([c for c in normalized if not unicodedata.combining(c)])
项目:oil    作者:oilshell    | 项目源码 | 文件源码
def calibrate(self):
            """Run ``self.rounds`` iterations that only pick a sample character.

            NOTE(review): presumably this mirrors the setup of a companion
            timing test so that loop overhead can be subtracted - confirm
            against the rest of the class.
            """

            # Sample characters cycled through by the loop below.
            data = (u'a', u'1', u' ', u'\u1234', u'\uFFFF')
            len_data = len(data)
            # Bind the unicodedata property functions to local names.
            # None of them is called in this method.
            digit = unicodedata.digit
            numeric = unicodedata.numeric
            decimal = unicodedata.decimal
            category = unicodedata.category
            bidirectional = unicodedata.bidirectional
            decomposition = unicodedata.decomposition
            mirrored = unicodedata.mirrored
            combining = unicodedata.combining

            for i in xrange(self.rounds):

                # Select the next sample character; nothing else is done.
                c = data[i % len_data]
项目:python2-tracer    作者:extremecoders-re    | 项目源码 | 文件源码
def calibrate(self):
            """Run ``self.rounds`` iterations that only pick a sample character.

            NOTE(review): presumably this mirrors the setup of a companion
            timing test so that loop overhead can be subtracted - confirm
            against the rest of the class.
            """

            # Sample characters cycled through by the loop below.
            data = (u'a', u'1', u' ', u'\u1234', u'\uFFFF')
            len_data = len(data)
            # Bind the unicodedata property functions to local names.
            # None of them is called in this method.
            digit = unicodedata.digit
            numeric = unicodedata.numeric
            decimal = unicodedata.decimal
            category = unicodedata.category
            bidirectional = unicodedata.bidirectional
            decomposition = unicodedata.decomposition
            mirrored = unicodedata.mirrored
            combining = unicodedata.combining

            for i in xrange(self.rounds):

                # Select the next sample character; nothing else is done.
                c = data[i % len_data]
项目:haros_plugins    作者:git-afsantos    | 项目源码 | 文件源码
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.
  """
  try:
    text_type = unicode  # Python 2 builtin, or a module-level alias
  except NameError:
    text_type = str  # Python 3: every str is Unicode text
  if not isinstance(line, text_type):
    # Byte strings are measured by their raw length.
    return len(line)
  width = 0
  for uc in unicodedata.normalize('NFC', line):
    if unicodedata.east_asian_width(uc) in ('W', 'F'):
      width += 2  # wide / fullwidth characters take two columns
    elif not unicodedata.combining(uc):
      width += 1  # combining marks add no column of their own
  return width
项目:lifesoundtrack    作者:MTG    | 项目源码 | 文件源码
def chars(self, num, truncate=None, html=False):
        """
        Return the text truncated to be no longer than the specified number
        of characters.

        Takes an optional argument of what should be used to notify that the
        string has been truncated, defaulting to a translatable string of an
        ellipsis (...).

        :param num: maximum length; coerced with ``int()``.
        :param truncate: truncation marker, passed through to
            ``self.add_truncation_text``.
        :param html: when true, delegate to ``self._truncate_html`` instead
            of ``self._text_chars``.
        """
        length = int(num)
        text = unicodedata.normalize('NFC', self._wrapped)

        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in self.add_truncation_text('', truncate):
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    # The marker alone uses up the budget; stop before the
                    # count goes negative.
                    break
        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len)
项目:lifesoundtrack    作者:MTG    | 项目源码 | 文件源码
def _text_chars(self, length, truncate, text, truncate_len):
        Truncates a string after a certain number of chars.
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
            s_len += 1
            if end_index is None and s_len > truncate_len:
                end_index = i
            if s_len > length:
                # Return the truncated string
                return self.add_truncation_text(text[:end_index or 0],

        # Return the original string since no truncation was necessary
        return text
项目:LeetCode    作者:YJL33    | 项目源码 | 文件源码
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.
  """
  try:
    text_type = unicode  # Python 2 builtin, or a module-level alias
  except NameError:
    text_type = str  # Python 3: every str is Unicode text
  if not isinstance(line, text_type):
    # Byte strings are measured by their raw length.
    return len(line)
  width = 0
  for uc in unicodedata.normalize('NFC', line):
    if unicodedata.east_asian_width(uc) in ('W', 'F'):
      width += 2  # wide / fullwidth characters take two columns
    elif not unicodedata.combining(uc):
      width += 1  # combining marks add no column of their own
  return width
项目:2016_CCFsougou    作者:dhdsjy    | 项目源码 | 文件源码
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    The input is decomposed with NFKD and every combining mark is dropped.
    This is done unconditionally: a string that is already in decomposed
    form equals its own NFKD normalization, so short-circuiting on that
    equality would return it with its accents still attached.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join([c for c in normalized if not unicodedata.combining(c)])
项目:OpenSky_BL    作者:fishpepper    | 项目源码 | 文件源码
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.
  """
  try:
    text_type = unicode  # Python 2 builtin, or a module-level alias
  except NameError:
    text_type = str  # Python 3: every str is Unicode text
  if not isinstance(line, text_type):
    # Byte strings are measured by their raw length.
    return len(line)
  width = 0
  for uc in unicodedata.normalize('NFC', line):
    if unicodedata.east_asian_width(uc) in ('W', 'F'):
      width += 2  # wide / fullwidth characters take two columns
    elif not unicodedata.combining(uc):
      width += 1  # combining marks add no column of their own
  return width
项目:muvio.bundle    作者:piplongrun    | 项目源码 | 文件源码
def ArtistName(artist):
  """Return the artist name with diacritics stripped.

  Args:
    artist: The name, either as UTF-8 encoded bytes or as text.

  Returns:
    The NFKD-normalized name with all combining marks removed.
  """
  try:
    text = artist.decode('utf-8')
  except (AttributeError, UnicodeError):
    # Already text: a Python 3 str has no .decode(), and decoding a
    # Python 2 unicode object holding non-ASCII characters fails.
    text = artist

  normalized = unicodedata.normalize('NFKD', text)

  # Strip diacritics
  return u''.join(point for point in normalized
                  if not unicodedata.combining(point))

项目:grasp    作者:textgain    | 项目源码 | 文件源码
def destress(s, replace={}):
    """ Returns the string with no diacritics.

    Caller-supplied ``replace`` pairs are applied first, then a fixed table
    of characters that NFKD decomposition does not reduce to ASCII
    (ligatures, curly quotes, inverted punctuation). Finally the string is
    decomposed and every combining mark is dropped.

    The default ``replace`` dict is only read, never mutated.
    """
    for k, v in replace.items():
        s = s.replace(k, v)
    for k, v in (
     (u'ø' , 'o' ),
     (u'ß' , 'ss'),
     (u'œ' , 'oe'),  # oe ligature
     (u'æ' , 'ae'),  # ae ligature
     (u'“' , '"' ),
     (u'”' , '"' ),
     (u'‘' , "'" ),
     (u'’' , "'" ),
     (u'\u2044', '/'),  # fraction slash; a literal '?' key would eat question marks
     (u'¿' , '?' ),
     (u'¡' , '!' )):
        s = s.replace(k, v)
    f = unicodedata.combining             # non-zero for combining marks
    s = unicodedata.normalize('NFKD', s)  # split base characters from marks
    s = ''.join(ch for ch in s if not f(ch))
    return s

# print(destress(u'pâté')) # 'pate'
项目:Chromium_DepotTools    作者:p07r0457    | 项目源码 | 文件源码
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.
  """
  try:
    text_type = unicode  # Python 2 builtin, or a module-level alias
  except NameError:
    text_type = str  # Python 3: every str is Unicode text
  if not isinstance(line, text_type):
    # Byte strings are measured by their raw length.
    return len(line)
  width = 0
  for uc in unicodedata.normalize('NFC', line):
    if unicodedata.east_asian_width(uc) in ('W', 'F'):
      width += 2  # wide / fullwidth characters take two columns
    elif not unicodedata.combining(uc):
      width += 1  # combining marks add no column of their own
  return width
项目:liberator    作者:libscie    | 项目源码 | 文件源码
def chars(self, num, truncate=None, html=False):
        """
        Return the text truncated to be no longer than the specified number
        of characters.

        Takes an optional argument of what should be used to notify that the
        string has been truncated, defaulting to a translatable string of an
        ellipsis (...).

        :param num: maximum length; coerced with ``int()``.
        :param truncate: truncation marker, passed through to
            ``self.add_truncation_text``.
        :param html: when true, delegate to ``self._truncate_html`` instead
            of ``self._text_chars``.
        """
        length = int(num)
        text = unicodedata.normalize('NFC', self._wrapped)

        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in self.add_truncation_text('', truncate):
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    # The marker alone uses up the budget; stop before the
                    # count goes negative.
                    break
        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len)
项目:liberator    作者:libscie    | 项目源码 | 文件源码
def _text_chars(self, length, truncate, text, truncate_len):
        Truncates a string after a certain number of chars.
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
            s_len += 1
            if end_index is None and s_len > truncate_len:
                end_index = i
            if s_len > length:
                # Return the truncated string
                return self.add_truncation_text(text[:end_index or 0],

        # Return the original string since no truncation was necessary
        return text
项目:2016_CCFsougou2    作者:dhdsjy    | 项目源码 | 文件源码
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    The input is decomposed with NFKD and every combining mark is dropped.
    This is done unconditionally: a string that is already in decomposed
    form equals its own NFKD normalization, so short-circuiting on that
    equality would return it with its accents still attached.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join([c for c in normalized if not unicodedata.combining(c)])
项目:node-gn    作者:Shouqun    | 项目源码 | 文件源码
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.
  """
  try:
    text_type = unicode  # Python 2 builtin, or a module-level alias
  except NameError:
    text_type = str  # Python 3: every str is Unicode text
  if not isinstance(line, text_type):
    # Byte strings are measured by their raw length.
    return len(line)
  width = 0
  for uc in unicodedata.normalize('NFC', line):
    if unicodedata.east_asian_width(uc) in ('W', 'F'):
      width += 2  # wide / fullwidth characters take two columns
    elif not unicodedata.combining(uc):
      width += 1  # combining marks add no column of their own
  return width
项目:djanoDoc    作者:JustinChavez    | 项目源码 | 文件源码
def chars(self, num, truncate=None, html=False):
        """
        Return the text truncated to be no longer than the specified number
        of characters.

        Takes an optional argument of what should be used to notify that the
        string has been truncated, defaulting to a translatable string of an
        ellipsis (...).

        :param num: maximum length; coerced with ``int()``.
        :param truncate: truncation marker, passed through to
            ``self.add_truncation_text``.
        :param html: when true, delegate to ``self._truncate_html`` instead
            of ``self._text_chars``.
        """
        length = int(num)
        text = unicodedata.normalize('NFC', self._wrapped)

        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in self.add_truncation_text('', truncate):
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    # The marker alone uses up the budget; stop before the
                    # count goes negative.
                    break
        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len)
项目:djanoDoc    作者:JustinChavez    | 项目源码 | 文件源码
def _text_chars(self, length, truncate, text, truncate_len):
        Truncates a string after a certain number of chars.
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
            s_len += 1
            if end_index is None and s_len > truncate_len:
                end_index = i
            if s_len > length:
                # Return the truncated string
                return self.add_truncation_text(text[:end_index or 0],

        # Return the original string since no truncation was necessary
        return text
项目:ln2sql    作者:FerreroJeremy    | 项目源码 | 文件源码
def remove_accents(self, string):
        """Return ``str(string)`` with all combining marks removed.

        The value is converted with ``str()``, decomposed with NFKD, and
        only characters whose combining class is zero are kept.
        """
        decomposed = unicodedata.normalize('NFKD', str(string))
        base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
        return "".join(base_chars)
项目:ln2sql    作者:FerreroJeremy    | 项目源码 | 文件源码
def remove_accents(self, string):
        """Return ``str(string)`` with all combining marks removed.

        The value is converted with ``str()``, decomposed with NFKD, and
        only characters whose combining class is zero are kept.
        """
        decomposed = unicodedata.normalize('NFKD', str(string))
        base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
        return "".join(base_chars)
项目:ln2sql    作者:FerreroJeremy    | 项目源码 | 文件源码
def remove_accents(self, string):
        """Return ``str(string)`` with all combining marks removed.

        The value is converted with ``str()``, decomposed with NFKD, and
        only characters whose combining class is zero are kept.
        """
        decomposed = unicodedata.normalize('NFKD', str(string))
        base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
        return "".join(base_chars)
项目:ln2sql    作者:FerreroJeremy    | 项目源码 | 文件源码
def remove_accents(self, string):
        """Return ``str(string)`` with all combining marks removed.

        The value is converted with ``str()``, decomposed with NFKD, and
        only characters whose combining class is zero are kept.
        """
        decomposed = unicodedata.normalize('NFKD', str(string))
        base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
        return "".join(base_chars)
项目:my-first-blog    作者:AnkurBegining    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:my-first-blog    作者:AnkurBegining    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:googletranslate.popclipext    作者:wizyoung    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:googletranslate.popclipext    作者:wizyoung    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:pip-update-requirements    作者:alanhamlett    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:pip-update-requirements    作者:alanhamlett    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:code    作者:ActiveState    | 项目源码 | 文件源码
def remove_accents(s):
    """Return ``s`` with every combining mark removed.

    The string is decomposed with NFKD and only characters whose combining
    class is zero are kept.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch) == 0:
            kept.append(ch)
    return u''.join(kept)
项目:noc-orchestrator    作者:DirceuSilvaLabs    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:noc-orchestrator    作者:DirceuSilvaLabs    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:jira_worklog_scanner    作者:pgarneau    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:jira_worklog_scanner    作者:pgarneau    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:workflows.kyoyue    作者:wizyoung    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:workflows.kyoyue    作者:wizyoung    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:purelove    作者:hucmosin    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:purelove    作者:hucmosin    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:centos-base-consul    作者:zeroc0d3lab    | 项目源码 | 文件源码
def strwidth_ucs_4(width_data, string):
    """Return the display width of ``string``, one code point per symbol.

    :param width_data: mapping from an East Asian Width class (as returned
        by ``east_asian_width``) to a column count.
    :param string: the text to measure.
    :return: sum of the per-symbol widths; combining marks count as zero.
    """
    return sum(
        0 if combining(symbol) else width_data[east_asian_width(symbol)]
        for symbol in string
    )
项目:centos-base-consul    作者:zeroc0d3lab    | 项目源码 | 文件源码
def strwidth_ucs_2(width_data, string):
    """Return the display width of ``string`` when it may hold surrogate pairs.

    :param width_data: mapping from an East Asian Width class (as returned
        by ``east_asian_width``) to a column count.
    :param string: the text to measure.
    :return: sum of the per-symbol widths. Combining marks and high
        surrogates count as zero; a low surrogate is measured together
        with the symbol immediately before it.
    """
    return sum(
        (
            # Low surrogate: look up the pair formed with the previous symbol.
            width_data[east_asian_width(string[i - 1] + symbol)]
        ) if 0xDC00 <= ord(symbol) <= 0xDFFF else (
            # Combining mark or high surrogate: no width of its own.
            0
        ) if combining(symbol) or 0xD800 <= ord(symbol) <= 0xDBFF else (
            width_data[east_asian_width(symbol)]
        )
        for i, symbol in enumerate(string)
    )
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def _combining_class(cp):
    """Return the canonical combining class of the code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
项目:harbour-sailfinder    作者:DylanVanAssche    | 项目源码 | 文件源码
def check_initial_combiner(label):
    """Reject a label whose first character is a combining mark.

    Returns True when the first character's general category is not a
    Mark category; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
项目:stegator    作者:1modm    | 项目源码 | 文件源码
def remove_accents(self, input_str):
        """Return ``input_str`` as text with all combining marks removed.

        :param input_str: UTF-8 encoded bytes or an already-decoded text
            string. Undecodable bytes are silently dropped.
        :return: the NFKD-normalized text without combining characters.
        """
        if isinstance(input_str, (bytes, bytearray)):
            text = input_str.decode('utf-8', 'ignore')
        else:
            # Already text; decoding it again would raise.
            text = input_str
        nkfd_form = unicodedata.normalize('NFKD', text)
        return u"".join(c for c in nkfd_form if not unicodedata.combining(c))