The following 50 code examples, extracted from open-source Python projects, illustrate how to use difflib.SequenceMatcher().
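Before the project examples, here is a minimal sketch of the core SequenceMatcher API they all build on (the strings are the standard library documentation's example):

import difflib

# ratio() returns 2.0 * M / T, where M is the number of matched
# characters and T is the total length of both sequences.
m = difflib.SequenceMatcher(None, "abcd", "bcde")
print(m.ratio())  # 0.75 -- "bcd" matches, so 2 * 3 / 8

# get_opcodes() describes how to turn the first sequence into the second.
for tag, i1, i2, j1, j2 in m.get_opcodes():
    print(tag, i1, i2, j1, j2)
# delete 0 1 0 0
# equal 1 4 0 3
# insert 4 4 3 4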
def search(searchTerm, list, keyName: str = None, numMatches: int = 3):
    """Searches the provided list for the searchTerm - using a keyName if provided for dicts."""
    if len(list) < 1:
        return None
    # Iterate through the list and create a list of items
    searchList = []
    for item in list:
        if keyName:
            testName = item[keyName]
        else:
            testName = item
        matchRatio = difflib.SequenceMatcher(None, searchTerm.lower(), testName.lower()).ratio()
        # matchRatio = Levenshtein.ratio(searchTerm.lower(), testName.lower())
        searchList.append({ 'Item' : item, 'Ratio' : matchRatio })
    # sort the results by match ratio, best match first
    searchList = sorted(searchList, key=lambda x: x['Ratio'], reverse=True)
    if numMatches > len(searchList):
        # Less than three - let's just give what we've got
        numMatches = len(searchList)
    return searchList[:numMatches]
def validate(self, password, user=None):
    if not user:
        return

    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def filecompare( self, filename_left, filename_right ):
    if type(filename_left) == type([]):
        lines_left = filename_left
    else:
        try:
            lines_left = wb_read_file.readFileContentsAsUnicode( filename_left ).split('\n')
        except IOError as e:
            print( 'Error opening %s\n%s' % (filename_left, e) )
            return 0

    if type(filename_right) == type([]):
        lines_right = filename_right
    else:
        try:
            lines_right = wb_read_file.readFileContentsAsUnicode( filename_right ).split('\n')
        except IOError as e:
            print( 'Error opening %s\n%s' % (filename_right, e) )
            return 0

    lines_left = [eolRemoval( line ) for line in lines_left]
    lines_right = [eolRemoval( line ) for line in lines_right]

    matcher = difflib.SequenceMatcher( isLineJunk, lines_left, lines_right )
    for tag, left_lo, left_hi, right_lo, right_hi in matcher.get_opcodes():
        if tag == 'replace':
            self.fancy_replace( lines_left, left_lo, left_hi, lines_right, right_lo, right_hi )
        elif tag == 'delete':
            self.dump( self.text_body.addDeletedLine, lines_left, left_lo, left_hi )
        elif tag == 'insert':
            self.dump( self.text_body.addInsertedLine, lines_right, right_lo, right_hi )
        elif tag == 'equal':
            self.dump( self.text_body.addNormalLine, lines_left, left_lo, left_hi )
        else:
            raise ValueError( 'unknown tag ' + str( tag ) )

    self.text_body.addEnd()
    return 1

# need to strip any \n or \r that's on the end of the line
def validate(self, password, user=None):
    if not user:
        return

    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() >= self.max_similarity:
                try:
                    verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                except FieldDoesNotExist:
                    verbose_name = attribute_name
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def get_matching_blocks(self):
    size = min(len(self.a), len(self.b))
    threshold = min(self.threshold, size / 4)
    actual = difflib.SequenceMatcher.get_matching_blocks(self)
    return [item for item in actual
            if item[2] > threshold or not item[2]]
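This excerpt references self.threshold, so it presumably comes from a SequenceMatcher subclass. A minimal self-contained sketch under that assumption (the class name and constructor are hypothetical):

import difflib

class ThresholdSequenceMatcher(difflib.SequenceMatcher):
    """Hypothetical reconstruction: drops matching blocks at or below a size threshold."""

    def __init__(self, a='', b='', threshold=10):
        difflib.SequenceMatcher.__init__(self, None, a, b)
        self.threshold = threshold

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        # keep blocks longer than the threshold, plus the zero-size sentinel
        return [item for item in actual
                if item[2] > threshold or not item[2]]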
def get_scored_matches(word: str, possibilities: List[str], n: int = 3, cutoff: float = 0.6) -> List[Tuple[float, str]]:
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not (0.0 <= cutoff <= 1.0):
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s: SequenceMatcher = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff:
            result.append((s.ratio(), x))

    # Move the best scorers to head of list
    result = heapq.nlargest(n, result)
    # Return the best n matches, keeping their scores
    return result
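A hypothetical call, mirroring the get_close_matches example from the difflib documentation but keeping the scores (the word list is illustrative, not from the source project):

matches = get_scored_matches("appel", ["ape", "apple", "peach", "puppy"])
print(matches)  # [(0.8, 'apple'), (0.75, 'ape')]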
def build_token_counts(characterizer, texts):
    tokenizer = Tokenizer(characterizer=characterizer)
    tokenizer.train([t['text'] for t in texts])

    token_counts = Counter()
    seq_matcher = difflib.SequenceMatcher()

    for t in texts:
        t['tokens'] = tokenizer.tokenize(t['text'])
        if not t['tokens']:
            continue

        if 'urls' in t['entities'] and t['entities']['urls']:
            # TODO: replace those urls instead of adding them
            for url in t['entities']['urls']:
                t['tokens'].append(url['display_url'])

        if t['__is_rt__']:
            t['tokens'].append(u'@{0}'.format(t['user']['screen_name']).lower())

        token_counts.update(t['tokens'])

    return token_counts
def sededit(a, b, context=0):
    ''' Take two strings and output a sed-like diff '''
    if a == b:
        return ''

    a_len = len(a)
    b_len = len(b)
    start1, end1, start2, end2 = a_len, 0, b_len, 0

    s = difflib.SequenceMatcher(None, a, b)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == 'equal':
            continue
        elif tag == 'insert':
            ins = 1
        else:
            ins = 0
        start1 = max(min(i1 - context - ins, start1), 0)
        start2 = max(min(j1 - context - ins, start2), 0)
        end1 = min(max(i2 + context + ins, end1), a_len)
        end2 = min(max(j2 + context + ins, end2), b_len)

    return 's/%s%s%s/%s/' % (
        ('' if start1 else '^'),
        a[start1:end1],
        ('$' if end1 == a_len else ''),
        b[start2:end2])
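An illustrative call (not from the source project) showing the sed-like output; with context=0 only the changed span plus its anchors is kept:

print(sededit("color", "colour"))  # s/or$/our/
print(sededit("same", "same"))     # '' -- identical strings produce no edit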
def pick_best(title, item1, item2):
    """
    Pick best record among two items with identical scores.
    """
    def compare(x):
        return difflib.SequenceMatcher(None, title.lower(), x.lower()).ratio()

    if not item1['title']:
        return item2
    elif not item2['title']:
        return item1
    r1 = compare(item1['title'][0])
    r2 = compare(item2['title'][0])
    if r1 > r2:
        return item1
    elif r2 > r1:
        return item2
    else:
        # Try to find other discriminating criteria... e.g. prefer journal-articles
        if score_type(item1["type"]) > score_type(item2["type"]):
            return item1
        else:
            return item2
def get_initial_matches(self):
    """
    This does the main work of finding matching n-gram sequences between
    the texts.
    """
    sequence = SequenceMatcher(None, self.textAgrams, self.textBgrams)
    matchingBlocks = sequence.get_matching_blocks()

    # Only return the matching sequences that are higher than the
    # threshold given by the user.
    highMatchingBlocks = [match for match in matchingBlocks if match.size > self.threshold]

    numBlocks = len(highMatchingBlocks)
    if numBlocks > 0:
        print('%s total matches found.' % numBlocks, flush=True)

    return highMatchingBlocks
def render_diff(old_text, new_text):
    print(old_text, old_text.__class__)
    print(new_text, new_text.__class__)
    sm = difflib.SequenceMatcher(a=old_text, b=new_text)
    out_toks = []
    for opcode, s1, e1, s2, e2 in sm.get_opcodes():
        if opcode == 'equal':
            out_toks.append(old_text[s1:e1])
        elif opcode == 'insert':
            out_toks.append('<span class="insert">' + new_text[s2:e2] + '</span>')
        elif opcode == 'delete':
            out_toks.append('<span class="delete">' + old_text[s1:e1] + '</span>')
        elif opcode == 'replace':
            out_toks.append('<span class="delete">' + old_text[s1:e1] + '</span>')
            out_toks.append('<span class="insert">' + new_text[s2:e2] + '</span>')
    return ''.join(out_toks)
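An illustrative call (not from the source project); the two debug prints fire first, then the marked-up diff is returned:

html = render_diff("the cat sat", "the cat spat")
# html == 'the cat s<span class="insert">p</span>at'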
def validate(self, password, user=None):
    if not user:
        return

    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                try:
                    verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                except FieldDoesNotExist:
                    verbose_name = attribute_name
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def get_diff_lines(self):
    import difflib
    postdiffs = list()
    commentdiffs = list()
    s = difflib.SequenceMatcher(lambda x: x.isspace(), self.post.code, self.code)
    for o in s.get_opcodes():
        if o[0] in ('replace', 'delete'):
            postdiffs.append(('mod', s.a[o[1]:o[2]]))
        if o[0] in ('replace', 'insert'):
            commentdiffs.append(('mod', s.b[o[3]:o[4]]))
        if o[0] == 'equal':
            postdiffs.append(('eq', s.a[o[1]:o[2]]))
            commentdiffs.append(('eq', s.b[o[3]:o[4]]))
    self.__normalize__(postdiffs)
    self.__normalize__(commentdiffs)
    return (postdiffs, commentdiffs)
def get_custom_path(self, searchtitle, title):
    '''locate custom folder on disk as pvrart location'''
    title_path = ""
    custom_path = self._mutils.addon.getSetting("pvr_art_custom_path")
    if custom_path and self._mutils.addon.getSetting("pvr_art_custom") == "true":
        delim = "\\" if "\\" in custom_path else "/"
        dirs = xbmcvfs.listdir(custom_path)[0]
        for strictness in [1, 0.95, 0.9, 0.8]:
            if title_path:
                break
            for directory in dirs:
                if title_path:
                    break
                directory = directory.decode("utf-8")
                curpath = os.path.join(custom_path, directory) + delim
                for item in [title, searchtitle]:
                    match = SM(None, item, directory).ratio()
                    if match >= strictness:
                        title_path = curpath
                        break
        if not title_path and self._mutils.addon.getSetting("pvr_art_download") == "true":
            title_path = os.path.join(custom_path, normalize_string(title)) + delim
    return title_path
def ratio(s1, s2):
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    if len(s1) == 0 or len(s2) == 0:
        return 0

    m = SequenceMatcher(None, s1, s2)
    return utils.intr(100 * m.ratio())

# todo: skip duplicate indexes for a little more speed
def P_update_tree(self, user, archive_path):  # private, plex can't use _var
    """update the cache of the dir read state for everything between cb_path and archive_path."""
    Log.Debug('updating tree {}'.format(archive_path))
    base = Prefs['cb_path']
    x = difflib.SequenceMatcher(a=base, b=archive_path)
    for tag, i1, i2, j1, j2 in x.get_opcodes():
        if tag == 'insert':
            try:
                diff = os.path.split(archive_path[j1:j2])[0]
                d = diff.replace('\\', '/').split('/')[1]
                path = os.path.join(base, d)
                Log.Debug('archive root: {}'.format(path))
                if os.path.abspath(base) == os.path.abspath(path):
                    Log.Debug('item is in root dir. skipping.')
                else:
                    state = self.dir_read_state(user, path, True)
            except Exception as e:
                Log.Error('P_update_tree {}'.format(e))
            return
def reset(self):
    """
    Resets thread data model
    """
    self.disableStdOut = False
    self.hashDBCursor = None
    self.inTransaction = False
    self.lastComparisonPage = None
    self.lastComparisonHeaders = None
    self.lastErrorPage = None
    self.lastHTTPError = None
    self.lastRedirectMsg = None
    self.lastQueryDuration = 0
    self.lastRequestMsg = None
    self.lastRequestUID = 0
    self.lastRedirectURL = None
    self.resumed = False
    self.retriesCount = 0
    self.seqMatcher = difflib.SequenceMatcher(None)
    self.shared = shared
    self.valueStack = []
def get_relevant_entities(self, google_cloud_entities, target_entities, target_wikipedia_urls):
    entities_to_return = []
    target_wikipedia_urls_lower = [target_wikipedia_url.lower() for target_wikipedia_url in target_wikipedia_urls]
    for google_cloud_entity in google_cloud_entities:
        # Look at Wikipedia URLs
        if google_cloud_entity.wikipedia_url and google_cloud_entity.wikipedia_url.lower() in target_wikipedia_urls_lower:
            entities_to_return.append(google_cloud_entity.name)
            continue
        # Look at names
        a = google_cloud_entity.name.lower().split(" ")
        for target_entity in target_entities:
            b = target_entity.lower().split(" ")
            # stop once this entity's name has already been added
            if google_cloud_entity.name in entities_to_return:
                break
            for google_cloud_entity_part in a:
                for target_entity_part in b:
                    ratio = SequenceMatcher(None, google_cloud_entity_part, target_entity_part).ratio()
                    if ratio > 0.7:
                        entities_to_return.append(google_cloud_entity.name)
                        break
                if google_cloud_entity.name in entities_to_return:
                    break
    return entities_to_return
def closest_rule(self, adapter):
    def score_rule(rule):
        return sum([
            0.98 * difflib.SequenceMatcher(
                None, rule.endpoint, self.endpoint
            ).ratio(),
            0.01 * bool(set(self.values or ()).issubset(rule.arguments)),
            0.01 * bool(rule.methods and self.method in rule.methods)
        ])

    if adapter and adapter.map._rules:
        return max(adapter.map._rules, key=score_rule)
    else:
        return None
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()
def global_search(cls, text, limit, menu='ir.ui.menu'):
    """
    Search on models for text including menu
    Returns a list of tuple (ratio, model, model_name, id, name, icon)
    The size of the list is limited to limit
    """
    pool = Pool()
    ModelAccess = pool.get('ir.model.access')

    if not limit > 0:
        raise ValueError('limit must be > 0: %r' % (limit,))

    models = cls.search(['OR',
            ('global_search_p', '=', True),
            ('model', '=', menu),
            ])
    access = ModelAccess.get_access([m.model for m in models])
    s = StringMatcher()
    if isinstance(text, str):
        text = text.decode('utf-8')
    s.set_seq2(text)

    def generate():
        for model in models:
            if not access[model.model]['read']:
                continue
            Model = pool.get(model.model)
            if not hasattr(Model, 'search_global'):
                continue
            for record, name, icon in Model.search_global(text):
                if isinstance(name, str):
                    name = name.decode('utf-8')
                s.set_seq1(name)
                yield (s.ratio(), model.model, model.rec_name,
                    record.id, name, icon)

    return heapq.nlargest(int(limit), generate())
def match_user(slack_users, author_name, threshold=0.6):
    """
    Do a fuzzy match of author name to full name. If it matches, return a formatted Slack handle.
    Else return original full name.

    Args:
        slack_users (list of dict): A list of slack users from their API
        author_name (str): The commit author's full name
        threshold (float): All matches must be at least this high to pass.

    Returns:
        str: The slack markup for the handle of that author. If one can't be found,
            the author's name is returned unaltered.
    """
    lower_author_name = reformatted_full_name(author_name)

    def match_for_user(slack_user):
        """Get match ratio for slack user, or 0 if below threshold"""
        lower_name = reformatted_full_name(slack_user['profile']['real_name'])
        ratio = SequenceMatcher(a=lower_author_name, b=lower_name).ratio()
        if ratio >= threshold:
            return ratio
        else:
            return 0

    slack_matches = [(slack_user, match_for_user(slack_user)) for slack_user in slack_users]
    slack_matches = [(slack_user, match) for (slack_user, match) in slack_matches if match >= threshold]

    if len(slack_matches) > 0:
        matched_user = max(slack_matches, key=lambda pair: pair[1])[0]
        return "<@{id}>".format(id=matched_user['id'])
    else:
        return author_name
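A hypothetical invocation with data shaped like Slack's users.list response (reformatted_full_name is the project's own normalization helper, assumed here to lower-case the name):

slack_users = [
    {'id': 'U123', 'profile': {'real_name': 'Jane Doe'}},
    {'id': 'U456', 'profile': {'real_name': 'John Smith'}},
]
print(match_user(slack_users, 'Jane Doe'))     # <@U123>
print(match_user(slack_users, 'Nobody Here'))  # no match above 0.6, name returned unchanged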
def opcodes(self):
    sm = difflib.SequenceMatcher(None, self.target.active_uids, self.new_unit_list)
    return sm.get_opcodes()
def closest_rule(self, adapter):
    def _score_rule(rule):
        return sum([
            0.98 * difflib.SequenceMatcher(
                None, rule.endpoint, self.endpoint
            ).ratio(),
            0.01 * bool(set(self.values or ()).issubset(rule.arguments)),
            0.01 * bool(rule.methods and self.method in rule.methods)
        ])

    if adapter and adapter.map._rules:
        return max(adapter.map._rules, key=_score_rule)
def diff_text(a, b):
    s = SequenceMatcher(None, a, b)
    opcode = {
        'replace': lambda i1, i2, j1, j2: "<strike>%s</strike><strong>%s</strong>" % (a[i1:i2], b[j1:j2]),
        'delete': lambda i1, i2, j1, j2: "<strike>%s</strike>" % (a[i1:i2],),
        'insert': lambda i1, i2, j1, j2: "<strong>%s</strong>" % (b[j1:j2],),
        'equal': lambda i1, i2, j1, j2: a[i1:i2],
    }
    return safe("".join(opcode[tag](*args) for tag, *args in s.get_opcodes()))
def print_diffs(expected, actual):
    a = expected
    b = actual
    s = SequenceMatcher(None, a, b)
    print '\n'
    ctr = 0
    for block in s.get_matching_blocks():
        apos = block[0]
        bpos = block[1]
        aendpos = apos + block[2]
        bendpos = bpos + block[2]
        achunk = expected[apos:aendpos]
        bchunk = actual[bpos:bendpos]
        # print "a[%d] and b[%d] match for %d elements" % block
        print '\nACTUAL has matching Error at ' + str(aendpos)
        print 'Expected =' + expected[bendpos:bendpos+100] + '\nFound =' + actual[aendpos:aendpos+100]
        print 'Matched values from 0 to ' + str(aendpos-1) + ' are'
        print '  EXPECTED=' + bchunk
        print '  ACTUAL  =' + achunk
        print ''
        if ctr == 0:
            break
        else:
            ctr += 1

###########################################################################
## Unit Tests - OPML to MM conversions
###########################################################################
#
# These tests are designed to run in the local project folder opmltomm
def similar(self, a, b):
    return SequenceMatcher(None, a, b).ratio() > self.similarity_ratio
def get_best_similar(data):
    import difflib
    key, use_similar, similar_pool = data

    # try to find some close key in existing messages...
    # Optimized code inspired by difflib.get_close_matches (as we only need the best match).
    # We also never attempt a match when the length differs by more than -len_key / 2 or
    # +len_key * 2 (which is valid as long as use_similar is not below ~0.7).
    # Gives an overall ~20% improvement!
    #tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
    #if tmp:
    #    tmp = tmp[0]
    tmp = None
    s = difflib.SequenceMatcher()
    s.set_seq2(key[1])
    len_key = len(key[1])
    min_len = len_key // 2
    max_len = len_key * 2
    for x in similar_pool:
        if min_len < len(x) < max_len:
            s.set_seq1(x)
            if s.real_quick_ratio() >= use_similar and s.quick_ratio() >= use_similar:
                sratio = s.ratio()
                if sratio >= use_similar:
                    tmp = x
                    use_similar = sratio
    return key, tmp
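A hypothetical input tuple (in the original project, key is a (context, message) pair and similar_pool holds existing message strings):

key = (None, "Coloor")
print(get_best_similar((key, 0.75, ["Color", "Colour", "Scale"])))
# ((None, 'Coloor'), 'Color') -- the best ratio above the 0.75 cutoff wins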
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()
    #return Differ(None, a, b).ratio()
def ratio(s1, s2):
    s1, s2 = utils.make_type_consistent(s1, s2)

    m = SequenceMatcher(None, s1, s2)
    return utils.intr(100 * m.ratio())
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)

    if len(s1) <= len(s2):
        shorter = s1
        longer = s2
    else:
        shorter = s2
        longer = s1

    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    #   e.g. shorter = "abcd", longer = "XXXbcdeEEE"
    #   block = (1, 3, 3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]

        m2 = SequenceMatcher(None, shorter, long_substr)
        r = m2.ratio()
        if r > .995:
            return 100
        else:
            scores.append(r)

    return utils.intr(100 * max(scores))

##############################
# Advanced Scoring Functions #
##############################
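The alignment example in the partial_ratio comments above can be checked directly against the standard library (utils.intr and utils.make_type_consistent are the project's own helpers):

from difflib import SequenceMatcher

# The matching block Match(a=1, b=3, size=3) says shorter[1:4] == "bcd"
# lines up with longer[3:6].
print(SequenceMatcher(None, "abcd", "XXXbcdeEEE").get_matching_blocks())
# [Match(a=1, b=3, size=3), Match(a=4, b=10, size=0)]

# Aligning gives long_start = 3 - 1 = 2, so the candidate window is
# longer[2:6] == "Xbcd" and the best score is ratio("abcd", "Xbcd").
print(SequenceMatcher(None, "abcd", "Xbcd").ratio())  # 0.75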
def ratcliff_obershelp_similarity(a, b):
    """
    A kind of approximate string matching. Computes the generalized
    Ratcliff/Obershelp similarity of two strings as the number of matching
    characters divided by the total number of characters in the two strings.
    Matching characters are those in the longest common subsequence plus,
    recursively, matching characters in the unmatched region on either side
    of the longest common subsequence.
    """
    if a and b:
        return SequenceMatcher(None, a, b).ratio()
    else:
        return None