Python difflib module — SequenceMatcher() usage examples.
The following 50 code samples, extracted from open-source Python projects, illustrate how to use difflib.SequenceMatcher().
def search(searchTerm, list, keyName : str = None, numMatches : int = 3):
    """Fuzzy-search `list` for searchTerm and return the `numMatches` best hits.

    Each result is a dict {'Item': original item, 'Ratio': similarity in [0, 1]}.
    When `keyName` is given, items are treated as dicts and item[keyName] is
    compared; otherwise each item is compared directly as a string.
    Returns None when the list is empty.
    """
    if not list:
        return None
    # Score every item against the search term (case-insensitive).
    searchList = [
        {'Item': item,
         'Ratio': difflib.SequenceMatcher(
             None,
             searchTerm.lower(),
             (item[keyName] if keyName else item).lower()).ratio()}
        for item in list
    ]
    # Best matches first (the original comment about "population" was a
    # copy-paste leftover; the sort key is the similarity ratio).
    searchList.sort(key=lambda x: x['Ratio'], reverse=True)
    # Clamp numMatches to the number of items we actually have.
    numMatches = min(numMatches, len(searchList))
    return searchList[:numMatches]
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def filecompare( self, filename_left, filename_right ):
    """Diff two files (or pre-split line lists) into self.text_body.

    Each argument may be a filename or an already-split list of lines.
    Returns 1 on success, 0 when a file cannot be read.
    """
    # isinstance instead of type(...) == type([]) — also accepts list subclasses.
    if isinstance( filename_left, list ):
        lines_left = filename_left
    else:
        try:
            lines_left = wb_read_file.readFileContentsAsUnicode( filename_left ).split('\n')
        except IOError as e:
            print( 'Error opening %s\n%s' % (filename_left, e) )
            return 0
    if isinstance( filename_right, list ):
        lines_right = filename_right
    else:
        try:
            lines_right = wb_read_file.readFileContentsAsUnicode( filename_right ).split('\n')
        except IOError as e:
            print( 'Error opening %s\n%s' % (filename_right, e) )
            return 0
    # Strip trailing end-of-line characters before comparing.
    lines_left = [eolRemoval( line ) for line in lines_left]
    lines_right = [eolRemoval( line ) for line in lines_right]
    matcher = difflib.SequenceMatcher( isLineJunk, lines_left, lines_right )
    for tag, left_lo, left_hi, right_lo, right_hi in matcher.get_opcodes():
        if tag == 'replace':
            self.fancy_replace( lines_left, left_lo, left_hi, lines_right, right_lo, right_hi )
        elif tag == 'delete':
            self.dump( self.text_body.addDeletedLine, lines_left, left_lo, left_hi )
        elif tag == 'insert':
            self.dump( self.text_body.addInsertedLine, lines_right, right_lo, right_hi )
        elif tag == 'equal':
            self.dump( self.text_body.addNormalLine, lines_left, left_lo, left_hi )
        else:
            raise ValueError( 'unknown tag ' + str( tag ) )
    self.text_body.addEnd()
    return 1
# need to strip any \n or \r thats on the end of the line
def validate(self, password, user=None):
    """Raise ValidationError if the password resembles any user attribute."""
    if not user:
        return
    lowered_password = password.lower()
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Compare the whole value plus each word-like fragment of it.
        candidates = re.split(r'\W+', value) + [value]
        for candidate in candidates:
            matcher = SequenceMatcher(a=lowered_password, b=candidate.lower())
            if matcher.quick_ratio() < self.max_similarity:
                continue
            try:
                verbose_name = force_text(
                    user._meta.get_field(attribute_name).verbose_name)
            except FieldDoesNotExist:
                # Attribute without a model field: fall back to its name.
                verbose_name = attribute_name
            raise ValidationError(
                _("The password is too similar to the %(verbose_name)s."),
                code='password_too_similar',
                params={'verbose_name': verbose_name},
            )
def validate(self, password, user=None):
    """Raise ValidationError when the password is too close to a user attribute."""
    if not user:
        return
    password_lower = password.lower()
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Whole attribute value plus its word-like fragments.
        parts = re.split(r'\W+', value) + [value]
        for part in parts:
            similarity = SequenceMatcher(a=password_lower, b=part.lower()).quick_ratio()
            if similarity < self.max_similarity:
                continue
            try:
                verbose_name = force_text(
                    user._meta.get_field(attribute_name).verbose_name)
            except FieldDoesNotExist:
                # No model field backing this attribute: use the plain name.
                verbose_name = attribute_name
            raise ValidationError(
                _("The password is too similar to the %(verbose_name)s."),
                code='password_too_similar',
                params={'verbose_name': verbose_name},
            )
def get_matching_blocks(self):
    """Matching blocks filtered to those longer than a size-aware threshold.

    Bug fix: the size guard compared len(self.b) with itself; it now uses
    the shorter of the two sequences, as clearly intended.
    """
    size = min(len(self.a), len(self.b))
    threshold = min(self.threshold, size / 4)
    actual = difflib.SequenceMatcher.get_matching_blocks(self)
    # Keep blocks above the threshold, plus the zero-length terminator.
    return [item for item in actual
            if item[2] > threshold
            or not item[2]]
def get_scored_matches(word: str, possibilities: List[str], n: int=3, cutoff: float=0.6) -> List[Tuple[float, str]]:
    """Return up to n (ratio, candidate) pairs scoring at least cutoff, best first."""
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not (0.0 <= cutoff <= 1.0):
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    matcher: SequenceMatcher = SequenceMatcher()
    matcher.set_seq2(word)
    scored = []
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # Cheap upper-bound ratios first; the exact ratio only when both pass.
        if (matcher.real_quick_ratio() >= cutoff
                and matcher.quick_ratio() >= cutoff
                and matcher.ratio() >= cutoff):
            scored.append((matcher.ratio(), candidate))
    # Keep only the n best-scoring matches, highest ratio first.
    return heapq.nlargest(n, scored)
def build_token_counts(characterizer, texts):
    """Tokenize every text in-place and return a Counter of all tokens.

    Side effects: each dict in `texts` gains a 'tokens' list; URL display
    forms and (for retweets) the author's lowercased @handle are appended
    as extra tokens before counting.

    Fix: removed an unused difflib.SequenceMatcher local that was created
    but never referenced.
    """
    tokenizer = Tokenizer(characterizer=characterizer)
    tokenizer.train([t['text'] for t in texts])
    token_counts = Counter()
    for t in texts:
        t['tokens'] = tokenizer.tokenize(t['text'])
        if not t['tokens']:
            continue
        if 'urls' in t['entities'] and t['entities']['urls']:
            #TODO: replace those urls instead of adding them
            for url in t['entities']['urls']:
                t['tokens'].append(url['display_url'])
        if t['__is_rt__']:
            t['tokens'].append(u'@{0}'.format(t['user']['screen_name']).lower())
        token_counts.update(t['tokens'])
    return token_counts
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def sededit(a, b, context=0):
    '''
    Take two strings and output a sed-like diff
    '''
    if a == b:
        return ''
    len_a, len_b = len(a), len(b)
    # Track the smallest span in each string that covers every change.
    lo1, hi1, lo2, hi2 = len_a, 0, len_b, 0
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, a, b).get_opcodes():
        if tag == 'equal':
            continue
        # Pure insertions take one extra char of context so the target is unique.
        pad = context + (1 if tag == 'insert' else 0)
        lo1 = max(min(i1 - pad, lo1), 0)
        lo2 = max(min(j1 - pad, lo2), 0)
        hi1 = min(max(i2 + pad, hi1), len_a)
        hi2 = min(max(j2 + pad, hi2), len_b)
    anchor_start = '' if lo1 else '^'
    anchor_end = '$' if hi1 == len_a else ''
    return 's/%s%s%s/%s/' % (anchor_start, a[lo1:hi1], anchor_end, b[lo2:hi2])
def pick_best(title, item1, item2):
    """
    Pick best record among two items with identical scores.

    Prefers the item whose first title is closer to `title`; an item with no
    title loses outright. On a ratio tie, falls back to record type
    (e.g. prefer journal-articles).
    """
    def compare(x):
        # Case-insensitive similarity between the query title and a candidate.
        return difflib.SequenceMatcher(None, title.lower(), x.lower()).ratio()
    if not item1['title']:
        return item2
    elif not item2['title']:
        # Bug fix: this branch previously returned item2 even though item2
        # is the one without a title.
        return item1
    r1 = compare(item1['title'][0])
    r2 = compare(item2['title'][0])
    if r1 > r2:
        return item1
    elif r2 > r1:
        return item2
    else:
        # Try to find other discriminating criteria... e.g. prefer journal-articles
        if score_type(item1["type"]) > score_type(item2["type"]):
            return item1
        else:
            return item2
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def get_initial_matches(self):
    """
    This does the main work of finding matching n-gram sequences between
    the texts.
    """
    matcher = SequenceMatcher(None, self.textAgrams, self.textBgrams)
    all_blocks = matcher.get_matching_blocks()
    # Keep only sequences longer than the user-supplied threshold.
    high_matching_blocks = [
        block for block in all_blocks if block.size > self.threshold
    ]
    num_blocks = len(high_matching_blocks)
    if num_blocks > 0:
        print('%s total matches found.' % num_blocks, flush=True)
    return high_matching_blocks
def render_diff(old_text, new_text):
    """Render an HTML diff of old_text -> new_text.

    Unchanged text is emitted verbatim; insertions and deletions are wrapped
    in <span class="insert"> / <span class="delete"> elements.

    Fix: removed leftover debug print statements.
    NOTE(review): inputs are interpolated into HTML without escaping — do not
    feed untrusted text to this function without escaping it first.
    """
    sm = difflib.SequenceMatcher(a=old_text, b=new_text)
    out_toks = []
    for opcode, s1, e1, s2, e2 in sm.get_opcodes():
        if opcode == 'equal':
            out_toks.append(old_text[s1:e1])
        elif opcode == 'insert':
            out_toks.append('<span class="insert">' + new_text[s2:e2] + '</span>')
        elif opcode == 'delete':
            out_toks.append('<span class="delete">' + old_text[s1:e1] + '</span>')
        elif opcode == 'replace':
            out_toks.append('<span class="delete">' + old_text[s1:e1] + '</span>')
            out_toks.append('<span class="insert">' + new_text[s2:e2] + '</span>')
    return ''.join(out_toks)
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def validate(self, password, user=None):
    """Raise ValidationError when the password resembles a user attribute."""
    if not user:
        return
    lowered_password = password.lower()
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # The full attribute value plus each word-like fragment of it.
        fragments = re.split(r'\W+', value) + [value]
        for fragment in fragments:
            quick = SequenceMatcher(a=lowered_password, b=fragment.lower()).quick_ratio()
            if quick <= self.max_similarity:
                continue
            try:
                verbose_name = force_text(
                    user._meta.get_field(attribute_name).verbose_name)
            except FieldDoesNotExist:
                # Attribute without a backing model field: use its plain name.
                verbose_name = attribute_name
            raise ValidationError(
                _("The password is too similar to the %(verbose_name)s."),
                code='password_too_similar',
                params={'verbose_name': verbose_name},
            )
def get_diff_lines(self):
    """Split self.post.code vs self.code into parallel diff segment lists.

    Returns (postdiffs, commentdiffs): lists of (kind, text) tuples where
    kind is 'mod' for changed segments and 'eq' for unchanged ones.
    """
    import difflib
    postdiffs = list()
    commentdiffs = list()
    # Junk predicate: whitespace characters are treated as junk when matching.
    s = difflib.SequenceMatcher(lambda x: x.isspace(), self.post.code, self.code)
    for o in s.get_opcodes():
        # o = (tag, i1, i2, j1, j2); s.a is the post's code, s.b is self.code.
        if o[0] in ('replace','delete'):
            postdiffs.append(('mod', s.a[o[1]:o[2]]))
        if o[0] in ('replace','insert'):
            commentdiffs.append(('mod', s.b[o[3]:o[4]]))
        if o[0] == 'equal':
            postdiffs.append(('eq', s.a[o[1]:o[2]]))
            commentdiffs.append(('eq', s.b[o[3]:o[4]]))
    # NOTE(review): __normalize__ is name-mangled when defined inside a class;
    # presumably it cleans/coalesces the segment lists in place — confirm in
    # the enclosing class.
    self.__normalize__(postdiffs)
    self.__normalize__(commentdiffs)
    return (postdiffs,commentdiffs)
def get_custom_path(self, searchtitle, title):
    '''locate custom folder on disk as pvrart location'''
    # Returns the matched directory path (with trailing delimiter) or "".
    title_path = ""
    custom_path = self._mutils.addon.getSetting("pvr_art_custom_path")
    if custom_path and self._mutils.addon.getSetting("pvr_art_custom") == "true":
        # Use backslashes when the configured path looks like Windows/SMB.
        delim = "\\" if "\\" in custom_path else "/"
        dirs = xbmcvfs.listdir(custom_path)[0]
        # Try progressively looser fuzzy-match thresholds until a directory hits.
        for strictness in [1, 0.95, 0.9, 0.8]:
            if title_path:
                break
            for directory in dirs:
                if title_path:
                    break
                # NOTE(review): decode assumes bytes directory names (Kodi on
                # Python 2); under Python 3 listdir yields str — confirm.
                directory = directory.decode("utf-8")
                curpath = os.path.join(custom_path, directory) + delim
                # Match either the display title or the search title.
                for item in [title, searchtitle]:
                    # SM: difflib.SequenceMatcher alias from the import site.
                    match = SM(None, item, directory).ratio()
                    if match >= strictness:
                        title_path = curpath
                        break
        if not title_path and self._mutils.addon.getSetting("pvr_art_download") == "true":
            # No existing folder matched: build a new one named after the title.
            title_path = os.path.join(custom_path, normalize_string(title)) + delim
    return title_path
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def ratio(s1, s2):
    """Return the SequenceMatcher similarity of s1 and s2 as an int 0-100."""
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    # An empty string cannot be meaningfully compared.
    if len(s1) == 0 or len(s2) == 0:
        return 0
    matcher = SequenceMatcher(None, s1, s2)
    return utils.intr(100 * matcher.ratio())
# todo: skip duplicate indexes for a little more speed
def P_update_tree(self, user, archive_path): # private, plex can't use _var
    """update the cache of the dir read state for everything between cb_path and archive_path."""
    Log.Debug('updating tree {}'.format(archive_path))
    # Configured comic-library root (Plex plugin preference).
    base = Prefs['cb_path']
    # Diff the root against the full archive path; the 'insert' opcode spans
    # the path suffix that lies below the root.
    x = difflib.SequenceMatcher(a=base, b=archive_path)
    for tag, i1, i2, j1, j2 in x.get_opcodes():
        if tag == 'insert':
            try:
                # Directory part of the inserted suffix, e.g. "/series/vol1".
                diff = os.path.split(archive_path[j1:j2])[0]
                # First path component below the root (separators normalized).
                d = diff.replace('\\', '/').split('/')[1]
                path = os.path.join(base, d)
                Log.Debug('archive root: {}'.format(path))
                if os.path.abspath(base) == os.path.abspath(path):
                    Log.Debug('item is in root dir. skipping.')
                else:
                    # NOTE(review): return value unused; presumably called for
                    # its cache-refresh side effect — confirm dir_read_state.
                    state = self.dir_read_state(user, path, True)
            except Exception as e:
                Log.Error('P_update_tree {}'.format(e))
    return
def validate(self, password, user=None):
    """Reject passwords too similar to any of the user's attribute values.

    Splits each configured user attribute on non-word characters and compares
    every fragment (plus the whole value) against the password; raises
    ValidationError when the quick similarity ratio exceeds max_similarity.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def reset(self):
    """
    Resets thread data model
    """
    # Scalar defaults for the per-thread bookkeeping fields.
    defaults = {
        'disableStdOut': False,
        'hashDBCursor': None,
        'inTransaction': False,
        'lastComparisonPage': None,
        'lastComparisonHeaders': None,
        'lastErrorPage': None,
        'lastHTTPError': None,
        'lastRedirectMsg': None,
        'lastQueryDuration': 0,
        'lastRequestMsg': None,
        'lastRequestUID': 0,
        'lastRedirectURL': None,
        'resumed': False,
        'retriesCount': 0,
    }
    for name, value in defaults.items():
        setattr(self, name, value)
    # Fresh matcher and container state; must not be carried across resets.
    self.seqMatcher = difflib.SequenceMatcher(None)
    self.shared = shared
    self.valueStack = []
def get_relevant_entities(self, google_cloud_entities, target_entities, target_wikipedia_urls):
    """Return names of Google Cloud entities that match a target.

    An entity matches when its Wikipedia URL equals one of the target URLs
    (case-insensitive), or when any word of its name fuzzy-matches
    (ratio > 0.7) any word of any target entity name. The name-matching
    path appends each entity's name at most once.
    """
    entities_to_return = []
    target_wikipedia_urls_lower = [target_wikipedia_url.lower() for target_wikipedia_url in target_wikipedia_urls]
    for google_cloud_entity in google_cloud_entities:
        # Look at Wikipedia URLs
        if google_cloud_entity.wikipedia_url and google_cloud_entity.wikipedia_url.lower() in target_wikipedia_urls_lower:
            entities_to_return.append(google_cloud_entity.name)
            continue
        # Look at names
        a = google_cloud_entity.name.lower().split(" ")
        for target_entity in target_entities:
            b = target_entity.lower().split(" ")
            # Bug fix: the result list holds *names*, so membership must test
            # the name (the old code tested the entity object, which never
            # matched, so one entity could be appended once per target).
            if google_cloud_entity.name in entities_to_return:
                break
            for google_cloud_entity_part in a:
                for target_entity_part in b:
                    ratio = SequenceMatcher(None, google_cloud_entity_part, target_entity_part).ratio()
                    if ratio > 0.7:
                        entities_to_return.append(google_cloud_entity.name)
                        break
                if google_cloud_entity.name in entities_to_return:
                    break
    return entities_to_return
def closest_rule(self, adapter):
    """Return the registered rule most similar to this request, or None."""
    def score_rule(rule):
        # Endpoint similarity dominates; argument and method fit break ties.
        similarity = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        arg_fit = bool(set(self.values or ()).issubset(rule.arguments))
        method_fit = bool(rule.methods and self.method in rule.methods)
        return 0.98 * similarity + 0.01 * arg_fit + 0.01 * method_fit
    if not adapter or not adapter.map._rules:
        return None
    return max(adapter.map._rules, key=score_rule)
def similar(a, b):
    """Similarity ratio in [0, 1] between sequences a and b."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def closest_rule(self, adapter):
    """Return the known rule whose endpoint best matches this request."""
    def score_rule(rule):
        # Weighted score: endpoint text similarity plus small tie-breakers
        # for argument coverage and method compatibility.
        endpoint_sim = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        covers_args = bool(set(self.values or ()).issubset(rule.arguments))
        accepts_method = bool(rule.methods and self.method in rule.methods)
        return 0.98 * endpoint_sim + 0.01 * covers_args + 0.01 * accepts_method
    if not (adapter and adapter.map._rules):
        return None
    return max(adapter.map._rules, key=score_rule)
def global_search(cls, text, limit, menu='ir.ui.menu'):
    """
    Search on models for text including menu
    Returns a list of tuple (ratio, model, model_name, id, name, icon)
    The size of the list is limited to limit
    """
    pool = Pool()
    ModelAccess = pool.get('ir.model.access')
    if not limit > 0:
        raise ValueError('limit must be > 0: %r' % (limit,))
    # Search both globally-searchable models and the menu model itself.
    models = cls.search(['OR',
            ('global_search_p', '=', True),
            ('model', '=', menu),
            ])
    access = ModelAccess.get_access([m.model for m in models])
    # StringMatcher: SequenceMatcher-compatible matcher; query is seq2 so
    # only seq1 has to be reset per candidate.
    s = StringMatcher()
    # NOTE(review): str.decode only exists on Python 2 byte strings; under
    # Python 3 this would raise AttributeError — confirm target version.
    if isinstance(text, str):
        text = text.decode('utf-8')
    s.set_seq2(text)
    def generate():
        for model in models:
            # Skip models the current user cannot read.
            if not access[model.model]['read']:
                continue
            Model = pool.get(model.model)
            if not hasattr(Model, 'search_global'):
                continue
            for record, name, icon in Model.search_global(text):
                if isinstance(name, str):
                    name = name.decode('utf-8')
                s.set_seq1(name)
                # Rank every hit by its similarity to the query text.
                yield (s.ratio(), model.model, model.rec_name,
                    record.id, name, icon)
    return heapq.nlargest(int(limit), generate())
def closest_rule(self, adapter):
    """Find the rule closest to this request's endpoint; None if no rules."""
    def score_rule(rule):
        # Mostly endpoint similarity; argument/method fit nudge the score.
        sim = difflib.SequenceMatcher(None, rule.endpoint, self.endpoint).ratio()
        args_ok = bool(set(self.values or ()).issubset(rule.arguments))
        method_ok = bool(rule.methods and self.method in rule.methods)
        return 0.98 * sim + 0.01 * args_ok + 0.01 * method_ok
    if not adapter or not adapter.map._rules:
        return None
    return max(adapter.map._rules, key=score_rule)
def match_user(slack_users, author_name, threshold=0.6):
    """
    Do a fuzzy match of author name to full name. If it matches, return a formatted Slack handle. Else return original
    full name.
    Args:
        slack_users (list of dict): A list of slack users from their API
        author_name (str): The commit author's full name
        threshold (float): All matches must be at least this high to pass.
    Returns:
        str: The slack markup for the handle of that author.
        If one can't be found, the author's name is returned unaltered.
    """
    normalized_author = reformatted_full_name(author_name)

    def score(slack_user):
        """Similarity to the author's name, or 0 when below the threshold."""
        normalized_name = reformatted_full_name(slack_user['profile']['real_name'])
        similarity = SequenceMatcher(a=normalized_author, b=normalized_name).ratio()
        return similarity if similarity >= threshold else 0

    candidates = [(slack_user, score(slack_user)) for slack_user in slack_users]
    candidates = [pair for pair in candidates if pair[1] >= threshold]
    if candidates:
        best_user = max(candidates, key=lambda pair: pair[1])[0]
        return "<@{id}>".format(id=best_user['id'])
    return author_name
def opcodes(self):
    """Diff opcodes transforming the target's active uids into the new unit list."""
    matcher = difflib.SequenceMatcher(
        None, self.target.active_uids, self.new_unit_list)
    return matcher.get_opcodes()
def closest_rule(self, adapter):
    """Return the best-matching rule; None when the adapter has no rules."""
    def _score_rule(rule):
        # Endpoint similarity dominates; argument/method fit break ties.
        similarity = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        arg_fit = bool(set(self.values or ()).issubset(rule.arguments))
        method_fit = bool(rule.methods and self.method in rule.methods)
        return 0.98 * similarity + 0.01 * arg_fit + 0.01 * method_fit
    if not adapter or not adapter.map._rules:
        return None
    return max(adapter.map._rules, key=_score_rule)
def closest_rule(self, adapter):
    """Return the rule most like this request's endpoint (None if no rules)."""
    def _score_rule(rule):
        # Weighted combination: mostly endpoint text similarity, with small
        # bonuses for argument coverage and method compatibility.
        endpoint_sim = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        covers_args = bool(set(self.values or ()).issubset(rule.arguments))
        accepts_method = bool(rule.methods and self.method in rule.methods)
        return 0.98 * endpoint_sim + 0.01 * covers_args + 0.01 * accepts_method
    if not (adapter and adapter.map._rules):
        return None
    return max(adapter.map._rules, key=_score_rule)
def closest_rule(self, adapter):
    """Return the registered rule most similar to this request, or None."""
    def score_rule(rule):
        # Endpoint similarity dominates; argument and method fit break ties.
        similarity = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        arg_fit = bool(set(self.values or ()).issubset(rule.arguments))
        method_fit = bool(rule.methods and self.method in rule.methods)
        return 0.98 * similarity + 0.01 * arg_fit + 0.01 * method_fit
    if not adapter or not adapter.map._rules:
        return None
    return max(adapter.map._rules, key=score_rule)
def closest_rule(self, adapter):
    """Pick the rule whose endpoint looks most like this request's, or None."""
    def score_rule(rule):
        # Similarity of endpoints is the main signal; tiny bonuses reward
        # rules that cover the supplied arguments and accept the method.
        sim = difflib.SequenceMatcher(None, rule.endpoint, self.endpoint).ratio()
        args_ok = bool(set(self.values or ()).issubset(rule.arguments))
        method_ok = bool(rule.methods and self.method in rule.methods)
        return 0.98 * sim + 0.01 * args_ok + 0.01 * method_ok
    if not (adapter and adapter.map._rules):
        return None
    return max(adapter.map._rules, key=score_rule)
def closest_rule(self, adapter):
    """Return the best-scoring rule for this request, or None without rules."""
    def score_rule(rule):
        # Score = endpoint similarity (weight .98) + argument fit (.01)
        # + method fit (.01).
        endpoint_sim = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        covers_args = bool(set(self.values or ()).issubset(rule.arguments))
        accepts_method = bool(rule.methods and self.method in rule.methods)
        return 0.98 * endpoint_sim + 0.01 * covers_args + 0.01 * accepts_method
    if not adapter or not adapter.map._rules:
        return None
    return max(adapter.map._rules, key=score_rule)
def diff_text(a, b):
    """Render the diff from a to b as HTML (<strike> deletions, <strong> insertions)."""
    matcher = SequenceMatcher(None, a, b)
    # One renderer per opcode tag; each receives the opcode's four indices.
    renderers = {
        'replace': lambda i1, i2, j1, j2: "<strike>%s</strike><strong>%s</strong>" % (a[i1:i2], b[j1:j2]),
        'delete': lambda i1, i2, j1, j2: "<strike>%s</strike>" % (a[i1:i2], ),
        'insert': lambda i1, i2, j1, j2: "<strong>%s</strong>" % (b[j1:j2], ),
        'equal': lambda i1, i2, j1, j2: a[i1:i2],
    }
    pieces = [renderers[tag](*args) for tag, *args in matcher.get_opcodes()]
    return safe("".join(pieces))
def print_diffs(expected,actual):
    # Python 2 helper: report where `actual` first diverges from `expected`.
    a=expected
    b=actual
    s = SequenceMatcher(None,a,b)
    print '\n'
    ctr=0
    for block in s.get_matching_blocks():
        # block = (i, j, size): a[i:i+size] == b[j:j+size]
        # NOTE(review): bpos reuses block[0]; block[1] is the b-side index —
        # looks like a bug unless both inputs are expected to stay aligned.
        apos=block[0]
        bpos=block[0]
        aendpos=apos+block[2]
        bendpos=bpos+block[2]
        achunk=expected[apos:aendpos]
        bchunk=actual[bpos:bendpos]
        # print "a[%d] and b[%d] match for %d elements" % block
        # NOTE(review): the EXPECTED/ACTUAL labels below appear swapped
        # relative to the chunks they print (achunk comes from `expected`).
        print '\nACTUAL has matching Error at '+str(aendpos)
        print 'Expected ='+expected[bendpos:bendpos+100]+'\nFound ='+actual[aendpos:aendpos+100]
        print 'Matched values from 0 to '+str(aendpos-1)+' are'
        print ' EXPECTED='+bchunk
        print ' ACTUAL ='+achunk
        print ''
        # ctr starts at 0, so only the first matching block is ever reported.
        if ctr==0:
            break
        else:
            ctr+=1
###########################################################################
## Unit Tests - OPML to MM conversions
###########################################################################
#
# These tests are designed to run in the local project folder opmltomm
def similar(self, a, b):
    """True when a and b are more similar than the configured ratio."""
    score = SequenceMatcher(None, a, b).ratio()
    return score > self.similarity_ratio
def similar(a, b):
    """Return the difflib similarity ratio (0..1) between a and b."""
    return SequenceMatcher(a=a, b=b).ratio()
def get_best_similar(data):
    """Return (key, best) where best is the closest pool string or None.

    Scans similar_pool for the entry most similar to key[1]. Only candidates
    whose length lies strictly within (len/2, len*2) are considered — a valid
    shortcut as long as the cutoff stays above ~0.7 — and the cutoff is raised
    to each new best ratio found, so later candidates must beat it.
    """
    import difflib
    key, use_similar, similar_pool = data
    best_match = None
    matcher = difflib.SequenceMatcher()
    matcher.set_seq2(key[1])
    key_len = len(key[1])
    lower, upper = key_len // 2, key_len * 2
    for candidate in similar_pool:
        if not (lower < len(candidate) < upper):
            continue
        matcher.set_seq1(candidate)
        # Cheap upper-bound ratios first; compute the exact ratio only when
        # both quick estimates clear the bar.
        if (matcher.real_quick_ratio() >= use_similar
                and matcher.quick_ratio() >= use_similar):
            exact = matcher.ratio()
            if exact >= use_similar:
                best_match = candidate
                use_similar = exact
    return key, best_match
def similar(a, b):
    """Similarity ratio between a and b using difflib's SequenceMatcher."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def closest_rule(self, adapter):
    """Return the best-matching rule; None when the adapter has no rules."""
    def _score_rule(rule):
        # Endpoint similarity carries nearly all the weight; argument and
        # method compatibility act as tie-breakers.
        similarity = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        arg_fit = bool(set(self.values or ()).issubset(rule.arguments))
        method_fit = bool(rule.methods and self.method in rule.methods)
        return 0.98 * similarity + 0.01 * arg_fit + 0.01 * method_fit
    if not adapter or not adapter.map._rules:
        return None
    return max(adapter.map._rules, key=_score_rule)
def ratio(s1, s2):
    """Similarity of s1 and s2 as an integer percentage (0-100)."""
    s1, s2 = utils.make_type_consistent(s1, s2)
    matcher = SequenceMatcher(None, s1, s2)
    return utils.intr(100 * matcher.ratio())
def partial_ratio(s1, s2):
    """"Return the ratio of the most similar substring
    as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)
    # Compare the shorter string against same-length windows of the longer.
    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1
    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()
    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    # e.g. shorter = "abcd", longer = XXXbcdeEEE
    # block = (1,3,3)
    # best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        # Align a window of the longer string with the start of the shorter.
        long_start = max(block[1] - block[0], 0)
        long_substr = longer[long_start:long_start + len(shorter)]
        window_ratio = SequenceMatcher(None, shorter, long_substr).ratio()
        if window_ratio > .995:
            # Near-perfect window: no better score is possible.
            return 100
        scores.append(window_ratio)
    return utils.intr(100 * max(scores))
##############################
# Advanced Scoring Functions #
##############################
def closest_rule(self, adapter):
    """Return the registered rule most similar to this request, or None."""
    def score_rule(rule):
        # Endpoint similarity dominates the score; argument coverage and
        # method compatibility contribute small tie-breaking bonuses.
        endpoint_sim = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        covers_args = bool(set(self.values or ()).issubset(rule.arguments))
        accepts_method = bool(rule.methods and self.method in rule.methods)
        return 0.98 * endpoint_sim + 0.01 * covers_args + 0.01 * accepts_method
    if not (adapter and adapter.map._rules):
        return None
    return max(adapter.map._rules, key=score_rule)
def get_matching_blocks(self):
    """Matching blocks filtered to those longer than a size-aware threshold.

    Bug fix: the size guard compared len(self.b) with itself; it now uses
    the shorter of the two sequences, as clearly intended.
    """
    size = min(len(self.a), len(self.b))
    threshold = min(self.threshold, size / 4)
    actual = difflib.SequenceMatcher.get_matching_blocks(self)
    # Keep blocks above the threshold, plus the zero-length terminator.
    return [item for item in actual
            if item[2] > threshold
            or not item[2]]
def ratcliff_obershelp_similarity(a, b):
    """
    A kind of approximate string matching.
    Computes the generalized Ratcliff/Obershelp similarity of two strings
    as the number of matching characters divided by the total number of characters in the two strings.
    Matching characters are those in the longest common subsequence plus,
    recursively matching characters in the unmatched region on either side of the longest common subsequence.
    """
    # Empty or missing input has no defined similarity.
    if not a or not b:
        return None
    return SequenceMatcher(None, a, b).ratio()