Python editdistance module: eval() usage examples
The following code examples, collected from open-source Python projects, illustrate how to use editdistance.eval().
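As a quick orientation before the project examples, here is a minimal sketch of the call itself: editdistance.eval() takes two sequences (strings, or lists of hashable items such as words) and returns their Levenshtein distance as an integer.
import editdistance

# Three single-character edits turn 'kitten' into 'sitting'.
assert editdistance.eval('kitten', 'sitting') == 3

# The arguments can also be token lists, which several examples below use
# for word-level edit distance (e.g. word error rate): one deletion here.
assert editdistance.eval(['the', 'black', 'cat'], ['the', 'cat']) == 1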
def show_edit_distance(self, num):
    num_left = num
    mean_norm_ed = 0.0
    mean_ed = 0.0
    while num_left > 0:
        word_batch = next(self.text_img_gen)[0]
        num_proc = min(word_batch['the_input'].shape[0], num_left)
        decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
        for j in range(0, num_proc):
            edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
            mean_ed += float(edit_dist)
            mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
        num_left -= num_proc
    mean_norm_ed = mean_norm_ed / num
    mean_ed = mean_ed / num
    print('\nOut of %d samples: Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
          % (num, mean_ed, mean_norm_ed))
def total_distance(observed_sentence, corrected_sentence):
    """Calculates the total distance between the two given sentences.

    Args:
        observed_sentence: Observed sentence.
        corrected_sentence: Corrected sentence.

    Returns:
        Total Levenshtein distance between the two sentences.
    """
    total_distance = 0
    observed_words = list(observed_sentence)
    corrected_words = list(corrected_sentence)
    for i in range(len(observed_words)):
        comparable_words = observed_words[i], corrected_words[i]
        total_distance += editdistance.eval(*comparable_words)
    return total_distance
def similarities(self):
    """
    Compute Levenshtein distance matrix between files (implemented in C++ pip package: editdistance)
    Later: https://docs.python.org/2/library/difflib.html
    :return:
    """
    ucos = sorted(self.filedb.keys())
    sims = {}
    for idx, uco in enumerate(ucos):
        logger.info('Comparing %s...' % uco)
        sims[uco] = {}
        for idx2, uco2 in enumerate(ucos[idx+1:]):
            dist = editdistance.eval(self.file_data[uco], self.file_data[uco2])
            sims[uco][uco2] = dist
            logger.info(' %6d vs %6d : %4d %s %s' % (uco, uco2, dist, self.filedb[uco], self.filedb[uco2]))
def best_match(word, corrected_med_list, corrected_english_list):
    min_dist_med = len(word)
    best_med_word = ''
    min_dist_eng = len(word)
    best_eng_word = ''
    for word_t in corrected_med_list:
        if editdistance.eval(word, word_t) < min_dist_med:
            min_dist_med = editdistance.eval(word, word_t)
            best_med_word = word_t
    for word_t in corrected_english_list:
        if editdistance.eval(word, word_t) < min_dist_eng:
            min_dist_eng = editdistance.eval(word, word_t)
            best_eng_word = word_t
    if min_dist_med <= min_dist_eng:
        return best_med_word
    else:
        return best_eng_word
def compare_strings_by_edit_distance(first=None, second=None):
    """
    Get the edit distance between the two strings passed to this method.
    :param first: The first string to compare.
    :param second: The second string to compare.
    :return: A number representing the edit distance between the two strings passed
        as arguments to this method.
    """
    return editdistance.eval(first, second)
def simscore(a1, b1):
    max_len = max([len(a1), len(b1)])
    if max_len == 0:
        return 0
    dist = editdistance.eval(a1, b1)
    if dist > max_len:
        print(dist)
    return 1.0 - (float(dist) / float(max_len))
def similarity(a1, b1):
    max_len = max([len(a1), len(b1)])
    if max_len == 0:
        return 0
    dist = editdistance.eval(a1, b1)
    return 1.0 - (float(dist) / float(max_len))
def letter_error_count(self) -> float:
    return editdistance.eval(self.expected, self.predicted)
def word_error_count(self) -> float:
    return editdistance.eval(self.expected_words, self.predicted.split())
def getEditDistanceMat(gtTranscriptions, sampleTranscriptions):
    outputShape = [len(gtTranscriptions), len(sampleTranscriptions)]
    distMat = np.empty(outputShape)
    maxSizeMat = np.empty(outputShape)
    for gtNum in range(len(gtTranscriptions)):
        for sampleNum in range(len(sampleTranscriptions)):
            distMat[gtNum, sampleNum] = editdistance.eval(gtTranscriptions[gtNum], sampleTranscriptions[sampleNum])
            maxSizeMat[gtNum, sampleNum] = max(len(gtTranscriptions[gtNum]), len(sampleTranscriptions[sampleNum]))
    return distMat / maxSizeMat, distMat
def _normalized_edit_dist(s1, s2):
    return float(editdistance.eval(s1, s2)) / max(len(s1), len(s2), 1)
def compare_cc_list_levenshtein(sample, ref):
    """
    Compares the cyclomatic complexity values of all functions in `sample`
    with those of all functions in `ref`, by taking the Levenshtein distance
    between these lists. This detects added/removed functions and functions
    that have changed in complexity between a sample and a reference.
    """
    if hasattr(ref, 'cclist') and ref.cclist is not None:
        ratio = 1 - (editdistance.eval(sample.cclist, ref.cclist)
                     / float(max(len(sample.cclist), len(ref.cclist))))
    else:
        ratio = 0.0
    return (ratio * 100, ref.name, ref.version)
def annotate(self, tokens):
    X_focus = self.preprocessor.transform(tokens=tokens)['X_focus']
    X_context = self.pretrainer.transform(tokens=tokens)

    # get predictions:
    new_in = {}
    if self.include_token:
        new_in['focus_in'] = X_focus
    if self.include_context:
        new_in['context_in'] = X_context
    preds = self.model.predict(new_in)

    if isinstance(preds, np.ndarray):
        preds = [preds]

    annotation_dict = {'tokens': tokens}
    if self.include_lemma:
        pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx])
        annotation_dict['lemmas'] = pred_lemmas
        if self.postcorrect:
            for i in range(len(pred_lemmas)):
                if pred_lemmas[i] not in self.known_lemmas:
                    pred_lemmas[i] = min(self.known_lemmas,
                                         key=lambda x: editdistance.eval(x, pred_lemmas[i]))
            annotation_dict['postcorrect_lemmas'] = pred_lemmas

    if self.include_pos:
        pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx])
        annotation_dict['pos'] = pred_pos

    if self.include_morph:
        pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx])
        annotation_dict['morph'] = pred_morph

    return annotation_dict
def searchPackages(name):
    results = loadJson('https://www.archlinux.org/packages/search/json/?q=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['pkgname']))[:100]
    packages = [parsePackage(package, name) for package in results if package['arch'] in (arch, 'any')]
    results = loadJson('https://aur.archlinux.org/rpc/?v=5&type=search&arg=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['Name']))[:100]
    packages += [parsePackage(package, name) for package in results]
    packages = sorted(packages, key=lambda x: levdist(name, x[0]))[:100]
    return packages
def set_trimming(self, u, t, use_edit_distance=True):
    untrimmed = u.query_sequence.upper()
    untrimmed_len = len(untrimmed)
    trimmed = t.query_sequence.upper()
    trimmed_len = len(trimmed)
    trimmed_front = 0 if use_edit_distance else -1
    if use_edit_distance and (untrimmed_len > trimmed_len):
        for i in range(untrimmed_len - trimmed_len + 1):
            if untrimmed[i:(i+trimmed_len)] == trimmed:
                trimmed_front = i
                break
        else:
            # Since Skewer performs automatic error correction, the trimmed and
            # untrimmed reads may not match, so in that case we find the closest
            # match by Levenshtein distance.
            dist = None
            for i in range(untrimmed_len - trimmed_len + 1):
                d = editdistance.eval(untrimmed[i:(i+trimmed_len)], trimmed)
                if not dist:
                    dist = d
                elif d < dist:
                    trimmed_front = i
                    dist = d
    self.trimmed_front = trimmed_front
    self.trimmed_back = untrimmed_len - (trimmed_len + trimmed_front)
def edit(seq1, seq2):
    """
    Wrapper around editdistance.eval for fast Levenshtein
    distance computation.

    Args:
        seq1 (str): Reference sequence
        seq2 (str): Sequence to compare

    Examples:
        >>> edit('banana', 'bahama')
        2
    """
    return int(ed.eval(seq1, seq2))
def edit_distance(train_in, test_in, qcolumns=['question1', 'question2'], append=''):
    train = train_in.copy().loc[:, qcolumns]
    test = test_in.copy().loc[:, qcolumns]

    import editdistance

    def my_fun(row, qcolumns):
        return editdistance.eval(row[qcolumns[0]], row[qcolumns[1]])

    key = 'edit_dist' + append
    train[key] = train.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)
    test[key] = test.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)
    return (train, test)
def bestNameDiff(profileone, profiletwo):
    """ Applies Levenshtein distance between best names of two profiles."""
    n1 = profileone.bestname()
    n2 = profiletwo.bestname()
    if (not n1) or (not n2):
        return 0
    l1 = profileone.name_length
    l2 = profiletwo.name_length
    diff = editdistance.eval(n1, n2)
    return 1 - (diff / (l1 if l1 > l2 else l2))
def string_sim(n1, n2):
    """ Applies Levenshtein distance between strings."""
    if (not n1) or (not n2):
        return 0
    l1 = len(n1)
    l2 = len(n2)
    diff = editdistance.eval(n1, n2)
    return 1 - (diff / (l1 if l1 > l2 else l2))
def collect_file_paths(path, gene_file):
    genes_of_interest = []
    for line in open(gene_file):
        genes_of_interest.append(line.strip())
    isoform_list = []
    gene_read_counter = {}
    isoform_read_counter = {}
    for gene in genes_of_interest:
        gene_read_counter[gene] = 0
        for file1 in sorted(os.listdir(path+'/parsed_reads')):
            if gene in file1:
                file2 = file1+'_sub'
                out_sub = open(path+'/parsed_reads/'+file2, 'w')
                counter = 0
                isoform_reads = read_fasta(path+'/parsed_reads/'+file1)
                isoform_read_list = list(isoform_reads.keys())
                print(gene_read_counter, gene_read_counter[gene], len(isoform_reads.keys()))
                gene_read_counter[gene] += len(isoform_reads.keys())
                isoform_read_counter[path+'/parsed_reads/'+file2] = len(isoform_reads.keys())
                read1 = isoform_read_list[0]
                out_sub.write('>'+read1+'\n'+isoform_reads[read1]+'\n')
                for read2 in isoform_read_list[1::]:
                    if counter < subsample:
                        out_sub.write('>'+read2+'\n')
                        dist_1 = editdistance.eval(isoform_reads[read1], isoform_reads[read2])**2/float(len(isoform_reads[read1])*len(isoform_reads[read2]))
                        dist_2 = editdistance.eval(isoform_reads[read1], reverse_complement(isoform_reads[read2]))**2/float(len(isoform_reads[read1])*len(isoform_reads[read2]))
                        if dist_1 < dist_2:
                            out_sub.write(isoform_reads[read2]+'\n')
                        else:
                            out_sub.write(reverse_complement(isoform_reads[read2])+'\n')
                        counter += 1
                isoform_list.append((path+'/parsed_reads/'+file2, gene))
    return isoform_list, gene_read_counter, isoform_read_counter
def test_simulate_sequencing_errors(self):
    """Test function simulating sequencing errors."""
    error_rate = 0.1
    error_weights = {'substitution': 1.0 / 6,
                     'insertion': 1.0 / 6,
                     'deletion': 4.0 / 6}
    sequence = sim_seq.simulate_sequence(5000)
    mutated_record = sim_seq.simulate_sequencing_errors(
        sequence, error_rate, error_weights)
    distance = editdistance.eval(sequence, mutated_record.seq)
    expected_errors = len(sequence) * error_rate
    errors_sd = np.sqrt(len(sequence) * error_rate * (1 - error_rate))
    # Should pass 0.9973 proportion of cases:
    self.assertTrue(expected_errors - errors_sd * 3 < distance < expected_errors +
                    errors_sd * 3, msg="expected: {} realised: {}".format(expected_errors, distance))
def show_edit_distance(self, num):
    num_left = num
    mean_norm_ed = 0.0
    mean_ed = 0.0
    wrong = 0
    right = 0
    while num_left > 0:
        word_batch = next(self.text_img_gen)[0]
        num_proc = min(word_batch['the_input'].shape[0], num_left)
        decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc], word_batch['labeltype_input'][0:num_proc])
        for j in range(0, num_proc):
            ocr_result = deaccent(unicode(re.sub("[\+\/]", "", re.sub("\\s", "", decoded_res[j])), 'utf-8'))
            gold_label = re.sub("[\+\/]", "", re.sub("\\s", "", word_batch['source_str'][j]))
            if gold_label == ocr_result:
                right += 1
            else:
                wrong += 1
            edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
            mean_ed += float(edit_dist)
            mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
        num_left -= num_proc
    absacc = float(right) / (float(right) + float(wrong))
    mean_norm_ed = mean_norm_ed / num
    mean_ed = mean_ed / num
    outline = ' Out of %d samples: Mean edit distance: %.3f Mean normalized edit distance: %0.3f\n Absolute accuracy over labels is %0.2f\n' % (
        num, mean_ed, mean_norm_ed, absacc)
    print(outline)
    return mean_norm_ed, absacc
def text_distance(str1, str2):
    str1 = normalize_txt(str1)
    str2 = normalize_txt(str2)
    return editdistance.eval(str1, str2)
def track_decoding(self, decoded_str, expected_str):
    self.letter_edit_distance = editdistance.eval(expected_str, decoded_str)
    self.letter_error_rate = self.letter_edit_distance / len(expected_str)
    self.word_edit_distance = editdistance.eval(expected_str.split(), decoded_str.split())
    self.word_error_rate = self.word_edit_distance / len(expected_str.split())
    self.sum_letter_edit_distance += self.letter_edit_distance
    self.sum_letter_error_rate += self.letter_error_rate
    self.sum_word_edit_distance += self.word_edit_distance
    self.sum_word_error_rate += self.word_error_rate
    self.decodings_counter += 1
def run_step(self, model: SpeechModel, sess: tf.Session, stats: EvalStatistics,
             save: bool, verbose=True, feed_dict: Dict=None):
    global_step = model.global_step.eval()

    # Validate on data set and write summary
    if save:
        avg_loss, decoded, label, summary = model.step(sess, update=False, decode=True, return_label=True,
                                                       summary=True, feed_dict=feed_dict)
        model.summary_writer.add_summary(summary, global_step)
    else:
        avg_loss, decoded, label = model.step(sess, update=False, decode=True,
                                              return_label=True, feed_dict=feed_dict)

    if verbose:
        perplexity = np.exp(float(avg_loss)) if avg_loss < 300 else float("inf")
        print("validation average loss {:.2f} perplexity {:.2f}".format(avg_loss, perplexity))

    # Print decode
    decoded_ids_paths = [Evaluation.extract_decoded_ids(path) for path in decoded]
    for label_ids in Evaluation.extract_decoded_ids(label):
        expected_str = speecht.vocabulary.ids_to_sentence(label_ids)
        if verbose:
            print('expected: {}'.format(expected_str))
        for decoded_path in decoded_ids_paths:
            decoded_ids = next(decoded_path)
            decoded_str = speecht.vocabulary.ids_to_sentence(decoded_ids)
            stats.track_decoding(decoded_str, expected_str)
            if verbose:
                print('decoded: {}'.format(decoded_str))
                print('LED: {} LER: {:.2f} WED: {} WER: {:.2f}'.format(stats.letter_edit_distance,
                                                                       stats.letter_error_rate,
                                                                       stats.word_edit_distance,
                                                                       stats.word_error_rate))
def closest(self, date=datetime.date.today(), country=None,
            limit=datetime.timedelta(days=366)):
    """
    Get the closest CPI value for a specified date. The date defaults to
    today. A limit can be provided to exclude all values for dates further
    away than defined by the limit. This defaults to 366 days.
    """
    # Try to get the country
    try:
        possible_countries = [self.data[country]]
    except:
        possible_countries = [elem for elem in self.data.keys() if editdistance.eval(country, elem) < 3]
        if len(possible_countries) == 0:
            return "No country found, typo unlikely for ", country

    # Find the closest date
    country_cpi = {}
    for country in possible_countries:
        min_year_diff = 1000
        min_year = 0
        for year in self.data[country]:
            if min_year_diff > abs(date.year - int(year)):
                min_year_diff = abs(date.year - int(year))
                min_year = year
        country_cpi[country] = self.data[country][min_year]

    if len(country_cpi) == 1:
        return country_cpi[country_cpi.keys()[0]]
    else:
        return country_cpi
def compute_cer(results):
    """
    Arguments:
        results (list): list of ground truth and
            predicted sequence pairs.

    Returns the CER for the full set.
    """
    dist = sum(editdistance.eval(label, pred)
               for label, pred in results)
    total = sum(len(label) for label, _ in results)
    return dist / total
def __evaluateLevensteinDistance(self, question1, question2):
    leven_dis = levendis.eval(question1.lower(), question2.lower())
    return leven_dis
def fast_levenshtein_distance(self, source, target):
    """Wrapper for the distance function in the Levenshtein module

    Args:
        source (unicode): source word
        target (unicode): target word

    Returns:
        int: minimum number of Levenshtein edits required to get from
            `source` to `target`
    """
    return int(editdistance.eval(source, target))
def fast_levenshtein_distance_div_maxlen(self, source, target):
    """Levenshtein distance divided by maxlen

    Args:
        source (unicode): source word
        target (unicode): target word

    Returns:
        int: minimum number of Levenshtein edits required to get from
            `source` to `target` divided by the length of the longest
            of these arguments
    """
    maxlen = max(len(source), len(target))
    return int(editdistance.eval(source, target)) / maxlen
def calc_score(value, values):
    distance = 1000000000
    for v in values:
        if len(value) == len(v):
            d = bit_edit_distance(value, v)
        else:
            d = editdistance.eval(value, v) * 8
        distance = min(distance, d)
    return distance
def batched_wer(ref, hyp):
    ''' Computes mean WER
    ref: list of references
    hyp: list of corresponding hypotheses
    '''
    assert len(ref) == len(hyp)
    wer = 0.
    for r, f in zip(ref, hyp):
        rate = editdistance.eval(r, f) / len(r)
        wer += rate
    return wer / len(ref)
def strSimilarity(word1, word2):
    ''' Measure the similarity based on Edit Distance
    ### Measure how similar word1 is with respect to word2
    '''
    diff = ed.eval(word1.lower(), word2.lower())  # search
    # lcs = LCS(word1, word2)  # search
    length = max(len(word1), len(word2))
    if diff >= length:
        similarity = 0.0
    else:
        similarity = 1.0 * (length - diff) / length
    return similarity
def getFSNSMetrics(gtIdTransDict, methodIdTransDict):
    """Provides metrics for the FSNS dataset.

    FM, precision, recall and correctSequences are an implementation of the metrics described in
    "End-to-End Interpretation of the French Street Name Signs Dataset"
    [https://link.springer.com/chapter/10.1007%2F978-3-319-46604-0_30]

    Params:
        gtIdTransDict : sample_id to data dictionary. A simple file name to file contents might do.
        methodIdTransDict : sample_id to data dictionary. A simple file name to file contents might do.

    Returns:
        A tuple of floats between 0 and 1 with all measurements worth reporting:
        FM, precision, recall, and globally correct word transcriptions. If someone returned
        "rue" as the transcription of every image, assuming half the images have it, he
        would get a precision of 50%, a recall of ~5% and an FM of ~9.1%.
        He would get a correctSequences score of 0%, and a similarity of e%.
    """
    def compareTexts(sampleTxt, gtTxt):
        relevant = gtTxt.lower().split()
        retrieved = sampleTxt.lower().split()
        correct = (set(relevant).intersection(set(retrieved)))
        similarity = 1.0 / (1 + editdistance.eval(gtTxt.lower(), sampleTxt.lower()))
        res = (len(correct), len(relevant), len(retrieved), relevant == retrieved, similarity)
        return res

    mDict = {k: '' for k in gtIdTransDict.keys()}
    mDict.update(methodIdTransDict)
    methodIdTransDict = mDict
    methodKeys = sorted(methodIdTransDict.keys())
    gtKeys = sorted(gtIdTransDict.keys())
    if len(methodKeys) != len(set(methodKeys)) or len(gtKeys) != len(set(gtKeys)) or len(set(methodKeys) - set(gtKeys)) > 0:
        # gt and method disagree on samples
        sys.stderr.write("GT and submission disagree on the sample ids\n")
        sys.exit(1)
    corectRelevantRetrievedSimilarity = np.zeros([len(gtKeys), 5], dtype='float32')
    for k in range(len(gtKeys)):
        sId = gtKeys[k]
        corectRelevantRetrievedSimilarity[k, :] = compareTexts(methodIdTransDict[sId], gtIdTransDict[sId])
    precision = (corectRelevantRetrievedSimilarity[:, 0].sum() / (corectRelevantRetrievedSimilarity[:, 1].sum()))
    recall = (corectRelevantRetrievedSimilarity[:, 0].sum() / (corectRelevantRetrievedSimilarity[:, 2].sum()))
    FM = (2 * precision * recall) / (precision + recall)
    correctSequences = corectRelevantRetrievedSimilarity[:, 3].mean()
    similarity = corectRelevantRetrievedSimilarity[:, 4].mean()
    combinedSoftMetric = (1 - FM) * FM + FM * similarity  # The better FM is, the less it matters in the overall score
    return combinedSoftMetric, FM, precision, recall, similarity, correctSequences, corectRelevantRetrievedSimilarity
def _correct(observed_sentence, bigrams, distribution, max_error_rate):
    """Corrects a given sentence.

    Note: The lower the max_error_rate, the faster the algorithm, but the
    likelier it will fail.

    Args:
        observed_sentence: Observed sentence.
        bigrams: First-order Markov chain of likely word sequences.
        distribution: Error probability distribution function.
        max_error_rate: Maximum number of errors in a word to consider.

    Returns:
        Ordered list of tuples of (corrected sentence, its probability).
        Most likely interpretations come first.
    """
    trellis = [{Sentence.START: (1.0, None)}]
    observed_words = list(observed_sentence)
    number_of_words = len(observed_words)

    for k in range(1, number_of_words):
        observed_word = observed_words[k]
        max_errors = int(len(observed_word) * max_error_rate) + 1
        current_states = {}
        previous_states = trellis[k - 1]
        trellis.append(current_states)

        for previous_word in previous_states:
            previous_prob = previous_states[previous_word][0]
            future_states = bigrams.yield_future_states((previous_word,))
            for possible_word, conditional_prob in future_states:
                # Conditional probability: P(X_k | X_k-1) * previous
                # probability.
                total_prob = conditional_prob * previous_prob

                # Emission probability: P(E_k | X_k).
                distance = editdistance.eval(observed_word, possible_word)
                total_prob *= distribution(distance)

                # Ignore states that have too many mistakes.
                if distance > max_errors:
                    continue

                # Only keep link of max probability.
                if possible_word in current_states:
                    if current_states[possible_word][0] >= total_prob:
                        continue
                current_states[possible_word] = (total_prob, previous_word)

    # Find most likely ending.
    interpretations = list(_backtrack_path(trellis, x) for x in trellis[-1])
    interpretations.sort(key=lambda x: x[1], reverse=True)
    return interpretations
def test(self, multilabel_threshold=0.5):
    if not self.include_test:
        raise ValueError('Please do not call .test() if no test data is available.')

    score_dict = {}

    # get test predictions:
    test_in = {}
    if self.include_token:
        test_in['focus_in'] = self.test_X_focus
    if self.include_context:
        test_in['context_in'] = self.test_contexts

    test_preds = self.model.predict(test_in,
                                    batch_size=self.batch_size)
    if isinstance(test_preds, np.ndarray):
        test_preds = [test_preds]

    if self.include_lemma:
        print('::: Test scores (lemmas) :::')
        pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=test_preds[self.lemma_out_idx])
        if self.postcorrect:
            for i in range(len(pred_lemmas)):
                if pred_lemmas[i] not in self.known_lemmas:
                    pred_lemmas[i] = min(self.known_lemmas,
                                         key=lambda x: editdistance.eval(x, pred_lemmas[i]))
        score_dict['test_lemma'] = evaluation.single_label_accuracies(gold=self.test_lemmas,
                                                                      silver=pred_lemmas,
                                                                      test_tokens=self.test_tokens,
                                                                      known_tokens=self.preprocessor.known_tokens)

    if self.include_pos:
        print('::: Test scores (pos) :::')
        pred_pos = self.preprocessor.inverse_transform_pos(predictions=test_preds[self.pos_out_idx])
        score_dict['test_pos'] = evaluation.single_label_accuracies(gold=self.test_pos,
                                                                    silver=pred_pos,
                                                                    test_tokens=self.test_tokens,
                                                                    known_tokens=self.preprocessor.known_tokens)

    if self.include_morph:
        print('::: Test scores (morph) :::')
        pred_morph = self.preprocessor.inverse_transform_morph(predictions=test_preds[self.morph_out_idx],
                                                               threshold=multilabel_threshold)
        if self.include_morph == 'label':
            score_dict['test_morph'] = evaluation.single_label_accuracies(gold=self.test_morph,
                                                                          silver=pred_morph,
                                                                          test_tokens=self.test_tokens,
                                                                          known_tokens=self.preprocessor.known_tokens)
        elif self.include_morph == 'multilabel':
            score_dict['test_morph'] = evaluation.multilabel_accuracies(gold=self.test_morph,
                                                                        silver=pred_morph,
                                                                        test_tokens=self.test_tokens,
                                                                        known_tokens=self.preprocessor.known_tokens)

    return score_dict
def fix_ambiguous(ambiguous_sbi):
    """
    For each ambiguous sbi code find the most likely candidate

    0 vs.id,
    1 vs.naam,
    2 codes.hr_code,
    3 codes.alt_code,
    4 codes.title,
    5 codes.alt_title,
    6 codes.sub_cat,
    7 codes.alt_sub_cat,
    8 codes.mks_title
    """
    original_count = 0
    suggestion_count = 0

    for row in ambiguous_sbi:
        normalcode = row[2]
        zerocode = row[3]
        desc1 = row[4]
        desc2 = row[5]
        original = row[8]

        distance_desc1 = editdistance.eval(desc1, original)
        distance_desc2 = editdistance.eval(desc2, original)

        if distance_desc1 > distance_desc2:
            # the alternative match with 0 is better
            suggestion_count += 1
            ves = hrmodels.Vestiging.objects.get(id=row[0])
            invalid_activiteit = ves.activiteiten.get(sbi_code=normalcode)
            # fix the code
            invalid_activiteit.sbi_code = zerocode
            # save the corrected sbi code
            invalid_activiteit.save()
            # now save updated code
        else:
            # do nothing default is fine
            original_count += 1

        log.debug(f'{normalcode}, {zerocode}, {desc1[:18]}, {desc2[:18]}, {original[:18]}, {distance_desc1}, {distance_desc2}')  # noqa

    log.debug("%s-%s = Original-Suggestion", original_count, suggestion_count)