I am trying to build a training dataset for NER. I have a large amount of data that needs to be tagged, and unnecessary sentences need to be removed; once those sentences are removed, the index values have to be updated to match. Yesterday I saw some users post incredibly useful snippets for this, and now I can't find them. By adapting their snippets I can outline my problem briefly.
Let's take a training sample:
data = [{"content":'''Hello we are hans and john. I enjoy playing Football. I love eating grapes. Hanaan is great.''',"annotations":[{"id":1,"start":13,"end":17,"tag":"name"}, {"id":2,"start":22,"end":26,"tag":"name"}, {"id":3,"start":68,"end":74,"tag":"fruit"}, {"id":4,"start":76,"end":82,"tag":"name"}]}]
It can be visualized with the following spaCy displacy code:
import json
import spacy
from spacy import displacy

data = [{"content": '''Hello we are hans and john. I enjoy playing Football. I love eating grapes. Hanaan is great.''',
         "annotations": [{"id": 1, "start": 13, "end": 17, "tag": "name"},
                         {"id": 2, "start": 22, "end": 26, "tag": "name"},
                         {"id": 3, "start": 68, "end": 74, "tag": "fruit"},
                         {"id": 4, "start": 76, "end": 82, "tag": "name"}]}]

data_index = 0  # index of the record to visualize (undefined in the original snippet)

# Collect (start, end, tag) tuples from the annotations
annot_tags = data[data_index]["annotations"]
entities = []
for j in annot_tags:
    start = j["start"]
    end = j["end"]
    tag = j["tag"]
    entitie = (start, end, tag)
    entities.append(entitie)

data_gen = (data[data_index]["content"], {"entities": entities})
data_one = []
data_one.append(data_gen)

# Build a Doc and attach the character spans as entities
nlp = spacy.blank('en')
raw_text = data_one[0][0]
doc = nlp.make_doc(raw_text)
spans = data_one[0][1]["entities"]
ents = []
for span_start, span_end, label in spans:
    ent = doc.char_span(span_start, span_end, label=label)
    if ent is None:
        continue
    ents.append(ent)
doc.ents = ents
displacy.render(doc, style="ent", jupyter=True)
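Note that doc.char_span returns None when the offsets do not line up with token boundaries, which is why the loop above skips such spans. If real data has slightly misaligned offsets, spaCy 3 can snap them to the nearest tokens instead; a tiny standalone illustration (not needed for the sample data above):

import spacy

nlp = spacy.blank("en")
doc = nlp.make_doc("I love eating grapes.")
# The default alignment_mode="strict" would return None here, because index 9
# falls inside the token "eating"; "expand" widens the span to whole tokens.
span = doc.char_span(2, 9, label="demo", alignment_mode="expand")
print(span.text)  # love eating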
The output will be:

[Output 1 - displacy rendering of the tagged sample]
Now I want to remove the untagged sentences and update the index values accordingly, so the desired output is as follows.
[Desired output - displacy rendering with the untagged sentence removed]
Also, the data must end up in the following format: the untagged sentence removed and the index values updated, so that I get the rendering shown above.
Desired output data
[{"content":'''Hello we are hans and john. I love eating grapes. Hanaan is great.''',"annotations":[{"id":1,"start":13,"end":17,"tag":"name"}, {"id":2,"start":22,"end":26,"tag":"name"}, {"id":3,"start":42,"end":48,"tag":"fruit"}, {"id":4,"start":50,"end":56,"tag":"name"}]}]
Yesterday I followed a post and got code that nearly works.
Code
import re

data = [{"content": '''Hello we are hans and john. I enjoy playing Football. I love eating grapes. Hanaan is great.''',
         "annotations": [{"id": 1, "start": 13, "end": 17, "tag": "name"},
                         {"id": 2, "start": 22, "end": 26, "tag": "name"},
                         {"id": 3, "start": 68, "end": 74, "tag": "fruit"},
                         {"id": 4, "start": 76, "end": 82, "tag": "name"}]}]

# Attach the surface word to each annotation
for idx, each in enumerate(data[0]['annotations']):
    start = each['start']
    end = each['end']
    word = data[0]['content'][start:end]
    data[0]['annotations'][idx]['word'] = word

# Naive sentence split on '.'
sentences = [{'sentence': x.strip() + '.', 'checked': False}
             for x in data[0]['content'].split('.')]

new_data = [{'content': '', 'annotations': []}]
for idx, each in enumerate(data[0]['annotations']):
    for idx_alpha, sentence in enumerate(sentences):
        # Once a sentence is marked checked, later annotations in that same
        # sentence are skipped (this is where john gets lost below)
        if sentence['checked'] == True:
            continue
        temp = each.copy()
        check_word = temp['word']
        if check_word in sentence['sentence']:
            start_idx = re.search(r'\b({})\b'.format(check_word), sentence['sentence']).start()
            end_idx = start_idx + len(check_word)
            current_len = len(new_data[0]['content'])
            new_data[0]['content'] += sentence['sentence'] + ' '
            temp.update({'start': start_idx + current_len, 'end': end_idx + current_len})
            new_data[0]['annotations'].append(temp)
            sentences[idx_alpha]['checked'] = True
            break

print(new_data)
Output
[{'content': 'Hello we are hans and john. I love eating grapes. Hanaan is great. ', 'annotations': [{'id': 1, 'start': 13, 'end': 17, 'tag': 'name', 'word': 'hans'}, {'id': 3, 'start': 42, 'end': 48, 'tag': 'fruit', 'word': 'grapes'}, {'id': 4, 'start': 50, 'end': 56, 'tag': 'name', 'word': 'Hanaan'}]}]
Here the name john is lost: once its sentence is marked as checked by the first match (hans), the remaining annotations in that sentence are never revisited. I can't afford to lose tags when a sentence contains more than one.
This is a fairly tricky task because you need to identify sentence boundaries: a simple split on '.' may not work, since it will split things like 'Mr.'.
Since you are already using spaCy, why not let it detect the sentences, then iterate over them, work out the new start/end indices, skip any sentence that contains no entities, and rebuild the content from what remains.
import json
import spacy
from spacy import displacy
import re

data = [{"content": '''Hello we are hans and john. I enjoy playing Football. I love eating grapes. Hanaan is great. Mr. Jones is nice.''',
         "annotations": [{"id": 1, "start": 13, "end": 17, "tag": "name"},
                         {"id": 2, "start": 22, "end": 26, "tag": "name"},
                         {"id": 3, "start": 68, "end": 74, "tag": "fruit"},
                         {"id": 4, "start": 76, "end": 82, "tag": "name"},
                         {"id": 5, "start": 93, "end": 102, "tag": "name"}]}]

# Attach the surface word to each annotation
for idx, each in enumerate(data[0]['annotations']):
    start = each['start']
    end = each['end']
    word = data[0]['content'][start:end]
    data[0]['annotations'][idx]['word'] = word

text = data[0]['content']
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
doc = nlp(text)

sentences = [i for i in doc.sents]
annotations = data[0]['annotations']
new_data = [{"content": '', 'annotations': []}]

for sentence in sentences:
    idx_to_remove = []
    for idx, annotation in enumerate(annotations):
        if annotation['word'] in sentence.text:
            temp = annotation.copy()
            start_idx = re.search(r'\b({})\b'.format(annotation['word']), sentence.text).start()
            end_idx = start_idx + len(annotation['word'])
            current_len = len(new_data[0]['content'])
            temp.update({'start': start_idx + current_len, 'end': end_idx + current_len})
            new_data[0]['annotations'].append(temp)
            idx_to_remove.append(idx)
    # Only keep the sentence if it contained at least one entity. The matched
    # annotations are always the first remaining ones, because both the
    # annotation list and the sentences are in document order.
    if len(idx_to_remove) > 0:
        new_data[0]['content'] += sentence.text + ' '
        for x in range(0, len(idx_to_remove)):
            del annotations[0]
Output:
print(new_data)

[{'content': 'Hello we are hans and john. I love eating grapes. Hanaan is great. Mr. Jones is nice. ',
  'annotations': [{'id': 1, 'start': 13, 'end': 17, 'tag': 'name', 'word': 'hans'},
                  {'id': 2, 'start': 22, 'end': 26, 'tag': 'name', 'word': 'john'},
                  {'id': 3, 'start': 42, 'end': 48, 'tag': 'fruit', 'word': 'grapes'},
                  {'id': 4, 'start': 50, 'end': 56, 'tag': 'name', 'word': 'Hanaan'},
                  {'id': 5, 'start': 67, 'end': 76, 'tag': 'name', 'word': 'Mr. Jones'}]}]
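As a side note, matching annotations back by their surface word can misfire when the same word appears in more than one sentence or contains regex metacharacters. Since the original start/end offsets are already known, an alternative sketch (my own variant, not the code above) maps each annotation to its sentence purely arithmetically via sent.start_char:

import spacy

def drop_unlabelled_sentences(record, nlp):
    """Keep only sentences containing at least one annotation, shifting
    offsets arithmetically instead of re-searching for each word."""
    doc = nlp(record["content"])
    new_content, new_annotations = "", []
    for sent in doc.sents:
        # Annotations whose original span lies entirely inside this sentence
        hits = [a for a in record["annotations"]
                if sent.start_char <= a["start"] and a["end"] <= sent.end_char]
        if not hits:
            continue  # untagged sentence: drop it
        shift = len(new_content) - sent.start_char
        for a in hits:
            new_annotations.append({**a, "start": a["start"] + shift,
                                    "end": a["end"] + shift})
        new_content += sent.text + " "
    return [{"content": new_content, "annotations": new_annotations}]

nlp = spacy.load("en_core_web_sm")
print(drop_unlabelled_sentences(data[0], nlp))

Annotations that straddle a sentence boundary are silently dropped by this sketch; handle them separately if they can occur in your data.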