We extracted the following 4 code examples from open-source Python projects to illustrate how to use spacy.tokens().
from os import makedirs
from os.path import basename, join, splitext

import spacy


def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with Spacy NLP pipeline
    and serialize analyses to directory spacy_dir
    """
    if not nlp:
        nlp = spacy.load('en')

    makedirs(spacy_dir, exist_ok=True)

    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        text = open(txt_fname).read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)
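sorted_glob and write_doc are helper functions defined elsewhere in the source project and are not shown in the example above. A minimal sketch of what they might look like (the serialization format is an assumption; the original project may store the Doc differently), using spaCy's Doc.to_bytes():

from glob import glob


def sorted_glob(pattern):
    # Match files and return them in a deterministic order
    return sorted(glob(pattern))


def write_doc(fname, doc):
    # Serialize the analysed Doc with spaCy's binary format
    with open(fname, 'wb') as f:
        f.write(doc.to_bytes())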
# __call__ of a custom whitespace tokenizer class;
# Doc is spacy.tokens.Doc and self.vocab is the shared spaCy Vocab.
def __call__(self, text):
    words = text.split(' ')
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
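The __call__ above is only a fragment of a custom tokenizer class. A minimal, runnable sketch of how such a tokenizer can be plugged into a pipeline (the class name WhitespaceTokenizer and the use of spacy.blank are assumptions, not part of the original example):

import spacy
from spacy.tokens import Doc


class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


nlp = spacy.blank('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("Hello brave new world")
print([token.text for token in doc])  # ['Hello', 'brave', 'new', 'world']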
import numpy as np


def map_chars_to_tokens(doc):
    """
    Creates a mapping from input characters to corresponding input tokens

    For instance, given the input:

        Nuclear theory ...
        |||||||||||||||
        012345678911111...
                  01234

    it returns an array of size equal to the number of input chars plus one,
    which looks like this:

        000000011111112...

    This means that the first 7 chars map to the first token ("Nuclear"),
    the next 7 chars (including the initial whitespace) map to the second
    token ("theory") and so on.
    """
    n_chars = len(doc.text_with_ws)
    char2token = np.zeros(n_chars + 1, 'int')
    start_char = 0
    for token in doc:
        # Chars up to the end of this token (including any whitespace
        # preceding it) map to this token's index
        end_char = token.idx + len(token)
        char2token[start_char:end_char] = token.i
        start_char = end_char
    # The extra final entry points one past the last token
    char2token[-1] = char2token[-2] + 1
    return char2token
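A short usage sketch showing how the returned array converts character offsets into token indices (the sample sentence and the use of spacy.blank are assumptions made for illustration):

import spacy

nlp = spacy.blank('en')
doc = nlp("Nuclear theory is hard")
char2token = map_chars_to_tokens(doc)

# Character 3 ('l') lies inside "Nuclear"; character 9 ('h') inside "theory"
print(char2token[3], doc[int(char2token[3])].text)  # 0 Nuclear
print(char2token[9], doc[int(char2token[9])].text)  # 1 theory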
def _original_string(self, tokens, offsets):
    """
    Recreate string with original char offsets

    :param tokens: token strings
    :param offsets: character offset of each token in the original string
    :return: string with each token placed at its original offset
    """
    s = ""
    for t, i in zip(tokens, offsets):
        # Pad with spaces until the next token starts at its recorded offset
        diff = i - len(s)
        if diff:
            s += ' ' * diff
        s += t
    return s
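The method pads the running string with spaces until each token starts at its recorded offset, so the original spacing is restored. A quick illustration (self is unused inside the method, so a None stand-in works for a standalone call):

tokens = ['Hello', 'world', '!']
offsets = [0, 8, 14]

print(repr(_original_string(None, tokens, offsets)))
# -> 'Hello   world !'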