Python spacy 模块,tokens() 实例源码


项目:scienceie17    作者:OC-ScienceIE    | 项目源码 | 文件源码
def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with Spacy NLP pipeline and
    serialize analyses to directory spacy_dir.

    :param txt_dir: directory that is searched for '*.txt' input files
    :param spacy_dir: output directory; created if it does not exist
    :param nlp: callable pipeline applied to each text; when falsy, the
        model returned by spacy.load('en') is used
    """
    if not nlp:
        nlp = spacy.load('en')

    makedirs(spacy_dir, exist_ok=True)

    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        # Read inside a context manager so the file handle is closed
        # immediately instead of leaking one handle per input file.
        with open(txt_fname) as txt_file:
            text = txt_file.read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        # Output name: same base name as the input, with a '.spacy' extension.
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)
项目:botcycle    作者:D2KLab    | 项目源码 | 文件源码
def __call__(self, text):
    """
    Tokenize text by splitting on single space characters and wrap the
    resulting pieces in a Doc built on this tokenizer's vocab.

    :param text: the raw string to tokenize
    :return: a Doc whose words are the space-separated pieces of text
    """
    pieces = text.split(' ')
    # In this tokenizer every token 'owns' a subsequent space character,
    # so each piece gets a True flag.
    return Doc(self.vocab, words=pieces, spaces=[True for _ in pieces])
项目:scienceie17    作者:OC-ScienceIE    | 项目源码 | 文件源码
def map_chars_to_tokens(doc):
    """
    Creates a mapping from input characters to corresponding input tokens

    For instance, given the input:

    Nuclear theory ...

    it returns an array of size equal to the number of input chars plus one,
    which looks like this:

    [0 0 0 0 0 0 0 1 1 1 1 1 1 1 ...]

    This means that the first 7 chars map to the first token ("Nuclear"),
    the next 7 chars (including the initial whitespace) map to the second
    token ("theory") and so on.

    Characters that follow the last token (e.g. a trailing space) map to the
    last token. The extra final element holds the index one past the last
    token, i.e. the number of tokens; for a doc without tokens the result
    is a single 0.

    :param doc: iterable of tokens exposing `idx` (start char offset), `i`
        (token index) and a length, plus a `text_with_ws` attribute
    :return: integer numpy array of length len(doc.text_with_ws) + 1
    """
    n_chars = len(doc.text_with_ws)
    char2token = np.zeros(n_chars + 1, 'int')
    start_char = 0
    n_tokens = 0
    for token in doc:
        end_char = token.idx + len(token)
        # Whitespace preceding a token is attributed to that token, hence
        # the slice starts where the previous token ended.
        char2token[start_char:end_char] = token.i
        start_char = end_char
        n_tokens = token.i + 1
    if n_tokens:
        # Anything after the last token would otherwise keep its initial 0
        # and wrongly point at the first token.
        char2token[start_char:n_chars] = n_tokens - 1
    # Sentinel: one past the last token index. Set directly rather than
    # derived from char2token[-2], which does not exist for empty input.
    char2token[-1] = n_tokens
    return char2token
项目:snorkel-biocorpus    作者:HazyResearch    | 项目源码 | 文件源码
def _original_string(self, tokens, offsets):
        Recreate string with original char offsets
        :param tokens:
        :param offsets:
        s = ""
        for t, i in zip(tokens, offsets):
            diff = i - len(s)
            if diff:
                s += ' ' * diff
            s += t
        return s