@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  // Reuse one MMSegTokenizer per thread: building the dictionary-backed
  // segmenter is expensive, so only fall back to a fresh instance when
  // setReader(...) fails.
  MMSegTokenizer tokenizer = tokenizerLocal.get();
  if (tokenizer == null) {
    tokenizer = newTokenizer(input);
  } else {
    try {
      tokenizer.setReader(input);
    } catch (IOException e) {
      tokenizer = newTokenizer(input);
      log.info("MMSegTokenizer.setReader I/O error: " + e.getMessage());
    }
  }
  return tokenizer;
}
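// The method above depends on a newTokenizer(Reader) helper that is not shown.
// For the ThreadLocal reuse to work, that helper must both construct the
// tokenizer and cache it in tokenizerLocal. A minimal sketch under that
// assumption (newSeg() standing in for however the factory builds its
// mmseg4j segmenter):
private MMSegTokenizer newTokenizer(Reader input) {
  MMSegTokenizer tokenizer = new MMSegTokenizer(newSeg(), input);
  tokenizerLocal.set(tokenizer); // cache per thread so create() can reuse it
  return tokenizer;
}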
static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
  Object[] args = new Object[paramTypes.length];
  for (int i = 0; i < args.length; i++) {
    Class<?> paramType = paramTypes[i];
    if (paramType == Reader.class) {
      args[i] = reader;
    } else if (paramType == AttributeFactory.class) {
      // TODO: maybe the collator one...???
      args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    } else if (paramType == AttributeSource.class) {
      // TODO: args[i] = new AttributeSource();
      // this is currently too scary to deal with!
      args[i] = null; // force IAE
    } else {
      args[i] = newRandomArg(random, paramType);
    }
  }
  return args;
}
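// A sketch of how a randomized test might drive newTokenizerArgs: reflect a
// Tokenizer constructor, fill its parameters, and instantiate. The
// WhitespaceTokenizer(Version, Reader) constructor is chosen purely for
// illustration (its Version parameter would be filled by the newRandomArg
// helper referenced above); assumes imports for Constructor, Random,
// StringReader, WhitespaceTokenizer, and Version.
static Tokenizer newRandomWhitespaceTokenizer(Random random) throws Exception {
  Constructor<WhitespaceTokenizer> ctor =
      WhitespaceTokenizer.class.getConstructor(Version.class, Reader.class);
  Object[] args = newTokenizerArgs(random, new StringReader("sample text"),
      ctor.getParameterTypes());
  return ctor.newInstance(args);
}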
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  if (luceneMatchVersion.onOrAfter(Version.LUCENE_44)) {
    if (!EdgeNGramTokenFilter.Side.FRONT.getLabel().equals(side)) {
      throw new IllegalArgumentException(EdgeNGramTokenizer.class.getSimpleName()
          + " does not support backward n-grams as of Lucene 4.4");
    }
    return new EdgeNGramTokenizer(luceneMatchVersion, input, minGramSize, maxGramSize);
  } else {
    return new Lucene43EdgeNGramTokenizer(luceneMatchVersion, input, side, minGramSize, maxGramSize);
  }
}
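// A self-contained illustration of the Lucene 4.4 branch above: front edge
// n-grams of "abc" with minGram=1 and maxGram=3 come out as "a", "ab", "abc".
// Assumes imports for StringReader, CharTermAttribute, and Version.
public static void main(String[] args) throws Exception {
  Tokenizer t = new EdgeNGramTokenizer(Version.LUCENE_44, new StringReader("abc"), 1, 3);
  CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
  t.reset();
  while (t.incrementToken()) {
    System.out.println(term); // prints a, ab, abc in order
  }
  t.end();
  t.close();
}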
/** Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link AttributeFactory}. */
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  if (luceneMatchVersion.onOrAfter(Version.LUCENE_44)) {
    return new NGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize);
  } else {
    return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
  }
}
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  if (reverse) {
    return new ReversePathHierarchyTokenizer(factory, input, delimiter, replacement, skip);
  }
  return new PathHierarchyTokenizer(factory, input, delimiter, replacement, skip);
}
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  // IKTokenizer builds its own attributes; the supplied factory is not used.
  return new IKTokenizer(input, this.useSmart);
}
@Override
public CJKTokenizer create(AttributeFactory factory, Reader in) {
  return new CJKTokenizer(factory, in);
}
@Override
public ArabicLetterTokenizer create(AttributeFactory factory, Reader input) {
  return new ArabicLetterTokenizer(luceneMatchVersion, factory, input);
}
/**
 * Splits the input using the configured pattern.
 */
@Override
public PatternTokenizer create(final AttributeFactory factory, final Reader in) {
  return new PatternTokenizer(factory, in, pattern, group);
}
@Override
public RussianLetterTokenizer create(AttributeFactory factory, Reader in) {
  return new RussianLetterTokenizer(luceneMatchVersion, factory, in);
}
@Override
public WikipediaTokenizer create(AttributeFactory factory, Reader input) {
  return new WikipediaTokenizer(factory, input, WikipediaTokenizer.TOKENS_ONLY,
      Collections.<String>emptySet());
}
@Override
public StandardTokenizer create(AttributeFactory factory, Reader input) {
  StandardTokenizer tokenizer = new StandardTokenizer(luceneMatchVersion, factory, input);
  tokenizer.setMaxTokenLength(maxTokenLength);
  return tokenizer;
}
@Override
public ClassicTokenizer create(AttributeFactory factory, Reader input) {
  ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, factory, input);
  tokenizer.setMaxTokenLength(maxTokenLength);
  return tokenizer;
}
@Override
public UAX29URLEmailTokenizer create(AttributeFactory factory, Reader input) {
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, factory, input);
  tokenizer.setMaxTokenLength(maxTokenLength);
  return tokenizer;
}
@Override
public ICUTokenizer create(AttributeFactory factory, Reader input) {
  assert config != null : "inform must be called first!";
  return new ICUTokenizer(factory, input, config);
}
@Override
public ChineseTokenizer create(AttributeFactory factory, Reader in) {
  return new ChineseTokenizer(factory, in);
}
@Override
public SentenceTokenizer create(AttributeFactory factory, Reader input) {
  return new SentenceTokenizer(factory, input);
}
@Override
public UIMAAnnotationsTokenizer create(AttributeFactory factory, Reader input) {
  return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters,
      factory, input);
}
@Override
public UIMATypeAwareAnnotationsTokenizer create(AttributeFactory factory, Reader input) {
  return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath,
      configurationParameters, factory, input);
}
/** Creates a TokenStream of the specified input using the default attribute factory. */
public final Tokenizer create(Reader input) {
  return create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input);
}
/** Creates a TokenStream of the specified input using the given AttributeFactory. */
public abstract Tokenizer create(AttributeFactory factory, Reader input);
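// The two methods above define the whole factory contract. A minimal usage
// sketch, assuming imports for StringReader, Tokenizer, and CharTermAttribute:
// create a Tokenizer with the default attribute factory and run the standard
// reset/incrementToken/end/close consumption loop.
static void dumpTokens(TokenizerFactory factory, String text) throws IOException {
  Tokenizer tok = factory.create(new StringReader(text)); // default attribute factory
  CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
  tok.reset();                   // mandatory before the first incrementToken()
  while (tok.incrementToken()) {
    System.out.println(term.toString());
  }
  tok.end();                     // records final offset state
  tok.close();
}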
@Override
public KeywordTokenizer create(AttributeFactory factory, Reader input) {
  return new KeywordTokenizer(factory, input, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
}
@Override
public WhitespaceTokenizer create(AttributeFactory factory, Reader input) {
  return new WhitespaceTokenizer(luceneMatchVersion, factory, input);
}
@Override
public LetterTokenizer create(AttributeFactory factory, Reader input) {
  return new LetterTokenizer(luceneMatchVersion, factory, input);
}
@Override
public LowerCaseTokenizer create(AttributeFactory factory, Reader input) {
  return new LowerCaseTokenizer(luceneMatchVersion, factory, input);
}
@Override
public JapaneseTokenizer create(AttributeFactory factory, Reader input) {
  return new JapaneseTokenizer(factory, input, userDictionary, discardPunctuation, mode);
}