@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    // capture state lazily - maybe no SinkFilter accepts this state
    AttributeSource.State state = null;
    for (WeakReference<SinkTokenStream> ref : sinks) {
      final SinkTokenStream sink = ref.get();
      if (sink != null) {
        if (sink.accept(this)) {
          if (state == null) {
            state = this.captureState();
          }
          sink.addState(state);
        }
      }
    }
    return true;
  }
  return false;
}
@Override
public boolean accept(AttributeSource source) {
  if (termAtt == null) {
    termAtt = source.addAttribute(CharTermAttribute.class);
  }
  try {
    // We don't care about the date itself, just that the term can be parsed as a date.
    Date date = dateFormat.parse(termAtt.toString());
    if (date != null) {
      return true;
    }
  } catch (ParseException e) {
    // not a date; fall through and reject the token
  }
  return false;
}
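// Hedged wiring sketch (not taken from the snippets above), assuming the Lucene
// 4.x/5.x-era TeeSinkTokenFilter/SinkFilter API, which was removed in later versions.
// 'source' stands for any already-constructed Tokenizer/TokenStream over the input text.
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
TeeSinkTokenFilter.SinkTokenStream dates =
    tee.newSinkTokenStream(new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.ROOT)));
// Consuming 'tee' pushes every token downstream as usual; 'dates' replays only the
// tokens this accept() method recognized as parseable dates.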
@Override
public final boolean incrementToken() throws IOException {
  if (cache == null) {
    // fill cache lazily
    cache = new LinkedList<AttributeSource.State>();
    fillCache();
    iterator = cache.iterator();
  }
  if (!iterator.hasNext()) {
    // the cache is exhausted, return false
    return false;
  }
  // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
  restoreState(iterator.next());
  return true;
}
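// Standalone illustration (an assumption for clarity, not part of the filter above) of the
// captureState()/restoreState() round trip that the cache in incrementToken() relies on:
AttributeSource src = new AttributeSource();
CharTermAttribute term = src.addAttribute(CharTermAttribute.class);
term.setEmpty().append("first");
AttributeSource.State saved = src.captureState();   // snapshot of all attribute values
term.setEmpty().append("second");                    // mutate in place, as filters do
src.restoreState(saved);                             // attribute values are back to "first"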
private String[] walkTokens() throws IOException {
  List<String> wordList = new ArrayList<>();
  while (input.incrementToken()) {
    CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    char[] buffer = textAtt.buffer();
    String word = new String(buffer, 0, offsetAtt.endOffset() - offsetAtt.startOffset());
    wordList.add(word);
    AttributeSource attrs = input.cloneAttributes();
    tokenAttrs.add(attrs);
  }
  String[] words = new String[wordList.size()];
  for (int i = 0; i < words.length; i++) {
    words[i] = wordList.get(i);
  }
  return words;
}
TokenizerWrapper() {
  super();
  tokenizerTimestamp = dictionaryTimestamp;
  tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
  try {
    // Share the delegate tokenizer's attribute maps and state with this wrapper via reflection.
    final Field attributesField = getAccessibleField(AttributeSource.class, "attributes");
    final Object attributesObj = attributesField.get(tokenizer);
    attributesField.set(this, attributesObj);

    final Field attributeImplsField = getAccessibleField(AttributeSource.class, "attributeImpls");
    final Object attributeImplsObj = attributeImplsField.get(tokenizer);
    attributeImplsField.set(this, attributeImplsObj);

    final Field currentStateField = getAccessibleField(AttributeSource.class, "currentState");
    final Object currentStateObj = currentStateField.get(tokenizer);
    currentStateField.set(this, currentStateObj);
  } catch (final Exception e) {
    throw new IllegalStateException("Failed to update the tokenizer.", e);
  }
}
static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
  Object[] args = new Object[paramTypes.length];
  for (int i = 0; i < args.length; i++) {
    Class<?> paramType = paramTypes[i];
    if (paramType == Reader.class) {
      args[i] = reader;
    } else if (paramType == AttributeSource.class) {
      // TODO: args[i] = new AttributeSource();
      // this is currently too scary to deal with!
      args[i] = null; // force IAE
    } else {
      args[i] = newRandomArg(random, paramType);
    }
  }
  return args;
}
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
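// Hedged sketch of reading the collected tokens back. The 'analyzer' variable, the
// field name, and the text are illustrative; it also assumes the stream exposes a
// CharTermAttribute and OffsetAttribute, as analyzers normally do.
List<AttributeSource> collected = analyzeTokenStream(analyzer.tokenStream("content", "quick brown fox"));
for (AttributeSource token : collected) {
  CharTermAttribute termAtt = token.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = token.getAttribute(OffsetAttribute.class);
  System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
}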
TokenizerWrapper() {
  super();
  tokenizerTimestamp = dictionaryTimestamp;
  tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
  try {
    Field attributesField = getAccessibleField(AttributeSource.class, "attributes");
    final Object attributesObj = attributesField.get(tokenizer);
    attributesField.set(this, attributesObj);

    Field attributeImplsField = getAccessibleField(AttributeSource.class, "attributeImpls");
    final Object attributeImplsObj = attributeImplsField.get(tokenizer);
    attributeImplsField.set(this, attributeImplsObj);

    Field currentStateField = getAccessibleField(AttributeSource.class, "currentState");
    final Object currentStateObj = currentStateField.get(tokenizer);
    currentStateField.set(this, currentStateObj);
  } catch (final Exception e) {
    throw new IllegalStateException("Failed to update the tokenizer.", e);
  }
}
static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
  Object[] args = new Object[paramTypes.length];
  for (int i = 0; i < args.length; i++) {
    Class<?> paramType = paramTypes[i];
    if (paramType == Reader.class) {
      args[i] = reader;
    } else if (paramType == AttributeFactory.class) {
      // TODO: maybe the collator one...???
      args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    } else if (paramType == AttributeSource.class) {
      // TODO: args[i] = new AttributeSource();
      // this is currently too scary to deal with!
      args[i] = null; // force IAE
    } else {
      args[i] = newRandomArg(random, paramType);
    }
  }
  return args;
}
@Override
@SuppressWarnings("unchecked")
protected TermsEnum getTermsEnum(final Terms terms, AttributeSource atts) throws IOException {
  // very strange: java.lang.Number itself is not Comparable, but all subclasses used here are
  if (min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0) {
    return TermsEnum.EMPTY;
  }
  return new NumericRangeTermsEnum(terms.iterator(null));
}
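// Standalone illustration of the comment above (not library code): Number itself is not
// Comparable, but the concrete wrapper types stored in min/max (Integer, Long, Float,
// Double) all are, so the unchecked cast is safe in practice.
Number min = 3, max = 7;                                             // autoboxed Integers
@SuppressWarnings("unchecked")
boolean emptyRange = ((Comparable<Number>) min).compareTo(max) > 0;  // false here: 3 <= 7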
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  if (lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
    return TermsEnum.EMPTY;
  }
  TermsEnum tenum = terms.iterator(null);
  if ((lowerTerm == null || (includeLower && lowerTerm.length == 0)) && upperTerm == null) {
    return tenum;
  }
  return new TermRangeTermsEnum(tenum, lowerTerm, upperTerm, includeLower, includeUpper);
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  TermsEnum tenum = terms.iterator(null);
  if (prefix.bytes().length == 0) {
    // no prefix -- match all terms for this field:
    return tenum;
  }
  return new PrefixTermsEnum(tenum, prefix.bytes());
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  if (maxEdits == 0 || prefixLength >= term.text().length()) {
    // can only match if it's exact
    return new SingleTermsEnum(terms.iterator(null), term.bytes());
  }
  return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
/**
 * Sets the attribute source and, if it changed, refreshes the cached attribute references.
 */
void setAttributeSource(AttributeSource attributeSource) {
  if (this.attributeSource != attributeSource) {
    this.attributeSource = attributeSource;
    termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
    posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
    payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
  }
}
private AttributeSource nextTok() throws IOException {
  if (buffer != null && !buffer.isEmpty()) {
    return buffer.removeFirst();
  } else {
    if (!exhausted && input.incrementToken()) {
      return this;
    } else {
      exhausted = true;
      return null;
    }
  }
}
private SlowSynonymMap match(SlowSynonymMap map) throws IOException {
  SlowSynonymMap result = null;
  if (map.submap != null) {
    AttributeSource tok = nextTok();
    if (tok != null) {
      // clone ourselves.
      if (tok == this) tok = cloneAttributes();
      // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
      CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
      SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());
      if (subMap != null) {
        // recurse
        result = match(subMap);
      }
      if (result != null) {
        matched.addFirst(tok);
      } else {
        // push back unmatched token
        pushTok(tok);
      }
    }
  }
  // no longer sequence matched, so if this node has synonyms, it's the match.
  if (result == null && map.synonyms != null) {
    result = map;
  }
  return result;
}
@Override
public boolean accept(AttributeSource source) {
  try {
    if (count >= lower && count < upper) {
      return true;
    }
    return false;
  } finally {
    count++;
  }
}
@Override
public boolean accept(AttributeSource source) {
  if (typeAtt == null) {
    typeAtt = source.addAttribute(TypeAttribute.class);
  }
  // check to see if this is a Category
  return (typeToMatch.equals(typeAtt.type()));
}
@Override
public final void end() throws IOException {
  super.end();
  AttributeSource.State finalState = captureState();
  for (WeakReference<SinkTokenStream> ref : sinks) {
    final SinkTokenStream sink = ref.get();
    if (sink != null) {
      sink.setFinalState(finalState);
    }
  }
}
@Override
public final boolean incrementToken() {
  // lazy init the iterator
  if (it == null) {
    it = cachedStates.iterator();
  }
  if (!it.hasNext()) {
    return false;
  }
  AttributeSource.State state = it.next();
  restoreState(state);
  return true;
}
@Override
public final boolean incrementToken() throws IOException {
  if (tokens != null && tokens.hasNext()) {
    AttributeSource.State state = tokens.next();
    restoreState(state);
    return true;
  }
  clearAttributes();
  int tokenType = scanner.getNextToken();
  if (tokenType == WikipediaTokenizerImpl.YYEOF) {
    return false;
  }
  String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
  if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false) {
    setupToken();
  } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true) {
    collapseTokens(tokenType);
  } else if (tokenOutput == BOTH) {
    // collapse into a single token, add it to tokens AND output the individual tokens
    // output the untokenized Token first
    collapseAndSaveTokens(tokenType, type);
  }
  int posinc = scanner.getPositionIncrement();
  if (first && posinc == 0) {
    posinc = 1; // don't emit posinc=0 for the first token!
  }
  posIncrAtt.setPositionIncrement(posinc);
  typeAtt.setType(type);
  first = false;
  return true;
}
private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
  // collapse
  StringBuilder buffer = new StringBuilder(32);
  int numAdded = scanner.setText(buffer);
  // TODO: how to know how much whitespace to add
  int theStart = scanner.yychar();
  int lastPos = theStart + numAdded;
  int tmpTokType;
  int numSeen = 0;
  List<AttributeSource.State> tmp = new ArrayList<>();
  setupSavedToken(0, type);
  tmp.add(captureState());
  // while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
  while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF
      && tmpTokType == tokenType
      && scanner.getNumWikiTokensSeen() > numSeen) {
    int currPos = scanner.yychar();
    // append whitespace
    for (int i = 0; i < (currPos - lastPos); i++) {
      buffer.append(' ');
    }
    numAdded = scanner.setText(buffer);
    setupSavedToken(scanner.getPositionIncrement(), type);
    tmp.add(captureState());
    numSeen++;
    lastPos = currPos + numAdded;
  }
  // trim the buffer
  // TODO: this is inefficient
  String s = buffer.toString().trim();
  termAtt.setEmpty().append(s);
  offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
  flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
  // The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
  if (tmpTokType != WikipediaTokenizerImpl.YYEOF) {
    scanner.yypushback(scanner.yylength());
  }
  tokens = tmp.iterator();
}
@Override
protected void swap(int i, int j) {
  AttributeSource.State tmp = buffered[i];
  buffered[i] = buffered[j];
  buffered[j] = tmp;

  int tmp2 = startOff[i];
  startOff[i] = startOff[j];
  startOff[j] = tmp2;

  tmp2 = posInc[i];
  posInc[i] = posInc[j];
  posInc[j] = tmp2;
}
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
  return new AttributeFactory() {
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      return (AttributeImpl) source.addAttribute(attClass);
    }
  };
}
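// Hedged usage sketch: a sub-tokenizer built with this factory writes straight into the
// attributes of 'parent' instead of creating its own. The no-arg WhitespaceTokenizer plus
// setReader() is an assumption (Lucene 5.x-era API; constructors vary by version), and the
// factory method above is called here as if it were accessible from this scope.
AttributeSource parent = new AttributeSource();
Tokenizer inner = new WhitespaceTokenizer(delegatingAttributeFactory(parent));
inner.setReader(new StringReader("attributes shared with the parent source"));
CharTermAttribute parentTerm = parent.addAttribute(CharTermAttribute.class);
inner.reset();
while (inner.incrementToken()) {
  // 'parentTerm' is the very same instance the inner tokenizer just populated
  System.out.println(parentTerm);
}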
public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
  super(source, input);
  if (bufferSize <= 0) {
    throw new IllegalArgumentException("bufferSize must be > 0");
  }
  termAtt.resizeBuffer(bufferSize);
}
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  if (first) {
    String[] words = walkTokens();
    if (words.length == 0) {
      return false;
    }
    createTags(words);
    first = false;
    indexToken = 0;
  }
  if (indexToken == tokenAttrs.size()) {
    return false;
  }
  AttributeSource as = tokenAttrs.get(indexToken);
  Iterator<? extends Class<? extends Attribute>> it = as.getAttributeClassesIterator();
  while (it.hasNext()) {
    Class<? extends Attribute> attrClass = it.next();
    if (!hasAttribute(attrClass)) {
      addAttribute(attrClass);
    }
  }
  as.copyTo(this);
  indexToken++;
  return true;
}
private void appendPayloads(String[] tags, int length) {
  for (int i = 0; i < length; i++) {
    AttributeSource attrs = tokenAttrs.get(i);
    if (tags[i] != null) {
      try {
        PayloadAttribute payloadAtt = attrs.hasAttribute(PayloadAttribute.class)
            ? attrs.getAttribute(PayloadAttribute.class)
            : attrs.addAttribute(PayloadAttribute.class);
        BytesRef bytesRef = new BytesRef(tags[i].toUpperCase(Locale.getDefault()).getBytes("UTF-8"));
        payloadAtt.setPayload(bytesRef);
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
      }
    }
  }
}
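// Hedged sketch of reading one of the payloads written above back as a String, assuming a
// payload was attached to the first token; StandardCharsets.UTF_8 is used here only for
// the illustration.
PayloadAttribute att = tokenAttrs.get(0).getAttribute(PayloadAttribute.class);
BytesRef payload = att.getPayload();
String tag = new String(payload.bytes, payload.offset, payload.length, StandardCharsets.UTF_8);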
@Override
protected TermsEnum getTermsEnum(final Terms terms, final AttributeSource atts) throws IOException {
  if ((maxEdits == 0) || (prefixLength >= term.text().length())) {
    // can only match if it's exact
    return new SingleTermsEnum(terms.iterator(null), term.bytes());
  }
  return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  MoaMatcher matcher = this.moaPattern.matcher("");
  TermsEnum termsEnum = terms.iterator();
  return new MoarTermsEnum(matcher, termsEnum);
}
/** For a DocsEnum, gets the 'other' reused enum.
 * Example: Pulsing(Standard).
 * When doing a term range query you are switching back and forth
 * between Pulsing and Standard.
 *
 * The way the reuse works is that Pulsing.other = Standard and
 * Standard.other = Pulsing.
 */
private DocsEnum getOther(DocsEnum de) {
  if (de == null) {
    return null;
  } else {
    final AttributeSource atts = de.attributes();
    return atts.addAttribute(PulsingEnumAttribute.class).enums().get(this);
  }
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  if (this.terms.size() == 0) {
    return TermsEnum.EMPTY;
  }
  return new SeekingTermSetTermsEnum(terms.iterator(null), this.terms, ords);
}