public ComboTokenStream(TokenStream... tokenStreams) {
  // Load the TokenStreams, track their positions, and register their attributes
  this.positionedTokenStreams = new PositionedTokenStream[tokenStreams.length];
  for (int i = tokenStreams.length - 1; i >= 0; --i) {
    if (tokenStreams[i] == null) {
      continue;
    }
    this.positionedTokenStreams[i] = new PositionedTokenStream(tokenStreams[i]);
    // Register every attribute class seen in the current sub-stream's AttributeSource
    Iterator<Class<? extends Attribute>> iterator =
        this.positionedTokenStreams[i].getAttributeClassesIterator();
    while (iterator.hasNext()) {
      addAttribute(iterator.next());
    }
  }
  this.lastPosition = 0;
  // Create an initially empty queue. It is filled on the first incrementToken()
  // call, which must in turn call incrementToken() on each sub-TokenStream.
  this.readQueue = new PriorityQueue<PositionedTokenStream>(tokenStreams.length);
  readQueueResetted = false;
}
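The queue priming deferred to the first incrementToken() call could look roughly like the sketch below. This is an assumption, not the original implementation; it presumes that PositionedTokenStream forwards incrementToken() to its wrapped stream and implements Comparable, which the comparator-less java.util.PriorityQueue above already requires.

// Hypothetical sketch: advance every sub-stream once and enqueue those that
// still produce tokens; the PriorityQueue then yields them in position order.
private void fillReadQueue() throws IOException {
  readQueueResetted = true;
  for (PositionedTokenStream pts : positionedTokenStreams) {
    if (pts != null && pts.incrementToken()) {
      readQueue.add(pts);
    }
  }
}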
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
  return new AttributeFactory() {
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      return (AttributeImpl) source.addAttribute(attClass);
    }
  };
}
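A small hedged demo of what the delegation buys: every attribute created through the wrapper is obtained by calling addAttribute on the delegate, so both sources share a single instance. This assumes Lucene's AttributeSource(AttributeFactory) constructor; depending on the Lucene version, AttributeFactory lives in org.apache.lucene.util or is the nested AttributeSource.AttributeFactory.

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

// Hypothetical demo method (not in the original class): shows that the
// wrapper and the delegate end up sharing one attribute instance.
static void demoSharedAttributes() {
  AttributeSource delegate = new AttributeSource();
  AttributeSource wrapper = new AttributeSource(delegatingAttributeFactory(delegate));
  CharTermAttribute fromWrapper = wrapper.addAttribute(CharTermAttribute.class);
  CharTermAttribute fromDelegate = delegate.getAttribute(CharTermAttribute.class);
  System.out.println(fromWrapper == fromDelegate); // true: same AttributeImpl
}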
protected NumericTokenizer(NumericTokenStream numericTokenStream, char[] buffer, Object extra) throws IOException {
  super(delegatingAttributeFactory(numericTokenStream));
  this.numericTokenStream = numericTokenStream;
  // Add attributes from the numeric token stream; this works because the
  // attribute factory delegates to numericTokenStream
  for (Iterator<Class<? extends Attribute>> it = numericTokenStream.getAttributeClassesIterator(); it.hasNext();) {
    addAttribute(it.next());
  }
  this.extra = extra;
  this.buffer = buffer;
  started = true;
}
/**
 * Extracts the remaining (non-core) attributes from the stream as a flat map,
 * keyed by the attribute key reported by reflection.
 *
 * @param stream current TokenStream
 * @param includeAttributes attribute keys to keep; null or empty keeps all
 * @return a map of attribute key to value
 */
private static Map<String, Object> extractExtendedAttributes(TokenStream stream, final Set<String> includeAttributes) {
  final Map<String, Object> extendedAttributes = new TreeMap<>();
  stream.reflectWith(new AttributeReflector() {
    @Override
    public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
      if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
      if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;
      if (OffsetAttribute.class.isAssignableFrom(attClass)) return;
      if (TypeAttribute.class.isAssignableFrom(attClass)) return;
      if (includeAttributes == null || includeAttributes.isEmpty()
          || includeAttributes.contains(key.toLowerCase(Locale.ROOT))) {
        if (value instanceof BytesRef) {
          final BytesRef p = (BytesRef) value;
          value = p.toString();
        }
        extendedAttributes.put(key, value);
      }
    }
  });
  return extendedAttributes;
}
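A minimal driver for this helper, sketched under the assumption of a StandardAnalyzer and an arbitrary field name (Analyzer, TokenStream, and StandardAnalyzer imports from org.apache.lucene.analysis assumed); the stream must be reset() before the first incrementToken() and end()-ed afterwards.

// Hypothetical usage sketch, not from the original source.
static void dumpExtendedAttributes(String text) throws IOException {
  try (Analyzer analyzer = new StandardAnalyzer();
       TokenStream stream = analyzer.tokenStream("body", text)) {
    stream.reset();
    while (stream.incrementToken()) {
      // Prints the non-core attributes of each token, keyed by attribute key.
      System.out.println(extractExtendedAttributes(stream, null));
    }
    stream.end();
  }
}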
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  if (first) {
    String[] words = walkTokens();
    if (words.length == 0) {
      return false;
    }
    createTags(words);
    first = false;
    indexToken = 0;
  }
  if (indexToken == tokenAttrs.size()) {
    return false;
  }
  AttributeSource as = tokenAttrs.get(indexToken);
  Iterator<? extends Class<? extends Attribute>> it = as.getAttributeClassesIterator();
  while (it.hasNext()) {
    Class<? extends Attribute> attrClass = it.next();
    if (!hasAttribute(attrClass)) {
      addAttribute(attrClass);
    }
  }
  as.copyTo(this);
  indexToken++;
  return true;
}
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
  Double label = row.getDouble(1);
  StringReader document = new StringReader(row.getString(0).replaceAll("br2n", ""));
  List<String> wordsList = new ArrayList<>();
  try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
    TokenStream stream = analyzer.tokenStream("words", document);
    TokenFilter lowerFilter = new LowerCaseFilter(stream);
    TokenFilter numbers = new NumberFilter(lowerFilter);
    TokenFilter length = new LengthFilter(numbers, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 3);
    try (TokenFilter filter = ngrams) {
      CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
      filter.reset();
      while (filter.incrementToken()) {
        String word = termAtt.toString().replace(",", "(comma)").replaceAll("\n|\r", "");
        if (word.contains("_")) {
          continue;
        }
        wordsList.add(word);
      }
    }
  }
  Multiset<String> words = ConcurrentHashMultiset.create(wordsList);
  return new Tuple2<>(label, words);
}
public static void main(String[] args) throws IOException {
  System.out.println(NumberUtils.isDigits("12345"));
  System.out.println(NumberUtils.isDigits("12345.1"));
  System.out.println(NumberUtils.isDigits("12345,2"));
  System.out.println(NumberUtils.isNumber("12345"));
  System.out.println(NumberUtils.isNumber("12345.1"));
  System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
  System.out.println(NumberUtils.isNumber("12345,2"));
  StringReader input = new StringReader(
      "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
          .replaceAll("br2n", ""));
  LetterTokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(input);
  TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
  TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
  TokenFilter stemmer = new BulgarianStemFilter(length);
  TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);
  try (TokenFilter filter = ngrams) {
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      String word = termAtt.toString().replaceAll(",", "\\.").replaceAll("\n|\r", "");
      System.out.println(word);
    }
  }
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  if (attClass == TermToBytesRefAttribute.class) {
    return new MyTermAttributeImpl();
  }
  if (CharTermAttribute.class.isAssignableFrom(attClass)) {
    throw new IllegalArgumentException("no");
  }
  return delegate.createAttributeInstance(attClass);
}
@Override
public boolean incrementToken() {
  if (tokenIterator.hasNext()) {
    clearAttributes();
    AttributeSource next = tokenIterator.next();
    Iterator<Class<? extends Attribute>> atts = next.getAttributeClassesIterator();
    while (atts.hasNext()) {
      // make sure all attribute impls in the token exist here
      addAttribute(atts.next());
    }
    next.copyTo(this);
    return true;
  } else {
    return false;
  }
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  return attClass.isAssignableFrom(ICUCollatedTermAttributeImpl.class)
      ? new ICUCollatedTermAttributeImpl(collator)
      : delegate.createAttributeInstance(attClass);
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  return attClass.isAssignableFrom(CollatedTermAttributeImpl.class)
      ? new CollatedTermAttributeImpl(collator)
      : delegate.createAttributeInstance(attClass);
}
private <A extends Attribute> A getAttrIfExists(Class<A> att) {
  if (hasAttribute(att)) {
    return getAttribute(att);
  } else {
    return null;
  }
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  return attClass.isAssignableFrom(MockUTF16TermAttributeImpl.class)
      ? new MockUTF16TermAttributeImpl()
      : delegate.createAttributeInstance(attClass);
}
@Override
public boolean incrementToken() throws IOException {
  //clearAttributes();
  if (first) {
    // gather all tokens from doc
    String[] words = walkTokens();
    if (words.length == 0) {
      return false;
    }
    // tagging
    posTags = createTags(words);
    first = false;
    tokenIdx = 0;
  }
  if (tokenIdx == tokenAttrs.size()) {
    resetParams();
    return false;
  }
  AttributeSource as = tokenAttrs.get(tokenIdx);
  Iterator<? extends Class<? extends Attribute>> it = as.getAttributeClassesIterator();
  while (it.hasNext()) {
    Class<? extends Attribute> attrClass = it.next();
    if (!hasAttribute(attrClass)) {
      addAttribute(attrClass);
    }
  }
  as.copyTo(this);
  MWEMetadata metadata = exitingPayload.getPayload() == null
      ? new MWEMetadata()
      : MWEMetadata.deserialize(exitingPayload.getPayload().utf8ToString());
  metadata.addMetaData(MWEMetadataType.POS, posTags[tokenIdx]);
  exitingPayload.setPayload(new BytesRef(MWEMetadata.serialize(metadata)));
  tokenIdx++;
  return true;
}
/**
 * Create a new CachingTokenStream around <code>input</code>,
 * caching its token attributes, which can be replayed again
 * after a call to {@link #reset()}.
 */
public CachingTokenStream(TokenStream input) {
  this.input = input;
  Iterator<Class<? extends Attribute>> attrIter = this.input.getAttributeClassesIterator();
  while (attrIter.hasNext()) {
    addAttribute(attrIter.next());
  }
}
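The replay side of such a class is not shown here; a plausible incrementToken(), modeled on Lucene's CachingTokenFilter, is sketched below. The cache and iterator fields are assumptions. Registering the delegate's attribute classes in the constructor above is what allows restoreState() to copy cached values into this stream's own attribute instances.

// Hypothetical sketch, not the original implementation.
private List<AttributeSource.State> cache;
private Iterator<AttributeSource.State> iterator;

@Override
public boolean incrementToken() throws IOException {
  if (cache == null) {
    // First pass: exhaust the delegate and capture each token's state.
    cache = new ArrayList<>();
    while (input.incrementToken()) {
      cache.add(input.captureState());
    }
    input.end();
    iterator = cache.iterator();
  }
  if (!iterator.hasNext()) {
    return false; // all cached tokens have been replayed
  }
  restoreState(iterator.next()); // copy cached values into our attribute impls
  return true;
}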
/**
 * Extracts the remaining (non-core) attributes from the stream,
 * grouped by attribute class name.
 *
 * @param stream current TokenStream
 * @param includeAttributes attribute names to keep; null or empty keeps all
 * @param shortAttrName if true, key the result by the attribute's simple class name
 * @return nested map: attribute class name to a map of key/value pairs
 */
private Map<String, Map<String, Object>> extractExtendedAttributes(TokenStream stream,
    final Set<String> includeAttributes, final boolean shortAttrName) {
  final Map<String, Map<String, Object>> extendedAttributes = new TreeMap<>();
  stream.reflectWith(new AttributeReflector() {
    @Override
    public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
      if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
      if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;
      if (OffsetAttribute.class.isAssignableFrom(attClass)) return;
      if (TypeAttribute.class.isAssignableFrom(attClass)) return;
      if (includeAttributes == null || includeAttributes.isEmpty()
          || includeAttributes.contains(attClass.getSimpleName().toLowerCase(Locale.ROOT))) {
        // Look up and store under the same key, so repeated reflect() calls for
        // one attribute class accumulate into a single map.
        final String mapKey = shortAttrName
            ? attClass.getName().substring(attClass.getName().lastIndexOf('.') + 1)
            : attClass.getName();
        Map<String, Object> currentAttributes = extendedAttributes.get(mapKey);
        if (currentAttributes == null) {
          currentAttributes = new HashMap<>();
          extendedAttributes.put(mapKey, currentAttributes);
        }
        if (value instanceof BytesRef) {
          final BytesRef p = (BytesRef) value;
          value = p.toString();
        }
        currentAttributes.put(key, value);
      }
    }
  });
  return extendedAttributes;
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  if (CharTermAttribute.class.isAssignableFrom(attClass)) {
    throw new IllegalArgumentException("NumericTokenStream does not support CharTermAttribute.");
  }
  return delegate.createAttributeInstance(attClass);
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
  return attClass.isAssignableFrom(Token.class)
      ? new Token()
      : delegate.createAttributeInstance(attClass);
}
@Override
public String toFormattedString(Field f) throws IOException {
  Map<String, Object> map = new LinkedHashMap<>();
  map.put(VERSION_KEY, VERSION);
  if (f.fieldType().stored()) {
    String stringValue = f.stringValue();
    if (stringValue != null) {
      map.put(STRING_KEY, stringValue);
    }
    BytesRef binaryValue = f.binaryValue();
    if (binaryValue != null) {
      map.put(BINARY_KEY,
          Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    List<Map<String, Object>> tokens = new LinkedList<>();
    while (ts.incrementToken()) {
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      Map<String, Object> tok = new TreeMap<>();
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute) att;
          cTerm = new String(catt.buffer(), 0, catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
          tTerm = tatt.getBytesRef().utf8ToString();
        } else {
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
            tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute) att).getPayload();
            if (p != null && p.length > 0) {
              tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.put(TYPE_KEY, ((TypeAttribute) att).type());
          } else {
            tok.put(cl.getName(), att.toString());
          }
        }
      }
      String term = (cTerm != null) ? cTerm : tTerm;
      if (term != null && term.length() > 0) {
        tok.put(TOKEN_KEY, term);
      }
      tokens.add(tok);
    }
    map.put(TOKENS_KEY, tokens);
  }
  return JSONUtil.toJSON(map, -1);
}
@Override
public String toFormattedString(Field f) throws IOException {
  Map<String, Object> map = new LinkedHashMap<String, Object>();
  map.put(VERSION_KEY, VERSION);
  if (f.fieldType().stored()) {
    String stringValue = f.stringValue();
    if (stringValue != null) {
      map.put(STRING_KEY, stringValue);
    }
    BytesRef binaryValue = f.binaryValue();
    if (binaryValue != null) {
      map.put(BINARY_KEY,
          Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    List<Map<String, Object>> tokens = new LinkedList<Map<String, Object>>();
    while (ts.incrementToken()) {
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      Map<String, Object> tok = new TreeMap<String, Object>();
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        if (!ts.hasAttribute(cl)) {
          continue;
        }
        Attribute att = ts.getAttribute(cl);
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute) att;
          cTerm = new String(catt.buffer(), 0, catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
          tTerm = tatt.getBytesRef().utf8ToString();
        } else {
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
            tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute) att).getPayload();
            if (p != null && p.length > 0) {
              tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.put(TYPE_KEY, ((TypeAttribute) att).type());
          } else {
            tok.put(cl.getName(), att.toString());
          }
        }
      }
      String term = (cTerm != null) ? cTerm : tTerm;
      if (term != null && term.length() > 0) {
        tok.put(TOKEN_KEY, term);
      }
      tokens.add(tok);
    }
    map.put(TOKENS_KEY, tokens);
  }
  return JSONUtil.toJSON(map, -1);
}