/** initialize levenshtein DFAs up to maxDistance, if possible */ private List<CompiledAutomaton> initAutomata(int maxDistance) { final List<CompiledAutomaton> runAutomata = dfaAtt.automata(); //System.out.println("cached automata size: " + runAutomata.size()); if (runAutomata.size() <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { LevenshteinAutomata builder = new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions); String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength); for (int i = runAutomata.size(); i <= maxDistance; i++) { Automaton a = builder.toAutomaton(i, prefix); //System.out.println("compute automaton n=" + i); runAutomata.add(new CompiledAutomaton(a, true, false)); } } return runAutomata; }
@Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { final List<MultiTermsEnum.TermsEnumIndex> termsEnums = new ArrayList<>(); for(int i=0;i<subs.length;i++) { final TermsEnum termsEnum = subs[i].intersect(compiled, startTerm); if (termsEnum != null) { termsEnums.add(new MultiTermsEnum.TermsEnumIndex(termsEnum, i)); } } if (termsEnums.size() > 0) { return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); } else { return TermsEnum.EMPTY; } }
/** Returns a TermsEnum that iterates over all terms that * are accepted by the provided {@link * CompiledAutomaton}. If the <code>startTerm</code> is * provided then the returned enum will only accept terms * > <code>startTerm</code>, but you still must call * next() first to get to the first term. Note that the * provided <code>startTerm</code> must be accepted by * the automaton. * * <p><b>NOTE</b>: the returned TermsEnum cannot * seek</p>. */ public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException { // TODO: eventually we could support seekCeil/Exact on // the returned enum, instead of only being able to seek // at the start if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); } if (startTerm == null) { return new AutomatonTermsEnum(iterator(null), compiled); } else { return new AutomatonTermsEnum(iterator(null), compiled) { @Override protected BytesRef nextSeekTerm(BytesRef term) throws IOException { if (term == null) { term = startTerm; } return super.nextSeekTerm(term); } }; } }
/** initialize levenshtein DFAs up to maxDistance, if possible */ private List<CompiledAutomaton> initAutomata(int maxDistance) { final List<CompiledAutomaton> runAutomata = dfaAtt.automata(); //System.out.println("cached automata size: " + runAutomata.size()); if (runAutomata.size() <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { LevenshteinAutomata builder = new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions); for (int i = runAutomata.size(); i <= maxDistance; i++) { Automaton a = builder.toAutomaton(i); //System.out.println("compute automaton n=" + i); // constant prefix if (realPrefixLength > 0) { Automaton prefix = BasicAutomata.makeString( UnicodeUtil.newString(termText, 0, realPrefixLength)); a = BasicOperations.concatenate(prefix, a); } runAutomata.add(new CompiledAutomaton(a, true, false)); } } return runAutomata; }
@Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { final List<MultiTermsEnum.TermsEnumIndex> termsEnums = new ArrayList<MultiTermsEnum.TermsEnumIndex>(); for(int i=0;i<subs.length;i++) { final TermsEnum termsEnum = subs[i].intersect(compiled, startTerm); if (termsEnum != null) { termsEnums.add(new MultiTermsEnum.TermsEnumIndex(termsEnum, i)); } } if (termsEnums.size() > 0) { return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); } else { return TermsEnum.EMPTY; } }
@Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); } return new IntersectTermsEnum(this, compiled, startTerm); }
/** * return an automata-based enum for matching up to editDistance from * lastTerm, if possible */ protected TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException { final List<CompiledAutomaton> runAutomata = initAutomata(editDistance); if (editDistance < runAutomata.size()) { //System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString())); final CompiledAutomaton compiled = runAutomata.get(editDistance); return new AutomatonFuzzyTermsEnum(terms.intersect(compiled, lastTerm == null ? null : compiled.floor(lastTerm, new BytesRefBuilder())), runAutomata.subList(0, editDistance + 1).toArray(new CompiledAutomaton[editDistance + 1])); } else { return null; } }
public AutomatonFuzzyTermsEnum(TermsEnum tenum, CompiledAutomaton compiled[]) { super(tenum, false); this.matchers = new ByteRunAutomaton[compiled.length]; for (int i = 0; i < compiled.length; i++) this.matchers[i] = compiled[i].runAutomaton; termRef = new BytesRef(term.text()); }
@Override public void copyTo(AttributeImpl target) { final List<CompiledAutomaton> targetAutomata = ((LevenshteinAutomataAttribute) target).automata(); targetAutomata.clear(); targetAutomata.addAll(automata); }
/** * Construct an enumerator based upon an automaton, enumerating the specified * field, working on a supplied TermsEnum * <p> * @lucene.experimental * <p> * @param compiled CompiledAutomaton */ public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) { super(tenum); this.finite = compiled.finite; this.runAutomaton = compiled.runAutomaton; assert this.runAutomaton != null; this.commonSuffixRef = compiled.commonSuffixRef; this.automaton = compiled.automaton; // used for path tracking, where each bit is a numbered state. visited = new long[runAutomaton.getSize()]; termComp = getComparator(); }
private TermsEnum wildcardEnumeration(final IndexReader reader) throws IOException { Terms terms = MultiFields.getTerms(reader, term.field()); if(terms == null){ return null; } return new AutomatonTermsEnum( terms.iterator(), new CompiledAutomaton( WildcardQuery.toAutomaton(term), false, false)); }
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { super(); //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); this.fst = dict; this.fstReader = fst.getBytesReader(); this.fstOutputs = dict.outputs; this.fsa = compiled.runAutomaton; this.level = -1; this.stack = new Frame[16]; for (int i = 0 ; i < stack.length; i++) { this.stack[i] = new Frame(); } Frame frame; frame = loadVirtualFrame(newFrame()); this.level++; frame = loadFirstFrame(newFrame()); pushFrame(frame); this.meta = null; this.metaUpto = 1; this.decoded = false; this.pending = false; if (startTerm == null) { pending = isAccept(topFrame()); } else { doSeekCeil(startTerm); pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame()); } }
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); this.fst = index; this.fstReader = fst.getBytesReader(); this.fstOutputs = index.outputs; this.fsa = compiled.runAutomaton; this.level = -1; this.stack = new Frame[16]; for (int i = 0 ; i < stack.length; i++) { this.stack[i] = new Frame(); } Frame frame; frame = loadVirtualFrame(newFrame()); this.level++; frame = loadFirstFrame(newFrame()); pushFrame(frame); this.decoded = false; this.pending = false; if (startTerm == null) { pending = isAccept(topFrame()); } else { doSeekCeil(startTerm); pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame()); } }
@Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); } return new OrdsIntersectTermsEnum(this, compiled, startTerm); }
@Override public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException { TermsEnum termsEnum = in.intersect(automaton, bytes); assert termsEnum != null; assert bytes == null || bytes.isValid(); return new AssertingTermsEnum(termsEnum); }
/** * Terms api equivalency */ public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException { if (leftTerms == null || rightTerms == null) { assertNull(info, leftTerms); assertNull(info, rightTerms); return; } assertTermsStatisticsEquals(info, leftTerms, rightTerms); assertEquals(leftTerms.hasOffsets(), rightTerms.hasOffsets()); assertEquals(leftTerms.hasPositions(), rightTerms.hasPositions()); assertEquals(leftTerms.hasPayloads(), rightTerms.hasPayloads()); TermsEnum leftTermsEnum = leftTerms.iterator(null); TermsEnum rightTermsEnum = rightTerms.iterator(null); assertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true); assertTermsSeekingEquals(info, leftTerms, rightTerms); if (deep) { int numIntersections = atLeast(3); for (int i = 0; i < numIntersections; i++) { String re = AutomatonTestUtil.randomRegexp(random()); CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton()); if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.intersect(automaton, null); TermsEnum rightIntersection = rightTerms.intersect(automaton, null); assertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, rarely()); } } } }
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception { if (leftTerms == null || rightTerms == null) { assertNull(leftTerms); assertNull(rightTerms); return; } assertTermsStatistics(leftTerms, rightTerms); // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different TermsEnum leftTermsEnum = leftTerms.iterator(null); TermsEnum rightTermsEnum = rightTerms.iterator(null); assertTermsEnum(leftTermsEnum, rightTermsEnum, true); assertTermsSeeking(leftTerms, rightTerms); if (deep) { int numIntersections = atLeast(3); for (int i = 0; i < numIntersections; i++) { String re = AutomatonTestUtil.randomRegexp(random()); CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton()); if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.intersect(automaton, null); TermsEnum rightIntersection = rightTerms.intersect(automaton, null); assertTermsEnum(leftIntersection, rightIntersection, rarely()); } } } }
private boolean accepts(CompiledAutomaton c, BytesRef b) { int state = c.runAutomaton.getInitialState(); for(int idx=0;idx<b.length;idx++) { assertTrue(state != -1); state = c.runAutomaton.step(state, b.bytes[b.offset+idx] & 0xff); } return c.runAutomaton.isAccept(state); }
@Override public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException { TermsEnum termsEnum = super.intersect(automaton, bytes); assert termsEnum != null; assert bytes == null || bytes.isValid(); return new AssertingTermsEnum(termsEnum); }