@Override public IntsRef subtract(IntsRef output, IntsRef inc) { assert output != null; assert inc != null; if (inc == NO_OUTPUT) { // no prefix removed return output; } else if (inc.length == output.length) { // entire output removed return NO_OUTPUT; } else { assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; assert inc.length > 0; return new IntsRef(output.ints, output.offset + inc.length, output.length-inc.length); } }
@Override public IntsRef add(IntsRef prefix, IntsRef output) { assert prefix != null; assert output != null; if (prefix == NO_OUTPUT) { return output; } else if (output == NO_OUTPUT) { return prefix; } else { assert prefix.length > 0; assert output.length > 0; IntsRef result = new IntsRef(prefix.length + output.length); System.arraycopy(prefix.ints, prefix.offset, result.ints, 0, prefix.length); System.arraycopy(output.ints, output.offset, result.ints, prefix.length, output.length); result.length = prefix.length + output.length; return result; } }
/** Looks up the output for this input, or null if the * input is not accepted. */ public static<T> T get(FST<T> fst, IntsRef input) throws IOException { // TODO: would be nice not to alloc this on every lookup final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); final BytesReader fstReader = fst.getBytesReader(); // Accumulate output as we go T output = fst.outputs.getNoOutput(); for(int i=0;i<input.length;i++) { if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) { return null; } output = fst.outputs.add(output, arc.output); } if (arc.isFinal()) { return fst.outputs.add(output, arc.nextFinalOutput); } else { return null; } }
private void testRandomWords(int maxNumWords, int numIter) throws IOException { Random random = new Random(random().nextLong()); for(int iter=0;iter<numIter;iter++) { if (VERBOSE) { System.out.println("\nTEST: iter " + iter); } for(int inputMode=0;inputMode<2;inputMode++) { final int numWords = random.nextInt(maxNumWords+1); Set<IntsRef> termsSet = new HashSet<>(); IntsRef[] terms = new IntsRef[numWords]; while(termsSet.size() < numWords) { final String term = getRandomString(random); termsSet.add(toIntsRef(term, inputMode)); } doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); } } }
private final void count(List<MatchingDocs> matchingDocs) throws IOException { IntsRef scratch = new IntsRef(); for(MatchingDocs hits : matchingDocs) { OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context); DocIdSetIterator docs = hits.bits.iterator(); int doc; while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { ords.get(doc, scratch); for(int i=0;i<scratch.length;i++) { values[scratch.ints[scratch.offset+i]]++; } } } rollup(); }
@Override public OrdinalsSegmentReader getReader(AtomicReaderContext context) throws IOException { BinaryDocValues values0 = context.reader().getBinaryDocValues(field); if (values0 == null) { values0 = DocValues.emptyBinary(); } final BinaryDocValues values = values0; return new OrdinalsSegmentReader() { @Override public void get(int docID, IntsRef ordinals) throws IOException { final BytesRef bytes = values.get(docID); decode(bytes, ordinals); } }; }
public void testSimpleDictionary() throws Exception { InputStream affixStream = getClass().getResourceAsStream("simple.aff"); InputStream dictStream = getClass().getResourceAsStream("simple.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3); assertNotNull(ordList); assertEquals(1, ordList.length); BytesRef ref = new BytesRef(); dictionary.flagLookup.get(ordList.ints[0], ref); char flags[] = Dictionary.decodeFlags(ref); assertEquals(1, flags.length); ordList = dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5); assertNotNull(ordList); assertEquals(1, ordList.length); dictionary.flagLookup.get(ordList.ints[0], ref); flags = Dictionary.decodeFlags(ref); assertEquals(1, flags.length); affixStream.close(); dictStream.close(); }
public void testCompressedDictionary() throws Exception { InputStream affixStream = getClass().getResourceAsStream("compressed.aff"); InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3); BytesRef ref = new BytesRef(); dictionary.flagLookup.get(ordList.ints[0], ref); char flags[] = Dictionary.decodeFlags(ref); assertEquals(1, flags.length); affixStream.close(); dictStream.close(); }
public void testCompressedBeforeSetDictionary() throws Exception { InputStream affixStream = getClass().getResourceAsStream("compressed-before-set.aff"); InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3); BytesRef ref = new BytesRef(); dictionary.flagLookup.get(ordList.ints[0], ref); char flags[] = Dictionary.decodeFlags(ref); assertEquals(1, flags.length); affixStream.close(); dictStream.close(); }
public void testCompressedEmptyAliasDictionary() throws Exception { InputStream affixStream = getClass().getResourceAsStream("compressed-empty-alias.aff"); InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3); BytesRef ref = new BytesRef(); dictionary.flagLookup.get(ordList.ints[0], ref); char flags[] = Dictionary.decodeFlags(ref); assertEquals(1, flags.length); affixStream.close(); dictStream.close(); }
/** Adds all leaving arcs, including 'finished' arc, if * the node is final, from this node into the queue. */ public void addStartPaths(FST.Arc<T> node, T startOutput, boolean allowEmptyString, IntsRef input) throws IOException { // De-dup NO_OUTPUT since it must be a singleton: if (startOutput.equals(fst.outputs.getNoOutput())) { startOutput = fst.outputs.getNoOutput(); } FSTPath<T> path = new FSTPath<T>(startOutput, node, input); fst.readFirstTargetArc(node, path.arc, bytesReader); //System.out.println("add start paths"); // Bootstrap: find the min starting arc while (true) { if (allowEmptyString || path.arc.label != FST.END_LABEL) { addIfCompetitive(path); } if (path.arc.isLast()) { break; } fst.readNextArc(path.arc, bytesReader); } }
private void testRandomWords(int maxNumWords, int numIter) throws IOException { Random random = new Random(random().nextLong()); for(int iter=0;iter<numIter;iter++) { if (VERBOSE) { System.out.println("\nTEST: iter " + iter); } for(int inputMode=0;inputMode<2;inputMode++) { final int numWords = random.nextInt(maxNumWords+1); Set<IntsRef> termsSet = new HashSet<IntsRef>(); IntsRef[] terms = new IntsRef[numWords]; while(termsSet.size() < numWords) { final String term = getRandomString(random); termsSet.add(toIntsRef(term, inputMode)); } doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); } } }
/** * Builds the final automaton from a list of entries. */ private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException { // Build the automaton. final Outputs<Object> outputs = NoOutputs.getSingleton(); final Object empty = outputs.getNoOutput(); final Builder<Object> builder = new Builder<Object>( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInts.DEFAULT, true, 15); BytesRef scratch = new BytesRef(); BytesRef entry; final IntsRef scratchIntsRef = new IntsRef(); int count = 0; BytesRefIterator iter = sorter.iterator(); while((entry = iter.next()) != null) { count++; if (scratch.compareTo(entry) != 0) { builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); } } return count == 0 ? null : builder.finish(); }
@Override public void encode(IntsRef values, BytesRef buf) { buf.offset = buf.length = 0; int upto = values.offset + values.length; for (int i = values.offset; i < upto; i++) { int value = values.ints[i]; if (value == 1) { indicator |= ENCODE_TABLE[ordinal]; } else { encodeQueue.ints[encodeQueue.length++] = value - 2; } ++ordinal; // encode the chunk and the indicator if (ordinal == 8) { encodeChunk(buf); } } // encode remaining values if (ordinal != 0) { encodeChunk(buf); } }
@BeforeClass public static void beforeClassEncodingTest() throws Exception { int capacity = atLeast(10000); data = new IntsRef(capacity); for (int i = 0; i < 10; i++) { data.ints[i] = i + 1; // small values } for (int i = 10; i < data.ints.length; i++) { data.ints[i] = random().nextInt(Integer.MAX_VALUE - 1) + 1; // some encoders don't allow 0 } data.length = data.ints.length; uniqueSortedData = IntsRef.deepCopyOf(data); Arrays.sort(uniqueSortedData.ints); uniqueSortedData.length = 0; int prev = -1; for (int i = 0; i < uniqueSortedData.ints.length; i++) { if (uniqueSortedData.ints[i] != prev) { uniqueSortedData.ints[uniqueSortedData.length++] = uniqueSortedData.ints[i]; prev = uniqueSortedData.ints[i]; } } }
private static void encoderTest(IntEncoder encoder, IntsRef data, IntsRef expected) throws IOException { // ensure toString is implemented String toString = encoder.toString(); assertFalse(toString.startsWith(encoder.getClass().getName() + "@")); IntDecoder decoder = encoder.createMatchingDecoder(); toString = decoder.toString(); assertFalse(toString.startsWith(decoder.getClass().getName() + "@")); BytesRef bytes = new BytesRef(100); // some initial capacity - encoders should grow the byte[] IntsRef values = new IntsRef(100); // some initial capacity - decoders should grow the int[] for (int i = 0; i < 2; i++) { // run 2 iterations to catch encoders/decoders which don't reset properly encoding(encoder, data, bytes); decoding(bytes, values, encoder.createMatchingDecoder()); assertTrue(expected.intsEquals(values)); } }
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRef scratch = new IntsRef(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); }
public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final boolean willRewrite = random().nextBoolean(); final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite, PackedInts.DEFAULT, true, 15); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); final FST<Long> fst = builder.finish(); StringWriter w = new StringWriter(); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); Util.toDot(fst, w, false, false); w.close(); //System.out.println(w.toString()); // check for accept state at label t assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1); // check for accept state at label n assertTrue(w.toString().indexOf("[label=\"n\" style=\"bold\"") != -1); }
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException { meta.writeVInt(field.number); meta.writeByte(FST); meta.writeLong(data.getFilePointer()); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); IntsRef scratch = new IntsRef(); long ord = 0; for (BytesRef v : values) { builder.add(Util.toIntsRef(v, scratch), ord); ord++; } FST<Long> fst = builder.finish(); if (fst != null) { fst.save(data); } meta.writeVLong(ord); }
/** * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @throws IOException if an {@link IOException} occurs; */ public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>( FST.INPUT_TYPE.BYTE4, outputs); final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRef intsSpare = new IntsRef(); final int size = hash.size(); for (int i = 0; i < size; i++) { int id = sort[i]; BytesRef bytesRef = hash.get(id, spare); UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare); builder.add(intsSpare, new BytesRef(outputValues.get(id))); } return new StemmerOverrideMap(builder.finish(), ignoreCase); }
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); Automaton automaton; try (TokenStream ts = stream) { automaton = toAutomaton(ts, ts2a); } LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); Set<IntsRef> set = new HashSet<>(); for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) { set.add(IntsRef.deepCopyOf(string)); } return Collections.unmodifiableSet(set); }
@Override public void seekExact(long ord) throws IOException { // TODO: would be better to make this simpler and faster. // but we dont want to introduce a bug that corrupts our enum state! bytesReader.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); BytesRefBuilder scratchBytes = new BytesRefBuilder(); scratchBytes.clear(); Util.toBytesRef(output, scratchBytes); // TODO: we could do this lazily, better to try to push into FSTEnum though? in.seekExact(scratchBytes.get()); }
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) { List<CharsRef> stems = new ArrayList<>(); IntsRef forms = dictionary.lookupWord(word, 0, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { boolean checkKeepCase = caseVariant && dictionary.keepcase != -1; boolean checkNeedAffix = dictionary.needaffix != -1; boolean checkOnlyInCompound = dictionary.onlyincompound != -1; if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) { dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch); char wordFlags[] = Dictionary.decodeFlags(scratch); // we are looking for a case variant, but this word does not allow it if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) { continue; } // we can't add this form, its a pseudostem requiring an affix if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) { continue; } // we can't add this form, it only belongs inside a compound word if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) { continue; } } stems.add(newStem(word, length, forms, i)); } } try { boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant)); } catch (IOException bogus) { throw new RuntimeException(bogus); } return stems; }
private CharsRef newStem(char buffer[], int length, IntsRef forms, int formID) { final String exception; if (dictionary.hasStemExceptions) { int exceptionID = forms.ints[forms.offset + formID + 1]; if (exceptionID > 0) { exception = dictionary.getStemException(exceptionID); } else { exception = null; } } else { exception = null; } if (dictionary.needsOutputCleaning) { scratchSegment.setLength(0); if (exception != null) { scratchSegment.append(exception); } else { scratchSegment.append(buffer, 0, length); } try { Dictionary.applyMappings(dictionary.oconv, scratchSegment); } catch (IOException bogus) { throw new RuntimeException(bogus); } char cleaned[] = new char[scratchSegment.length()]; scratchSegment.getChars(0, cleaned.length, cleaned, 0); return new CharsRef(cleaned, 0, cleaned.length); } else { if (exception != null) { return new CharsRef(exception); } else { return new CharsRef(buffer, 0, length); } } }
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) { if (fst == null) { return null; } final FST.BytesReader bytesReader = fst.getBytesReader(); final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>()); // Accumulate output as we go final IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); IntsRef output = NO_OUTPUT; int l = offset + length; try { for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { cp = Character.codePointAt(word, i, l); if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } } catch (IOException bogus) { throw new RuntimeException(bogus); } }
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Integer> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Integer c : entries) { output.ints[output.length++] = c; } builder.add(scratch.get(), output); } return builder.finish(); }
/** * Returns true if the given string (expressed as unicode codepoints) is accepted by the automaton. The input must be deterministic. * <p> * Complexity: linear in the length of the string. * <p> * <b>Note:</b> for full performance, use the {@link RunAutomaton} class. */ public static boolean run(Automaton a, IntsRef s) { assert a.isDeterministic(); int state = 0; for (int i=0;i<s.length;i++) { int nextState = a.step(state, s.ints[s.offset+i]); if (nextState == -1) { return false; } state = nextState; } return a.isAccept(state); }
@Override public IntsRef common(IntsRef output1, IntsRef output2) { assert output1 != null; assert output2 != null; int pos1 = output1.offset; int pos2 = output2.offset; int stopAt1 = pos1 + Math.min(output1.length, output2.length); while(pos1 < stopAt1) { if (output1.ints[pos1] != output2.ints[pos2]) { break; } pos1++; pos2++; } if (pos1 == output1.offset) { // no common prefix return NO_OUTPUT; } else if (pos1 == output1.offset + output1.length) { // output1 is a prefix of output2 return output1; } else if (pos2 == output2.offset + output2.length) { // output2 is a prefix of output1 return output2; } else { return new IntsRef(output1.ints, output1.offset, pos1-output1.offset); } }
@Override public void write(IntsRef prefix, DataOutput out) throws IOException { assert prefix != null; out.writeVInt(prefix.length); for(int idx=0;idx<prefix.length;idx++) { out.writeVInt(prefix.ints[prefix.offset+idx]); } }
@Override public IntsRef read(DataInput in) throws IOException { final int len = in.readVInt(); if (len == 0) { return NO_OUTPUT; } else { final IntsRef output = new IntsRef(len); for(int idx=0;idx<len;idx++) { output.ints[idx] = in.readVInt(); } output.length = len; return output; } }
/** Just maps each UTF16 unit (char) to the ints in an * IntsRef. */ public static IntsRef toUTF16(CharSequence s, IntsRefBuilder scratch) { final int charLimit = s.length(); scratch.setLength(charLimit); scratch.grow(charLimit); for (int idx = 0; idx < charLimit; idx++) { scratch.setIntAt(idx, (int) s.charAt(idx)); } return scratch.get(); }