public void testMaxPosition3WithSynonyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"),
                                                MockTokenizer.WHITESPACE, false);
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of the multi-word synonym "and indubitably single only",
    // is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two",
                     "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
/**
 * Sugar: just joins the provided terms with {@link SynonymMap#WORD_SEPARATOR}.
 * reuse and its chars must not be null.
 */
public static CharsRef join(String[] words, CharsRefBuilder reuse) {
  int upto = 0;
  char[] buffer = reuse.chars();
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars();
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }
    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.setLength(upto);
  return reuse.get();
}
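// Hedged usage sketch (not from the source above; "builder" stands in for a
// SynonymMap.Builder): join() packs the words into one CharsRef separated by
// the NUL char SynonymMap.WORD_SEPARATOR, which is how SynonymMap encodes
// multi-word entries.
CharsRefBuilder scratch = new CharsRefBuilder();
CharsRef joined = SynonymMap.Builder.join(new String[]{"wi", "fi"}, scratch);
// joined now holds "wi\u0000fi"; register it as a synonym of the single token:
builder.add(new CharsRef("wifi"), joined, true);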
/** only used for asserting! */
private boolean hasHoles(CharsRef chars) {
  final int end = chars.offset + chars.length;
  for (int idx = chars.offset + 1; idx < end; idx++) {
    if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx - 1] == SynonymMap.WORD_SEPARATOR) {
      return true;
    }
  }
  if (chars.chars[chars.offset] == '\u0000') {
    return true;
  }
  if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
    return true;
  }
  return false;
}
private void addInternal(CharsRef[] synset, int size) {
  if (size <= 1) {
    return; // nothing to do
  }
  if (expand) {
    for (int i = 0; i < size; i++) {
      for (int j = 0; j < size; j++) {
        add(synset[i], synset[j], false);
      }
    }
  } else {
    for (int i = 0; i < size; i++) {
      add(synset[i], synset[0], false);
    }
  }
}
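// Illustrative note (an assumption, not in the source): for a synset
// {"big", "large"} the two branches produce:
//   expand == true  -> big=>big, big=>large, large=>big, large=>large
//   expand == false -> big=>big, large=>big   (everything contracts to the first term)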
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map != null) {
    try {
      // Pre-cache root arcs:
      final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
      final FST.BytesReader fstReader = map.getBytesReader();
      map.getFirstArc(scratchArc);
      if (FST.targetHasArcs(scratchArc)) {
        map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
        while (true) {
          assert scratchArc.label != FST.END_LABEL;
          cachedRootArcs.put(Character.valueOf((char) scratchArc.label),
                             new FST.Arc<CharsRef>().copyFrom(scratchArc));
          if (scratchArc.isLast()) {
            break;
          }
          map.readNextRealArc(scratchArc, fstReader);
        }
      }
      //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
    } catch (IOException ioe) {
      // Bogus FST IOExceptions!! (will never happen)
      throw new RuntimeException(ioe);
    }
  }
}
/**
 * Builds the NormalizeCharMap; call this once you are done calling {@link #add}.
 */
public NormalizeCharMap build() {
  final FST<CharsRef> map;
  try {
    final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    final org.apache.lucene.util.fst.Builder<CharsRef> builder =
        new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (Map.Entry<String,String> ent : pendingPairs.entrySet()) {
      builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));
    }
    map = builder.finish();
    pendingPairs.clear();
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!! (will never happen)
    throw new RuntimeException(ioe);
  }
  return new NormalizeCharMap(map);
}
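// Hedged usage sketch of how this builder is typically driven (standard Lucene
// API; the mapping chosen here is just an example): register pairs, build the
// map, then wrap a Reader in MappingCharFilter so the replacements are applied
// before tokenization.
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
builder.add("\u00e6", "ae"); // fold "æ" into "ae"
NormalizeCharMap norm = builder.build();
Reader filtered = new MappingCharFilter(norm, new StringReader("\u00e6ther"));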
/**
 * Find the unique stem(s) of the provided word
 *
 * @param word Word to find the stems for
 * @param length Length of the word
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
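// Hedged usage sketch: in the hunspell module a Stemmer wraps a loaded
// Dictionary ("dictionary" below is assumed to exist); words are passed as a
// char[] plus an explicit length.
Stemmer stemmer = new Stemmer(dictionary);
char[] word = "walked".toCharArray();
// The stems returned depend entirely on the loaded dictionary, e.g. [walk]:
List<CharsRef> stems = stemmer.uniqueStems(word, word.length);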
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  Map<String,String> mappings = new TreeMap<>();
  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }
  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }
  return builder.finish();
}
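// Illustration (hunspell ICONV/OCONV affix-file syntax): the lines this method
// consumes look like the following; the header line carrying the count (num)
// has already been read by the caller.
//   ICONV 2
//   ICONV á a
//   ICONV é e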
/**
 * Add another character sequence to this automaton. The sequence must be
 * lexicographically larger or equal compared to any previous sequences added
 * to this automaton (the input must be sorted).
 */
public void add(CharsRef current) {
  assert stateRegistry != null : "Automaton already built.";
  assert previous == null || comparator.compare(previous, current) <= 0 :
      "Input must be in sorted UTF-8 order: " + previous + " >= " + current;
  assert setPrevious(current);

  // Descend in the automaton (find matching prefix).
  int pos = 0, max = current.length();
  State next, state = root;
  while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) {
    state = next;
    // todo, optimize me
    pos += Character.charCount(Character.codePointAt(current, pos));
  }

  if (state.hasChildren()) replaceOrRegister(state);

  addSuffix(state, current, pos);
}
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef}
 * representing strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  char[] chars = new char[0];
  CharsRef ref = new CharsRef();
  for (BytesRef b : input) {
    chars = ArrayUtil.grow(chars, b.length);
    final int len = UnicodeUtil.UTF8toUTF16(b, chars);
    ref.chars = chars;
    ref.length = len;
    builder.add(ref);
  }

  Automaton.Builder a = new Automaton.Builder();
  convert(a, builder.complete(), new IdentityHashMap<State,Integer>());
  return a.finish();
}
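// Hedged usage sketch: the input must already be in binary (UTF-8) order, and
// BytesRef's natural ordering is exactly that, so sorting first suffices.
List<BytesRef> terms = new ArrayList<>();
terms.add(new BytesRef("cat"));
terms.add(new BytesRef("car"));
terms.add(new BytesRef("dog"));
Collections.sort(terms); // binary order: car, cat, dog
Automaton a = DaciukMihovAutomatonBuilder.build(terms);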
@Override
public CharsRef subtract(CharsRef output, CharsRef inc) {
  assert output != null;
  assert inc != null;
  if (inc == NO_OUTPUT) {
    // no prefix removed
    return output;
  } else if (inc.length == output.length) {
    // entire output removed
    return NO_OUTPUT;
  } else {
    assert inc.length < output.length :
        "inc.length=" + inc.length + " vs output.length=" + output.length;
    assert inc.length > 0;
    return new CharsRef(output.chars, output.offset + inc.length, output.length - inc.length);
  }
}
@Override
public CharsRef add(CharsRef prefix, CharsRef output) {
  assert prefix != null;
  assert output != null;
  if (prefix == NO_OUTPUT) {
    return output;
  } else if (output == NO_OUTPUT) {
    return prefix;
  } else {
    assert prefix.length > 0;
    assert output.length > 0;
    CharsRef result = new CharsRef(prefix.length + output.length);
    System.arraycopy(prefix.chars, prefix.offset, result.chars, 0, prefix.length);
    System.arraycopy(output.chars, output.offset, result.chars, prefix.length, output.length);
    result.length = prefix.length + output.length;
    return result;
  }
}
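// Sketch of the invariant add() and subtract() maintain for FST outputs:
// subtract(add(prefix, suffix), prefix) yields the suffix again.
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
CharsRef whole = outputs.add(new CharsRef("foo"), new CharsRef("bar")); // "foobar"
CharsRef back  = outputs.subtract(whole, new CharsRef("foo"));          // "bar"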
private List<LookupResult> getLookupResults(SpellingOptions options, Token currentToken) throws IOException {
  CharsRef scratch = new CharsRef();
  scratch.chars = currentToken.buffer();
  scratch.offset = 0;
  scratch.length = currentToken.length();
  boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) &&
      !(lookup instanceof WFSTCompletionLookup) &&
      !(lookup instanceof AnalyzingSuggester);
  List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count);
  if (suggestions == null || suggestions.size() == 0) {
    return null;
  }
  return suggestions;
}
private CharsRef analyze(Analyzer analyzer, String text) throws IOException {
  CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      charsRefBuilder.grow(charsRefBuilder.length() + length + 1); /* current + word + separator */
      if (charsRefBuilder.length() > 0) {
        charsRefBuilder.append(CcWordSet.WORD_SEPARATOR);
      }
      charsRefBuilder.append(termAtt);
    }
    ts.end();
  }
  if (charsRefBuilder.length() == 0) {
    return null;
  }
  charsRefBuilder.append(CcWordSet.WORD_END);
  return charsRefBuilder.get();
}
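// Hedged usage note (CcWordSet and its WORD_SEPARATOR/WORD_END constants come
// from the surrounding codebase and are not shown here): analyzing a phrase
// such as "wi fi device" with a whitespace-style analyzer would produce one key,
//   "wi" + WORD_SEPARATOR + "fi" + WORD_SEPARATOR + "device" + WORD_END,
// which the caller can then use for lookups.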
/**
 * Sugar: just joins the provided terms with {@link SynonymMap#WORD_SEPARATOR}.
 * reuse and its chars must not be null.
 */
public static CharsRef join(String[] words, CharsRef reuse) {
  int upto = 0;
  char[] buffer = reuse.chars;
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars;
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }
    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.length = upto;
  return reuse;
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
  final TermsEnum termsEnum = vector.iterator(null);
  final CharsRef spare = new CharsRef();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    UnicodeUtil.UTF8toUTF16(text, spare);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
@Override
public void build(TermFreqIterator tfit) throws IOException {
  if (tfit.getComparator() != null) {
    // make sure it's unsorted
    // WTF - this could result in yet another sorted iteration....
    tfit = new UnsortedTermFreqIteratorWrapper(tfit);
  }
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);
  BytesRef spare;
  final CharsRef charsSpare = new CharsRef();

  while ((spare = tfit.next()) != null) {
    final long weight = tfit.weight();
    if (spare.length == 0) {
      continue;
    }
    charsSpare.grow(spare.length);
    UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
    trie.put(charsSpare.toString(), Long.valueOf(weight));
  }
}
@Override
public void build(TermFreqIterator tfit) throws IOException {
  root = new TernaryTreeNode();
  // buffer first
  if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
    // make sure it's sorted and the comparator uses UTF16 sort order
    tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
  }

  ArrayList<String> tokens = new ArrayList<String>();
  ArrayList<Number> vals = new ArrayList<Number>();
  BytesRef spare;
  CharsRef charsSpare = new CharsRef();
  while ((spare = tfit.next()) != null) {
    charsSpare.grow(spare.length);
    UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
    tokens.add(charsSpare.toString());
    vals.add(Long.valueOf(tfit.weight()));
  }
  autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
public void testRandomUnicodeStrings() throws Throwable {
  char[] buffer = new char[20];
  char[] expected = new char[20];
  BytesRef utf8 = new BytesRef(20);
  CharsRef utf16 = new CharsRef(20);

  int num = atLeast(100000);
  for (int iter = 0; iter < num; iter++) {
    boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);

    UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
    if (!hasIllegal) {
      byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
      assertEquals(b.length, utf8.length);
      for (int i = 0; i < b.length; i++) {
        assertEquals(b[i], utf8.bytes[i]);
      }
    }

    UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);
    assertEquals(20, utf16.length);
    for (int i = 0; i < 20; i++) {
      assertEquals(expected[i], utf16.chars[i]);
    }
  }
}
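// Minimal round-trip sketch of the conversions this test exercises (using the
// same 4.x-style UnicodeUtil signatures as the test above):
char[] src = "h\u00e9llo".toCharArray();
BytesRef bytes = new BytesRef(src.length * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
UnicodeUtil.UTF16toUTF8(src, 0, src.length, bytes);
CharsRef roundTripped = new CharsRef(src.length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, roundTripped);
// roundTripped now holds "héllo" again.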
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {
  LookupResult result;
  if (hasPayloads) {
    int sepIndex = -1;
    for (int i = 0; i < output2.length; i++) {
      if (output2.bytes[output2.offset + i] == PAYLOAD_SEP) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    spare.grow(sepIndex);
    final int payloadLen = output2.length - sepIndex - 1;
    UnicodeUtil.UTF8toUTF16(output2.bytes, output2.offset, sepIndex, spare);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex + 1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.grow(output2.length);
    UnicodeUtil.UTF8toUTF16(output2, spare);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }
  return result;
}
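// Layout note, derived from the branch above: with payloads enabled the FST
// output bytes encode <surface form UTF-8> PAYLOAD_SEP <payload bytes>, e.g.
// for surface "foo" with payload {0x01, 0x02}:
//   [ 'f', 'o', 'o', PAYLOAD_SEP, 0x01, 0x02 ]
// The method splits on the first PAYLOAD_SEP to recover both parts.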