/** seeks to every term accepted by some automata */ public void testSeeking() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(), DEFAULT_MAX_DETERMINIZED_STATES); TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); ArrayList<BytesRef> unsortedTerms = new ArrayList<>(terms); Collections.shuffle(unsortedTerms, random()); for (BytesRef term : unsortedTerms) { if (Operations.run(automaton, term.utf8ToString())) { // term is accepted if (random().nextBoolean()) { // seek exact assertTrue(te.seekExact(term)); } else { // seek ceil assertEquals(SeekStatus.FOUND, te.seekCeil(term)); assertEquals(term, te.term()); } } } } }
/** mixes up seek and next for all terms */ public void testSeekingAndNexting() throws Exception { for (int i = 0; i < numIterations; i++) { TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); for (BytesRef term : terms) { int c = random().nextInt(3); if (c == 0) { assertEquals(term, te.next()); } else if (c == 1) { assertEquals(SeekStatus.FOUND, te.seekCeil(term)); assertEquals(term, te.term()); } else { assertTrue(te.seekExact(term)); } } } }
/** seeks to every term accepted by some automata */ public void testSeeking() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms); Collections.shuffle(unsortedTerms, random()); for (BytesRef term : unsortedTerms) { if (BasicOperations.run(automaton, term.utf8ToString())) { // term is accepted if (random().nextBoolean()) { // seek exact assertTrue(te.seekExact(term, random().nextBoolean())); } else { // seek ceil assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean())); assertEquals(term, te.term()); } } } } }
/** mixes up seek and next for all terms */ public void testSeekingAndNexting() throws Exception { for (int i = 0; i < numIterations; i++) { TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); for (BytesRef term : terms) { int c = random().nextInt(3); if (c == 0) { assertEquals(term, te.next()); } else if (c == 1) { assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean())); assertEquals(term, te.term()); } else { assertTrue(te.seekExact(term, random().nextBoolean())); } } } }
/** seeks to every term accepted by some automata */ public void testSeeking() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms); Collections.shuffle(unsortedTerms, random()); for (BytesRef term : unsortedTerms) { if (BasicOperations.run(automaton, term.utf8ToString())) { // term is accepted if (random().nextBoolean()) { // seek exact assertTrue(te.seekExact(term)); } else { // seek ceil assertEquals(SeekStatus.FOUND, te.seekCeil(term)); assertEquals(term, te.term()); } } } } }
long lookupTerm(BytesRef key) { try { SeekStatus status = termsEnum.seekCeil(key); if (status == SeekStatus.END) { return -numValues-1; } else if (status == SeekStatus.FOUND) { return termsEnum.ord(); } else { return -termsEnum.ord()-1; } } catch (IOException bogus) { throw new RuntimeException(bogus); } }
@Test public void testDocsEnum() throws Exception { Bits mappedLiveDocs = randomLiveDocs(reader.maxDoc()); TermsEnum termsEnum = reader.terms(DOCS_ENUM_FIELD).iterator(null); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOCS_ENUM_TERM))); DocsEnum docs = termsEnum.docs(mappedLiveDocs, null); int doc; int prev = -1; while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(doc)); assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD))); while (++prev < doc) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } } while (++prev < reader.maxDoc()) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } DocsEnum reuse = docs; docs = termsEnum.docs(mappedLiveDocs, reuse); if (docs instanceof SortingDocsEnum) { assertTrue(((SortingDocsEnum) docs).reused(reuse)); // make sure reuse worked } doc = -1; prev = -1; while ((doc = docs.advance(doc + 1)) != DocIdSetIterator.NO_MORE_DOCS) { assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(doc)); assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD))); while (++prev < doc) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } } while (++prev < reader.maxDoc()) { assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.get(prev)); } }
@Override public long lookupTerm(BytesRef key) { try { if (te.seekCeil(key) == SeekStatus.FOUND) { return te.ord(); } else { return -te.ord()-1; } } catch (IOException e) { throw new RuntimeException(e); } }
private int getTermWithSeekCount(Fields fields, String field) throws IOException { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(null); SeekStatus seekStatus = termsEnum.seekCeil(new BytesRef("")); if (seekStatus == SeekStatus.END) { return 0; } System.out.println(termsEnum.term().utf8ToString()); int count = 1; while (termsEnum.next() != null) { count++; } return count; }
public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException { return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly); }
@Override public SeekStatus seekCeil(BytesRef target) throws IOException { // already here if (term != null && term.equals(target)) { return SeekStatus.FOUND; } int startIdx = Arrays.binarySearch(indexedTermsArray, target); if (startIdx >= 0) { // we hit the term exactly... lucky us! TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); assert term != null; return SeekStatus.FOUND; } // we didn't hit the term exactly startIdx = -startIdx-1; if (startIdx == 0) { // our target occurs *before* the first term TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND; ord = 0; setTerm(); assert term != null; return SeekStatus.NOT_FOUND; } // back up to the start of the block startIdx--; if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) { // we are already in the right block and the current term is before the term we want, // so we don't need to seek. } else { // seek to the right block TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); assert term != null; // should be non-null since it's in the index } while (term != null && term.compareTo(target) < 0) { next(); } if (term == null) { return SeekStatus.END; } else if (term.compareTo(target) == 0) { return SeekStatus.FOUND; } else { return SeekStatus.NOT_FOUND; } }
public void testSortedTermsEnum() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); iwconfig.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); Document doc = new Document(); doc.add(new SortedDocValuesField("field", new BytesRef("hello"))); iwriter.addDocument(doc); doc = new Document(); doc.add(new SortedDocValuesField("field", new BytesRef("world"))); iwriter.addDocument(doc); doc = new Document(); doc.add(new SortedDocValuesField("field", new BytesRef("beer"))); iwriter.addDocument(doc); iwriter.forceMerge(1); DirectoryReader ireader = iwriter.getReader(); iwriter.close(); SortedDocValues dv = getOnlySegmentReader(ireader).getSortedDocValues("field"); assertEquals(3, dv.getValueCount()); TermsEnum termsEnum = dv.termsEnum(); // next() assertEquals("beer", termsEnum.next().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals("hello", termsEnum.next().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals("world", termsEnum.next().utf8ToString()); assertEquals(2, termsEnum.ord()); // seekCeil() assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz"))); assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("aba"))); assertEquals(0, termsEnum.ord()); // seekExact() assertTrue(termsEnum.seekExact(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("hello"))); assertEquals(Codec.getDefault().toString(), "hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("world"))); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); assertFalse(termsEnum.seekExact(new BytesRef("bogus"))); // seek(ord) termsEnum.seekExact(0); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); termsEnum.seekExact(1); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); ireader.close(); directory.close(); }
public void testSortedSetTermsEnum() throws IOException { assumeTrue("Codec does not support SORTED_SET", defaultCodecSupportsSortedSet()); Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); iwconfig.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); Document doc = new Document(); doc.add(new SortedSetDocValuesField("field", new BytesRef("hello"))); doc.add(new SortedSetDocValuesField("field", new BytesRef("world"))); doc.add(new SortedSetDocValuesField("field", new BytesRef("beer"))); iwriter.addDocument(doc); DirectoryReader ireader = iwriter.getReader(); iwriter.close(); SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field"); assertEquals(3, dv.getValueCount()); TermsEnum termsEnum = dv.termsEnum(); // next() assertEquals("beer", termsEnum.next().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals("hello", termsEnum.next().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals("world", termsEnum.next().utf8ToString()); assertEquals(2, termsEnum.ord()); // seekCeil() assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz"))); // seekExact() assertTrue(termsEnum.seekExact(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("hello"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("world"))); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); assertFalse(termsEnum.seekExact(new BytesRef("bogus"))); // seek(ord) termsEnum.seekExact(0); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); termsEnum.seekExact(1); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); ireader.close(); directory.close(); }
private void assertTermsSeekingEquals(String info, Terms leftTerms, Terms rightTerms) throws IOException { TermsEnum leftEnum = null; TermsEnum rightEnum = null; // just an upper bound int numTests = atLeast(20); Random random = random(); // collect this number of terms from the left side HashSet<BytesRef> tests = new HashSet<>(); int numPasses = 0; while (numPasses < 10 && tests.size() < numTests) { leftEnum = leftTerms.iterator(leftEnum); BytesRef term = null; while ((term = leftEnum.next()) != null) { int code = random.nextInt(10); if (code == 0) { // the term tests.add(BytesRef.deepCopyOf(term)); } else if (code == 1) { // truncated subsequence of term term = BytesRef.deepCopyOf(term); if (term.length > 0) { // truncate it term.length = random.nextInt(term.length); } } else if (code == 2) { // term, but ensure a non-zero offset byte newbytes[] = new byte[term.length+5]; System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length); tests.add(new BytesRef(newbytes, 5, term.length)); } else if (code == 3) { switch (random().nextInt(3)) { case 0: tests.add(new BytesRef()); // before the first term break; case 1: tests.add(new BytesRef(new byte[] {(byte) 0xFF, (byte) 0xFF})); // past the last term break; case 2: tests.add(new BytesRef(TestUtil.randomSimpleString(random()))); // random term break; default: throw new AssertionError(); } } } numPasses++; } rightEnum = rightTerms.iterator(rightEnum); ArrayList<BytesRef> shuffledTests = new ArrayList<>(tests); Collections.shuffle(shuffledTests, random); for (BytesRef b : shuffledTests) { if (rarely()) { // reuse the enums leftEnum = leftTerms.iterator(leftEnum); rightEnum = rightTerms.iterator(rightEnum); } final boolean seekExact = random().nextBoolean(); if (seekExact) { assertEquals(info, leftEnum.seekExact(b), rightEnum.seekExact(b)); } else { SeekStatus leftStatus = leftEnum.seekCeil(b); SeekStatus rightStatus = rightEnum.seekCeil(b); assertEquals(info, leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(info, leftEnum.term(), rightEnum.term()); assertTermStatsEquals(info, leftEnum, rightEnum); } } } }
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception { TermsEnum leftEnum = null; TermsEnum rightEnum = null; // just an upper bound int numTests = atLeast(20); Random random = random(); // collect this number of terms from the left side HashSet<BytesRef> tests = new HashSet<>(); int numPasses = 0; while (numPasses < 10 && tests.size() < numTests) { leftEnum = leftTerms.iterator(leftEnum); BytesRef term = null; while ((term = leftEnum.next()) != null) { int code = random.nextInt(10); if (code == 0) { // the term tests.add(BytesRef.deepCopyOf(term)); } else if (code == 1) { // truncated subsequence of term term = BytesRef.deepCopyOf(term); if (term.length > 0) { // truncate it term.length = random.nextInt(term.length); } } else if (code == 2) { // term, but ensure a non-zero offset byte newbytes[] = new byte[term.length+5]; System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length); tests.add(new BytesRef(newbytes, 5, term.length)); } } numPasses++; } ArrayList<BytesRef> shuffledTests = new ArrayList<>(tests); Collections.shuffle(shuffledTests, random); for (BytesRef b : shuffledTests) { leftEnum = leftTerms.iterator(leftEnum); rightEnum = rightTerms.iterator(rightEnum); assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); SeekStatus leftStatus; SeekStatus rightStatus; leftStatus = leftEnum.seekCeil(b); rightStatus = rightEnum.seekCeil(b); assertEquals(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(leftEnum.term(), rightEnum.term()); } leftStatus = leftEnum.seekCeil(b); rightStatus = rightEnum.seekCeil(b); assertEquals(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(leftEnum.term(), rightEnum.term()); } } }
public void testSortedTermsEnum() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); iwconfig.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); Document doc = new Document(); doc.add(new StringField("field", "hello", Field.Store.NO)); iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("field", "world", Field.Store.NO)); iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("field", "beer", Field.Store.NO)); iwriter.addDocument(doc); iwriter.forceMerge(1); DirectoryReader ireader = iwriter.getReader(); iwriter.close(); AtomicReader ar = getOnlySegmentReader(ireader); SortedSetDocValues dv = FieldCache.DEFAULT.getDocTermOrds(ar, "field"); assertEquals(3, dv.getValueCount()); TermsEnum termsEnum = dv.termsEnum(); // next() assertEquals("beer", termsEnum.next().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals("hello", termsEnum.next().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals("world", termsEnum.next().utf8ToString()); assertEquals(2, termsEnum.ord()); // seekCeil() assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz"))); // seekExact() assertTrue(termsEnum.seekExact(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("hello"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("world"))); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); assertFalse(termsEnum.seekExact(new BytesRef("bogus"))); // seek(ord) termsEnum.seekExact(0); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); termsEnum.seekExact(1); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); // lookupTerm(BytesRef) assertEquals(-1, dv.lookupTerm(new BytesRef("apple"))); assertEquals(0, dv.lookupTerm(new BytesRef("beer"))); assertEquals(-2, dv.lookupTerm(new BytesRef("car"))); assertEquals(1, dv.lookupTerm(new BytesRef("hello"))); assertEquals(-3, dv.lookupTerm(new BytesRef("matter"))); assertEquals(2, dv.lookupTerm(new BytesRef("world"))); assertEquals(-4, dv.lookupTerm(new BytesRef("zany"))); ireader.close(); directory.close(); }
public void testSortedTermsEnum() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); iwconfig.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); Document doc = new Document(); doc.add(new SortedDocValuesField("field", new BytesRef("hello"))); iwriter.addDocument(doc); doc = new Document(); doc.add(new SortedDocValuesField("field", new BytesRef("world"))); iwriter.addDocument(doc); doc = new Document(); doc.add(new SortedDocValuesField("field", new BytesRef("beer"))); iwriter.addDocument(doc); iwriter.forceMerge(1); DirectoryReader ireader = iwriter.getReader(); iwriter.close(); SortedDocValues dv = getOnlySegmentReader(ireader).getSortedDocValues("field"); assertEquals(3, dv.getValueCount()); TermsEnum termsEnum = dv.termsEnum(); // next() assertEquals("beer", termsEnum.next().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals("hello", termsEnum.next().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals("world", termsEnum.next().utf8ToString()); assertEquals(2, termsEnum.ord()); // seekCeil() assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz"))); // seekExact() assertTrue(termsEnum.seekExact(new BytesRef("beer"), true)); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("hello"), true)); assertEquals(Codec.getDefault().toString(), "hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("world"), true)); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); assertFalse(termsEnum.seekExact(new BytesRef("bogus"), true)); // seek(ord) termsEnum.seekExact(0); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); termsEnum.seekExact(1); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); ireader.close(); directory.close(); }
public void testSortedSetTermsEnum() throws IOException { assumeTrue("Codec does not support SORTED_SET", defaultCodecSupportsSortedSet()); Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); iwconfig.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); Document doc = new Document(); doc.add(new SortedSetDocValuesField("field", new BytesRef("hello"))); doc.add(new SortedSetDocValuesField("field", new BytesRef("world"))); doc.add(new SortedSetDocValuesField("field", new BytesRef("beer"))); iwriter.addDocument(doc); DirectoryReader ireader = iwriter.getReader(); iwriter.close(); SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field"); assertEquals(3, dv.getValueCount()); TermsEnum termsEnum = dv.termsEnum(); // next() assertEquals("beer", termsEnum.next().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals("hello", termsEnum.next().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals("world", termsEnum.next().utf8ToString()); assertEquals(2, termsEnum.ord()); // seekCeil() assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz"))); // seekExact() assertTrue(termsEnum.seekExact(new BytesRef("beer"), true)); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("hello"), true)); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("world"), true)); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); assertFalse(termsEnum.seekExact(new BytesRef("bogus"), true)); // seek(ord) termsEnum.seekExact(0); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); termsEnum.seekExact(1); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); ireader.close(); directory.close(); }
@Override public SeekStatus seekCeil(BytesRef target, boolean useCache) throws IOException { // already here if (term != null && term.equals(target)) { return SeekStatus.FOUND; } int startIdx = Arrays.binarySearch(indexedTermsArray, target); if (startIdx >= 0) { // we hit the term exactly... lucky us! TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); assert term != null; return SeekStatus.FOUND; } // we didn't hit the term exactly startIdx = -startIdx-1; if (startIdx == 0) { // our target occurs *before* the first term TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target); assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND; ord = 0; setTerm(); assert term != null; return SeekStatus.NOT_FOUND; } // back up to the start of the block startIdx--; if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) { // we are already in the right block and the current term is before the term we want, // so we don't need to seek. } else { // seek to the right block TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]); assert seekStatus == TermsEnum.SeekStatus.FOUND; ord = startIdx << indexIntervalBits; setTerm(); assert term != null; // should be non-null since it's in the index } while (term != null && term.compareTo(target) < 0) { next(); } if (term == null) { return SeekStatus.END; } else if (term.compareTo(target) == 0) { return SeekStatus.FOUND; } else { return SeekStatus.NOT_FOUND; } }
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception { TermsEnum leftEnum = null; TermsEnum rightEnum = null; // just an upper bound int numTests = atLeast(20); Random random = random(); // collect this number of terms from the left side HashSet<BytesRef> tests = new HashSet<BytesRef>(); int numPasses = 0; while (numPasses < 10 && tests.size() < numTests) { leftEnum = leftTerms.iterator(leftEnum); BytesRef term = null; while ((term = leftEnum.next()) != null) { int code = random.nextInt(10); if (code == 0) { // the term tests.add(BytesRef.deepCopyOf(term)); } else if (code == 1) { // truncated subsequence of term term = BytesRef.deepCopyOf(term); if (term.length > 0) { // truncate it term.length = random.nextInt(term.length); } } else if (code == 2) { // term, but ensure a non-zero offset byte newbytes[] = new byte[term.length+5]; System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length); tests.add(new BytesRef(newbytes, 5, term.length)); } } numPasses++; } ArrayList<BytesRef> shuffledTests = new ArrayList<BytesRef>(tests); Collections.shuffle(shuffledTests, random); for (BytesRef b : shuffledTests) { leftEnum = leftTerms.iterator(leftEnum); rightEnum = rightTerms.iterator(rightEnum); assertEquals(leftEnum.seekExact(b, false), rightEnum.seekExact(b, false)); assertEquals(leftEnum.seekExact(b, true), rightEnum.seekExact(b, true)); SeekStatus leftStatus; SeekStatus rightStatus; leftStatus = leftEnum.seekCeil(b, false); rightStatus = rightEnum.seekCeil(b, false); assertEquals(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(leftEnum.term(), rightEnum.term()); } leftStatus = leftEnum.seekCeil(b, true); rightStatus = rightEnum.seekCeil(b, true); assertEquals(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(leftEnum.term(), rightEnum.term()); } } }
public void testSortedTermsEnum() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); iwconfig.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); Document doc = new Document(); doc.add(new StringField("field", "hello", Field.Store.NO)); iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("field", "world", Field.Store.NO)); iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("field", "beer", Field.Store.NO)); iwriter.addDocument(doc); iwriter.forceMerge(1); DirectoryReader ireader = iwriter.getReader(); iwriter.close(); AtomicReader ar = getOnlySegmentReader(ireader); SortedSetDocValues dv = FieldCache.DEFAULT.getDocTermOrds(ar, "field"); assertEquals(3, dv.getValueCount()); TermsEnum termsEnum = dv.termsEnum(); // next() assertEquals("beer", termsEnum.next().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals("hello", termsEnum.next().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals("world", termsEnum.next().utf8ToString()); assertEquals(2, termsEnum.ord()); // seekCeil() assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz"))); // seekExact() assertTrue(termsEnum.seekExact(new BytesRef("beer"), true)); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("hello"), true)); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("world"), true)); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); assertFalse(termsEnum.seekExact(new BytesRef("bogus"), true)); // seek(ord) termsEnum.seekExact(0); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); termsEnum.seekExact(1); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); ireader.close(); directory.close(); }
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception { TermsEnum leftEnum = null; TermsEnum rightEnum = null; // just an upper bound int numTests = atLeast(20); Random random = random(); // collect this number of terms from the left side HashSet<BytesRef> tests = new HashSet<BytesRef>(); int numPasses = 0; while (numPasses < 10 && tests.size() < numTests) { leftEnum = leftTerms.iterator(leftEnum); BytesRef term = null; while ((term = leftEnum.next()) != null) { int code = random.nextInt(10); if (code == 0) { // the term tests.add(BytesRef.deepCopyOf(term)); } else if (code == 1) { // truncated subsequence of term term = BytesRef.deepCopyOf(term); if (term.length > 0) { // truncate it term.length = random.nextInt(term.length); } } else if (code == 2) { // term, but ensure a non-zero offset byte newbytes[] = new byte[term.length+5]; System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length); tests.add(new BytesRef(newbytes, 5, term.length)); } else if (code == 3) { switch (random().nextInt(3)) { case 0: tests.add(new BytesRef()); // before the first term break; case 1: tests.add(new BytesRef(new byte[] {(byte) 0xFF, (byte) 0xFF})); // past the last term break; case 2: tests.add(new BytesRef(_TestUtil.randomSimpleString(random()))); // random term break; default: throw new AssertionError(); } } } numPasses++; } rightEnum = rightTerms.iterator(rightEnum); ArrayList<BytesRef> shuffledTests = new ArrayList<BytesRef>(tests); Collections.shuffle(shuffledTests, random); for (BytesRef b : shuffledTests) { if (rarely()) { // reuse the enums leftEnum = leftTerms.iterator(leftEnum); rightEnum = rightTerms.iterator(rightEnum); } final boolean useCache = random().nextBoolean(); final boolean seekExact = random().nextBoolean(); if (seekExact) { assertEquals(info, leftEnum.seekExact(b, useCache), rightEnum.seekExact(b, useCache)); } else { SeekStatus leftStatus = leftEnum.seekCeil(b, useCache); SeekStatus rightStatus = rightEnum.seekCeil(b, useCache); assertEquals(info, leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(info, leftEnum.term(), rightEnum.term()); assertTermStats(leftEnum, rightEnum); } } } }