public UnionDocsAndPositionsEnum(Bits liveDocs, AtomicReaderContext context, Term[] terms,
    Map<Term,TermContext> termContexts, TermsEnum termsEnum) throws IOException {
  List<DocsAndPositionsEnum> docsEnums = new LinkedList<>();
  for (int i = 0; i < terms.length; i++) {
    final Term term = terms[i];
    TermState termState = termContexts.get(term).get(context.ord);
    if (termState == null) {
      // Term doesn't exist in reader
      continue;
    }
    termsEnum.seekExact(term.bytes(), termState);
    DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, DocsEnum.FLAG_NONE);
    if (postings == null) {
      // term does exist, but has no positions
      throw new IllegalStateException("field \"" + term.field()
          + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")");
    }
    cost += postings.cost();
    docsEnums.add(postings);
  }
  _queue = new DocsQueue(docsEnums);
  _posList = new IntQueue();
}
protected DocsAndPositionsEnum getPosEnum(IndexReader r, int docid, Term t) throws IOException {
  List<AtomicReaderContext> leaves = r.getContext().leaves();
  for (AtomicReaderContext context : leaves) {
    AtomicReader reader = context.reader();
    DocsAndPositionsEnum termPositions = reader.termPositionsEnum(t);
    if (termPositions == null) {
      // term is absent from this segment
      continue;
    }
    int doc;
    // advance to the requested document, or exhaust the enum
    while ((doc = termPositions.nextDoc()) != DocsEnum.NO_MORE_DOCS && doc != docid) {
    }
    if (doc != DocsEnum.NO_MORE_DOCS) {
      return termPositions;
    }
  }
  fail("Expected positions enum for doc " + docid);
  return null; // unreachable
}
public void testDocsEnumStart() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
  memory.addField("foo", "bar", analyzer);
  AtomicReader reader = (AtomicReader) memory.createSearcher().getIndexReader();
  DocsEnum disi = TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, null, DocsEnum.FLAG_NONE);
  int docid = disi.docID();
  assertEquals(-1, docid);
  assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

  // now reuse and check again
  TermsEnum te = reader.terms("foo").iterator(null);
  assertTrue(te.seekExact(new BytesRef("bar")));
  disi = te.docs(null, disi, DocsEnum.FLAG_NONE);
  docid = disi.docID();
  assertEquals(-1, docid);
  assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  reader.close();
}
@Override
public DocsEnum docs(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
  final SepTermState termState = (SepTermState) _termState;
  SepDocsEnum docsEnum;
  if (reuse == null || !(reuse instanceof SepDocsEnum)) {
    docsEnum = new SepDocsEnum();
  } else {
    docsEnum = (SepDocsEnum) reuse;
    if (docsEnum.startDocIn != docIn) {
      // If you are using ParallelReader, and pass in a
      // reused DocsAndPositionsEnum, it could have come
      // from another reader also using sep codec
      docsEnum = new SepDocsEnum();
    }
  }

  return docsEnum.init(fieldInfo, termState, liveDocs);
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, final Bits acceptDocs) throws IOException {
  Terms terms = context.reader().terms(term.field());
  if (terms == null) {
    return null;
  }

  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(term.bytes())) {
    return null;
  }
  return new DocIdSet() {
    @Override
    public DocIdSetIterator iterator() throws IOException {
      return termsEnum.docs(acceptDocs, null, DocsEnum.FLAG_NONE);
    }
  };
}
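// A hedged usage sketch for the getDocIdSet implementation above, assuming
// it belongs to a Filter subclass constructed from a Term (as in Lucene's
// TermFilter). The field/value and the searcher setup are illustrative
// assumptions, not taken from the snippet itself.
IndexSearcher searcher = new IndexSearcher(reader);
Filter byCategory = new TermFilter(new Term("category", "books"));
TopDocs hits = searcher.search(new MatchAllDocsQuery(), byCategory, 10);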
private SmallDocSet collectDocs(Bits acceptContains) throws IOException {
  SmallDocSet set = null;

  docsEnum = termsEnum.docs(acceptContains, docsEnum, DocsEnum.FLAG_NONE);
  int docid;
  while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    if (set == null) {
      int size = termsEnum.docFreq();
      if (size <= 0) {
        size = 16;
      }
      set = new SmallDocSet(size);
    }
    set.set(docid);
  }
  return set;
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, final int flags) throws IOException {
  final DocsEnum inReuse;
  final SortingDocsEnum wrapReuse;
  if (reuse != null && reuse instanceof SortingDocsEnum) {
    // if we're asked to reuse the given DocsEnum and it is Sorting, return
    // the wrapped one, since some Codecs expect it.
    wrapReuse = (SortingDocsEnum) reuse;
    inReuse = wrapReuse.getWrapped();
  } else {
    wrapReuse = null;
    inReuse = reuse;
  }

  final DocsEnum inDocs = in.docs(newToOld(liveDocs), inReuse, flags);
  final boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0
      && (flags & DocsEnum.FLAG_FREQS) != 0;
  return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, withFreqs, docMap);
}
@Override
public boolean score(Collector collector, int max) throws IOException {
  FakeScorer fakeScorer = new FakeScorer();
  collector.setScorer(fakeScorer);
  if (doc == -1) {
    doc = nextDocOutOfOrder();
  }
  while (doc < max) {
    fakeScorer.doc = doc;
    fakeScorer.score = scores[ords[scoreUpto]];
    collector.collect(doc);
    doc = nextDocOutOfOrder();
  }
  return doc != DocsEnum.NO_MORE_DOCS;
}
int nextDocOutOfOrder() throws IOException {
  while (true) {
    if (docsEnum != null) {
      int docId = docsEnumNextDoc();
      if (docId == DocIdSetIterator.NO_MORE_DOCS) {
        docsEnum = null;
      } else {
        return doc = docId;
      }
    }

    if (upto == terms.size()) {
      return doc = DocIdSetIterator.NO_MORE_DOCS;
    }

    scoreUpto = upto;
    if (termsEnum.seekExact(terms.get(ords[upto++], spare))) {
      docsEnum = reuse = termsEnum.docs(acceptDocs, reuse, DocsEnum.FLAG_NONE);
    }
  }
}
protected void fillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  DocsEnum docsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      docsEnum = termsEnum.docs(acceptDocs, docsEnum, DocsEnum.FLAG_NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
        matchingDocs.set(doc);
        // In the case the same doc is also related to another doc, a score might be overwritten.
        // I think this can only happen in a many-to-many relation.
        scores[doc] = score;
      }
    }
  }
}
@Override
protected void fillDocsAndScores(FixedBitSet matchingDocs, Bits acceptDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  DocsEnum docsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      docsEnum = termsEnum.docs(acceptDocs, docsEnum, DocsEnum.FLAG_NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = docsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
        // I prefer this:
        /*if (scores[doc] < score) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }*/
        // But this matches the behavior of MVInnerScorer, and only then do the tests pass:
        if (!matchingDocs.get(doc)) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }
      }
    }
  }
}
/** Returns docID if found, else -1. */
public int lookup(BytesRef id, long version) throws IOException {
  for (int seg = 0; seg < numSegs; seg++) {
    if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
      if (VERBOSE) {
        System.out.println("  found in seg=" + termsEnums[seg]);
      }
      docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
      int docID = docsEnums[seg].nextDoc();
      if (docID != DocsEnum.NO_MORE_DOCS) {
        lastVersion = ((IDVersionSegmentTermsEnum) termsEnums[seg]).getVersion();
        return docBases[seg] + docID;
      }
      assert hasDeletions;
    }
  }

  return -1;
}
public static DocsEnum docs(Random random, TermsEnum termsEnum, Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
  if (random.nextBoolean()) {
    if (random.nextBoolean()) {
      final int posFlags;
      switch (random.nextInt(4)) {
        case 0: posFlags = 0; break;
        case 1: posFlags = DocsAndPositionsEnum.FLAG_OFFSETS; break;
        case 2: posFlags = DocsAndPositionsEnum.FLAG_PAYLOADS; break;
        default: posFlags = DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS; break;
      }
      // TODO: cast to DocsAndPositionsEnum?
      DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, posFlags);
      if (docsAndPositions != null) {
        return docsAndPositions;
      }
    }
    flags |= DocsEnum.FLAG_FREQS;
  }
  return termsEnum.docs(liveDocs, reuse, flags);
}
/**
 * checks docs + freqs, sequentially
 */
public void assertDocsEnumEquals(String info, DocsEnum leftDocs, DocsEnum rightDocs, boolean hasFreqs) throws IOException {
  if (leftDocs == null) {
    assertNull(rightDocs);
    return;
  }
  assertEquals(info, -1, leftDocs.docID());
  assertEquals(info, -1, rightDocs.docID());
  int docid;
  while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(info, docid, rightDocs.nextDoc());
    if (hasFreqs) {
      assertEquals(info, leftDocs.freq(), rightDocs.freq());
    }
  }
  assertEquals(info, DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
public void testReuseDocsEnumNoReuse() throws IOException {
  Directory dir = newDirectory();
  Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
      newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));
  int numdocs = atLeast(20);
  createRandomIndex(numdocs, writer, random());
  writer.commit();

  DirectoryReader open = DirectoryReader.open(dir);
  for (AtomicReaderContext ctx : open.leaves()) {
    AtomicReader indexReader = ctx.reader();
    Terms terms = indexReader.terms("body");
    TermsEnum iterator = terms.iterator(null);
    IdentityHashMap<DocsEnum, Boolean> enums = new IdentityHashMap<>();
    MatchNoBits bits = new Bits.MatchNoBits(indexReader.maxDoc());
    while ((iterator.next()) != null) {
      DocsEnum docs = iterator.docs(random().nextBoolean() ? bits : new Bits.MatchNoBits(indexReader.maxDoc()), null,
          random().nextBoolean() ? DocsEnum.FLAG_FREQS : DocsEnum.FLAG_NONE);
      enums.put(docs, true);
    }

    assertEquals(terms.size(), enums.size());
  }
  IOUtils.close(writer, open, dir);
}
/**
 * Returns the first document number containing the term <code>t</code>.
 * Returns -1 if no document was found.
 * This method is primarily intended for clients that want to fetch
 * documents using a unique identifier.
 * @return the first document number containing the term
 */
public int getFirstMatch(Term t) throws IOException {
  Fields fields = atomicReader.fields();
  if (fields == null) return -1;
  Terms terms = fields.terms(t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  DocsEnum docs = termsEnum.docs(atomicReader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
  if (docs == null) return -1;
  int id = docs.nextDoc();
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
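// A hedged calling sketch for getFirstMatch above, assuming it lives on a
// searcher-style wrapper that also exposes the same atomicReader field.
// The "id" field name and value are illustrative assumptions.
int docid = getFirstMatch(new Term("id", "42"));
if (docid != -1) {
  Document doc = atomicReader.document(docid);
  // ... use the stored fields of the matched document
}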
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Fields fields = MultiFields.getFields(r);
  if (fields == null) return -1;
  Terms terms = fields.terms(t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  DocsEnum docs = termsEnum.docs(MultiFields.getLiveDocs(r), null, DocsEnum.FLAG_NONE);
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    // verify the term is unique: there must be no second matching document
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
@Override
public void getDocumentsWithWordAsSet(String word, IntOpenHashSet documents) {
  DocsEnum docs = null;
  Term term = new Term(fieldName, word);
  try {
    int baseDocId;
    for (int i = 0; i < reader.length; i++) {
      docs = reader[i].termDocsEnum(term);
      baseDocId = contexts[i].docBase;
      if (docs != null) {
        while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
          documents.add(baseDocId + docs.docID());
        }
      }
    }
  } catch (IOException e) {
    LOGGER.error("Error while requesting documents for word \"" + word + "\".", e);
  }
}
public void testDocsEnumStart() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
  memory.addField("foo", "bar", analyzer);
  AtomicReader reader = (AtomicReader) memory.createSearcher().getIndexReader();
  DocsEnum disi = _TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, null, DocsEnum.FLAG_NONE);
  int docid = disi.docID();
  assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

  // now reuse and check again
  TermsEnum te = reader.terms("foo").iterator(null);
  assertTrue(te.seekExact(new BytesRef("bar"), true));
  disi = te.docs(null, disi, DocsEnum.FLAG_NONE);
  docid = disi.docID();
  assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  reader.close();
}
public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
  long totalTF = 0L;
  for (final AtomicReaderContext ctx : reader.leaves()) {
    AtomicReader r = ctx.reader();
    if (!r.hasDeletions()) {
      // TODO: we could do this up front, during the scan
      // (next()), instead of after-the-fact here w/ seek,
      // if the codec supports it and there are no del
      // docs...
      final long totTF = r.totalTermFreq(term);
      if (totTF != -1) {
        totalTF += totTF;
        continue;
      } // otherwise we fall-through
    }
    // note: what should we do if field omits freqs? currently it counts as 1...
    DocsEnum de = r.termDocsEnum(term);
    if (de != null) {
      while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTF += de.freq();
      }
    }
  }

  return totalTF;
}
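// A minimal, hedged usage sketch for getTotalTermFreq above (Lucene 4.x
// API). The index path and the field/term values are illustrative
// assumptions, not taken from the snippet itself.
try (Directory dir = FSDirectory.open(new File("/path/to/index"));
     IndexReader reader = DirectoryReader.open(dir)) {
  long tf = getTotalTermFreq(reader, new Term("body", "lucene"));
  System.out.println("total occurrences of 'lucene' in 'body': " + tf);
}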
public void testReuseDocsEnumNoReuse() throws IOException {
  Directory dir = newDirectory();
  Codec cp = _TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
      newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(cp));
  int numdocs = atLeast(20);
  createRandomIndex(numdocs, writer, random());
  writer.commit();

  DirectoryReader open = DirectoryReader.open(dir);
  for (AtomicReaderContext ctx : open.leaves()) {
    AtomicReader indexReader = ctx.reader();
    Terms terms = indexReader.terms("body");
    TermsEnum iterator = terms.iterator(null);
    IdentityHashMap<DocsEnum, Boolean> enums = new IdentityHashMap<DocsEnum, Boolean>();
    MatchNoBits bits = new Bits.MatchNoBits(indexReader.maxDoc());
    while ((iterator.next()) != null) {
      DocsEnum docs = iterator.docs(random().nextBoolean() ? bits : new Bits.MatchNoBits(indexReader.maxDoc()), null,
          random().nextBoolean() ? DocsEnum.FLAG_FREQS : DocsEnum.FLAG_NONE);
      enums.put(docs, true);
    }

    assertEquals(terms.size(), enums.size());
  }
  IOUtils.close(writer, open, dir);
}
/**
 * Returns the first document number containing the term <code>t</code>.
 * Returns -1 if no document was found.
 * This method is primarily intended for clients that want to fetch
 * documents using a unique identifier.
 * @return the first document number containing the term
 */
public int getFirstMatch(Term t) throws IOException {
  Fields fields = atomicReader.fields();
  if (fields == null) return -1;
  Terms terms = fields.terms(t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(termBytes, false)) {
    return -1;
  }
  DocsEnum docs = termsEnum.docs(atomicReader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
  if (docs == null) return -1;
  int id = docs.nextDoc();
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Fields fields = MultiFields.getFields(r);
  if (fields == null) return -1;
  Terms terms = fields.terms(t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(termBytes, false)) {
    return -1;
  }
  DocsEnum docs = termsEnum.docs(MultiFields.getLiveDocs(r), null, DocsEnum.FLAG_NONE);
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
@Test
public void testTermDocIterable() throws IOException {
  for (int pass = 0; pass < 1; pass++) {
    for (int id = 0; id < BLOCKS; id++) {
      DocsEnum termDocs = reader.termDocsEnum(new Term("id", Integer.toString(id)));
      TermDocIterable iterable = new TermDocIterable(termDocs, reader);
      int count = 0;
      int i = 0;
      long s = System.nanoTime();
      for (Document document : iterable) {
        count++;
        assertEquals(i, Integer.parseInt(document.get("field")));
        i++;
      }
      long time = System.nanoTime() - s;
      System.out.println(time / 1000000.0 + " " + id + " " + pass);
      assertEquals(COUNT_PER_BLOCK, count);
    }
  }
}
/**
 * Returns the first document number containing the term <code>t</code>. Returns -1 if no
 * document was found. This method is primarily intended for clients that want to fetch
 * documents using a unique identifier.
 *
 * @return the first document number containing the term
 */
public int getFirstMatch(Term t) throws IOException {
  Fields fields = atomicReader.fields();
  if (fields == null) return -1;
  Terms terms = fields.terms(t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(termBytes, false)) {
    return -1;
  }
  DocsEnum docs = termsEnum.docs(atomicReader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
  if (docs == null) return -1;
  int id = docs.nextDoc();
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
@Override
public DocsEnum docs(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
  if (canReuse(reuse, liveDocs)) {
    // if (DEBUG) System.out.println("SPR.docs ts=" + termState);
    return ((SegmentDocsEnumBase) reuse).reset(fieldInfo, (StandardTermState) termState);
  }
  return newDocsEnum(liveDocs, fieldInfo, (StandardTermState) termState);
}
private boolean canReuse(DocsEnum reuse, Bits liveDocs) {
  if (reuse != null && (reuse instanceof SegmentDocsEnumBase)) {
    SegmentDocsEnumBase docsEnum = (SegmentDocsEnumBase) reuse;
    // If you are using ParallelReader, and pass in a
    // reused DocsEnum, it could have come from another
    // reader also using standard codec
    if (docsEnum.startFreqIn == freqIn) {
      // we only reuse if the incoming enum has the same liveDocs as the given liveDocs
      return liveDocs == docsEnum.liveDocs;
    }
  }
  return false;
}
private DocsEnum newDocsEnum(Bits liveDocs, FieldInfo fieldInfo, StandardTermState termState) throws IOException {
  if (liveDocs == null) {
    return new AllDocsSegmentDocsEnum(freqIn).reset(fieldInfo, termState);
  } else {
    return new LiveDocsSegmentDocsEnum(freqIn, liveDocs).reset(fieldInfo, termState);
  }
}
DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
  indexOmitsTF = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY;
  storePayloads = fieldInfo.hasPayloads();
  storeOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  freqOffset = termState.freqOffset;
  skipOffset = termState.skipOffset;

  // TODO: for full enum case (eg segment merging) this
  // seek is unnecessary; maybe we can avoid in such
  // cases
  freqIn.seek(termState.freqOffset);
  limit = termState.docFreq;
  assert limit > 0;
  ord = 0;
  doc = -1;
  accum = 0;
  // if (DEBUG) System.out.println("  sde limit=" + limit + " freqFP=" + freqOffset);
  skipped = false;

  start = -1;
  count = 0;
  freq = 1;
  if (indexOmitsTF) {
    Arrays.fill(freqs, 1);
  }
  maxBufferedDocId = -1;
  return this;
}