void seek(TermInfo ti, Term term) throws IOException {
  count = 0;
  FieldInfo fi = fieldInfos.fieldInfo(term.field());
  this.indexOptions = (fi != null) ? fi.getIndexOptions() : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
  currentFieldStoresPayloads = (fi != null) ? fi.hasPayloads() : false;
  if (ti == null) {
    df = 0;
  } else {
    df = ti.docFreq;
    doc = 0;
    freqBasePointer = ti.freqPointer;
    proxBasePointer = ti.proxPointer;
    skipPointer = freqBasePointer + ti.skipOffset;
    freqStream.seek(freqBasePointer);
    haveSkipped = false;
  }
}
@Override
public int setField(FieldInfo fieldInfo) {
  IndexOptions indexOptions = fieldInfo.getIndexOptions();
  fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
  fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  fieldHasPayloads = fieldInfo.hasPayloads();
  skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
  lastState = emptyState;
  if (fieldHasPositions) {
    if (fieldHasPayloads || fieldHasOffsets) {
      return 3;  // doc + pos + pay FP
    } else {
      return 2;  // doc + pos FP
    }
  } else {
    return 1;  // doc FP
  }
}
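// Aside: the chain of compareTo() checks above (used throughout these snippets)
// works because IndexOptions is an enum declared in increasing order of indexed
// detail: DOCS_ONLY < DOCS_AND_FREQS < DOCS_AND_FREQS_AND_POSITIONS <
// DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS. So "compareTo(X) >= 0" reads as
// "indexes at least as much as X". A minimal sketch of the idiom (hypothetical
// helper, not part of the code above):
static boolean indexesAtLeast(IndexOptions actual, IndexOptions required) {
  return actual.compareTo(required) >= 0;
}
// e.g. indexesAtLeast(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, IndexOptions.DOCS_AND_FREQS) returns true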
/** make sure all sims work if TF is omitted */
public void testOmitTF() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_ONLY);
  ft.freeze();
  Field f = newField("foo", "bar", ft);
  doc.add(f);
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  IndexSearcher is = newSearcher(ir);

  for (Similarity sim : sims) {
    is.setSimilarity(sim);
    BooleanQuery query = new BooleanQuery(true);
    query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
    assertEquals(1, is.search(query, 10).totalHits);
  }
  ir.close();
  dir.close();
}
public void testChangeIndexOptions() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir,
      new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));

  FieldType docsAndFreqs = new FieldType(TextField.TYPE_NOT_STORED);
  docsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  FieldType docsOnly = new FieldType(TextField.TYPE_NOT_STORED);
  docsOnly.setIndexOptions(IndexOptions.DOCS_ONLY);

  Document doc = new Document();
  doc.add(new Field("field", "a b c", docsAndFreqs));
  w.addDocument(doc);
  w.addDocument(doc);

  doc = new Document();
  doc.add(new Field("field", "a b c", docsOnly));
  w.addDocument(doc);
  w.close();
  dir.close();
}
public SeedPostings(long seed, int minDocFreq, int maxDocFreq, Bits liveDocs, IndexOptions options) {
  random = new Random(seed);
  docRandom = new Random(random.nextLong());
  docFreq = TestUtil.nextInt(random, minDocFreq, maxDocFreq);
  this.liveDocs = liveDocs;

  // TODO: more realistic to inversely tie this to numDocs:
  maxDocSpacing = TestUtil.nextInt(random, 1, 100);

  if (random.nextInt(10) == 7) {
    // 10% of the time create big payloads:
    payloadSize = 1 + random.nextInt(3);
  } else {
    payloadSize = 1 + random.nextInt(1);
  }

  fixedPayloads = random.nextBoolean();
  byte[] payloadBytes = new byte[payloadSize];
  payload = new BytesRef(payloadBytes);
  this.options = options;
  doPositions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS.compareTo(options) <= 0;
}
private static SeedPostings getSeedPostings(String term, long seed, boolean withLiveDocs, IndexOptions options) {
  int minDocFreq, maxDocFreq;
  if (term.startsWith("big_")) {
    minDocFreq = RANDOM_MULTIPLIER * 50000;
    maxDocFreq = RANDOM_MULTIPLIER * 70000;
  } else if (term.startsWith("medium_")) {
    minDocFreq = RANDOM_MULTIPLIER * 3000;
    maxDocFreq = RANDOM_MULTIPLIER * 6000;
  } else if (term.startsWith("low_")) {
    minDocFreq = RANDOM_MULTIPLIER;
    maxDocFreq = RANDOM_MULTIPLIER * 40;
  } else {
    minDocFreq = 1;
    maxDocFreq = 3;
  }
  return new SeedPostings(seed, minDocFreq, maxDocFreq, withLiveDocs ? globalLiveDocs : null, options);
}
/**
 * Utility method to create a {@link org.apache.lucene.document.FieldType}
 * based on the {@link SchemaField}
 */
public static org.apache.lucene.document.FieldType createFieldType(SchemaField field) {
  if (!field.indexed() && !field.stored()) {
    if (log.isTraceEnabled()) log.trace("Ignoring unindexed/unstored field: " + field);
    return null;
  }

  org.apache.lucene.document.FieldType newType = new org.apache.lucene.document.FieldType();
  newType.setIndexed(field.indexed());
  newType.setTokenized(field.isTokenized());
  newType.setStored(field.stored());
  newType.setOmitNorms(field.omitNorms());
  IndexOptions options = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
  if (field.omitTermFreqAndPositions()) {
    options = IndexOptions.DOCS_ONLY;
  } else if (field.omitPositions()) {
    options = IndexOptions.DOCS_AND_FREQS;
  } else if (field.storeOffsetsWithPositions()) {
    options = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
  }
  newType.setIndexOptions(options);
  newType.setStoreTermVectors(field.storeTermVector());
  newType.setStoreTermVectorOffsets(field.storeTermOffsets());
  newType.setStoreTermVectorPositions(field.storeTermPositions());
  return newType;
}
private void testTerms(final Fields fieldsSource, final EnumSet<Option> options,
                       final IndexOptions maxTestOptions,
                       final IndexOptions maxIndexOptions,
                       final boolean alwaysTestMax) throws Exception {
  if (options.contains(Option.THREADS)) {
    int numThreads = TestUtil.nextInt(random(), 2, 5);
    Thread[] threads = new Thread[numThreads];
    for (int threadUpto = 0; threadUpto < numThreads; threadUpto++) {
      threads[threadUpto] = new TestThread(this, fieldsSource, options, maxTestOptions, maxIndexOptions, alwaysTestMax);
      threads[threadUpto].start();
    }
    for (int threadUpto = 0; threadUpto < numThreads; threadUpto++) {
      threads[threadUpto].join();
    }
  } else {
    testTermsOneThread(fieldsSource, options, maxTestOptions, maxIndexOptions, alwaysTestMax);
  }
}
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  Field f = newField("foo", "this is a test test", ft);
  doc.add(f);
  for (int i = 0; i < 100; i++) {
    w.addDocument(doc);
  }
  IndexReader reader = w.getReader();
  w.close();

  assertNull(MultiFields.getTermPositionsEnum(reader, null, "foo", new BytesRef("test")));

  DocsEnum de = TestUtil.docs(random(), reader, "foo", new BytesRef("test"), null, null, DocsEnum.FLAG_FREQS);
  while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(2, de.freq());
  }

  reader.close();
  dir.close();
}
private void decodeMetaData() {
  if (!didDecode) {
    buffer.reset(current.output.bytes, current.output.offset, current.output.length);
    docFreq = buffer.readVInt();
    if (field.getIndexOptions() != IndexOptions.DOCS_ONLY) {
      totalTermFreq = docFreq + buffer.readVLong();
    } else {
      totalTermFreq = -1;
    }
    postingsSpare.bytes = current.output.bytes;
    postingsSpare.offset = buffer.getPosition();
    postingsSpare.length = current.output.length - (buffer.getPosition() - current.output.offset);
    //System.out.println("  df=" + docFreq + " totTF=" + totalTermFreq + " offset=" + buffer.getPosition() + " len=" + current.output.length);
    didDecode = true;
  }
}
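// Writer-side convention implied by decodeMetaData() above: a term's totalTermFreq
// is always >= its docFreq, so only the non-negative difference is written, which
// keeps the vLong small. A hedged sketch of the matching encoder (hypothetical
// helper, not the actual writer code):
void encodeMetaData(DataOutput out, int docFreq, long totalTermFreq, IndexOptions indexOptions) throws IOException {
  out.writeVInt(docFreq);
  if (indexOptions != IndexOptions.DOCS_ONLY) {
    out.writeVLong(totalTermFreq - docFreq); // reader recovers it as docFreq + readVLong()
  }
}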
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) {
  boolean hasOffsets = field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  if (field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
    return null;
  }
  decodeMetaData();
  FSTDocsAndPositionsEnum docsAndPositionsEnum;
  if (reuse == null || !(reuse instanceof FSTDocsAndPositionsEnum)) {
    docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.hasPayloads(), hasOffsets);
  } else {
    docsAndPositionsEnum = (FSTDocsAndPositionsEnum) reuse;
    if (!docsAndPositionsEnum.canReuse(field.hasPayloads(), hasOffsets)) {
      docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.hasPayloads(), hasOffsets);
    }
  }
  //System.out.println("D&P reset this=" + this);
  return docsAndPositionsEnum.reset(postingsSpare, liveDocs, docFreq);
}
/** test that when freqs are omitted, totalTermFreq and sumTotalTermFreq are -1 */
public void testStats() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir,
      newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_ONLY);
  ft.freeze();
  Field f = newField("foo", "bar", ft);
  doc.add(f);
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  assertEquals(-1, ir.totalTermFreq(new Term("foo", new BytesRef("bar"))));
  assertEquals(-1, ir.getSumTotalTermFreq("foo"));
  ir.close();
  dir.close();
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
  if (!wroteTerm) {
    // we lazily do this, in case the term had zero docs
    write(TERM);
    write(term);
    newline();
    wroteTerm = true;
  }

  write(DOC);
  write(Integer.toString(docID));
  newline();
  if (indexOptions != IndexOptions.DOCS_ONLY) {
    write(FREQ);
    write(Integer.toString(termDocFreq));
    newline();
  }

  lastStartOffset = 0;
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
  if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
    // Positions were not indexed
    return null;
  }

  SimpleTextDocsAndPositionsEnum docsAndPositionsEnum;
  if (reuse != null && reuse instanceof SimpleTextDocsAndPositionsEnum
      && ((SimpleTextDocsAndPositionsEnum) reuse).canReuse(SimpleTextFieldsReader.this.in)) {
    docsAndPositionsEnum = (SimpleTextDocsAndPositionsEnum) reuse;
  } else {
    docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum();
  }
  return docsAndPositionsEnum.reset(docsStart, liveDocs, indexOptions, docFreq);
}
private void checkTokens(Token[] tokens) throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
  boolean success = false;
  try {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);

    Document doc = new Document();
    doc.add(new Field("body", new CannedTokenStream(tokens), ft));
    riw.addDocument(doc);
    success = true;
  } finally {
    if (success) {
      IOUtils.close(riw, dir);
    } else {
      IOUtils.closeWhileHandlingException(riw, dir);
    }
  }
}
@Override
public void startTerm() throws IOException {
  docIndex.mark();
  //System.out.println("SEPW: startTerm docIndex=" + docIndex);

  if (indexOptions != IndexOptions.DOCS_ONLY) {
    freqIndex.mark();
  }

  if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
    posIndex.mark();
    payloadStart = payloadOut.getFilePointer();
    lastPayloadLength = -1;
  }

  skipListWriter.resetSkip(docIndex, freqIndex, posIndex);
}
/** Adds a new doc in this term. */
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
  final int delta = docID - lastDocID;
  //System.out.println("SEPW: startDoc: write doc=" + docID + " delta=" + delta + " out.fp=" + docOut);

  if (docID < 0 || (df > 0 && delta <= 0)) {
    throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + ") (docOut: " + docOut + ")");
  }

  if ((++df % skipInterval) == 0) {
    // TODO: -- awkward we have to make these two
    // separate calls to skipper
    //System.out.println("    buffer skip lastDocID=" + lastDocID);
    skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
    skipListWriter.bufferSkip(df);
  }

  lastDocID = docID;
  docOut.write(delta);
  if (indexOptions != IndexOptions.DOCS_ONLY) {
    //System.out.println("  sepw startDoc: write freq=" + termDocFreq);
    freqOut.write(termDocFreq);
  }
}
public void testLegalbutVeryLargeOffsets() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
  if (random().nextBoolean()) {
    t1.setPayload(new BytesRef("test"));
  }
  Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
  TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  // store some term vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", tokenStream, ft);
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, final int flags) throws IOException {
  final DocsEnum inReuse;
  final SortingDocsEnum wrapReuse;
  if (reuse != null && reuse instanceof SortingDocsEnum) {
    // if we're asked to reuse the given DocsEnum and it is Sorting, return
    // the wrapped one, since some Codecs expect it.
    wrapReuse = (SortingDocsEnum) reuse;
    inReuse = wrapReuse.getWrapped();
  } else {
    wrapReuse = null;
    inReuse = reuse;
  }

  final DocsEnum inDocs = in.docs(newToOld(liveDocs), inReuse, flags);
  final boolean withFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0
      && (flags & DocsEnum.FLAG_FREQS) != 0;
  return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, withFreqs, docMap);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
  if (fr.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
    // Positions were not indexed:
    return null;
  }

  currentFrame.decodeMetaData();
  return fr.parent.postingsReader.docsAndPositions(fr.fieldInfo, currentFrame.termState, skipDocs, reuse, flags);
}
/** Optimized implementation. */
public int read(final int[] docs, final int[] freqs) throws IOException {
  final int length = docs.length;
  if (indexOptions == IndexOptions.DOCS_ONLY) {
    return readNoTf(docs, freqs, length);
  } else {
    int i = 0;
    while (i < length && count < df) {
      // manually inlined call to next() for speed
      final int docCode = freqStream.readVInt();
      doc += docCode >>> 1;             // shift off low bit
      if ((docCode & 1) != 0) {         // if low bit is set
        freq = 1;                       // freq is one
      } else {
        freq = freqStream.readVInt();   // else read freq
      }
      count++;

      if (liveDocs == null || liveDocs.get(doc)) {
        docs[i] = doc;
        freqs[i] = freq;
        ++i;
      }
    }
    return i;
  }
}
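// The loop above decodes Lucene's classic .frq encoding: each doc delta is shifted
// left one bit and the low bit flags freq == 1, so the very common freq-1 case costs
// no extra vInt. A hedged sketch of the inverse operation (hypothetical helper, not
// the actual writer code):
static void writeDocCode(DataOutput out, int docDelta, int freq) throws IOException {
  if (freq == 1) {
    out.writeVInt((docDelta << 1) | 1); // low bit set: freq is implicitly one
  } else {
    out.writeVInt(docDelta << 1);       // low bit clear: an explicit freq follows
    out.writeVInt(freq);
  }
}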
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
  PreDocsAndPositionsEnum docsPosEnum;
  if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
    return null;
  } else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) {
    docsPosEnum = new PreDocsAndPositionsEnum();
  } else {
    docsPosEnum = (PreDocsAndPositionsEnum) reuse;
    if (docsPosEnum.getFreqStream() != freqStream) {
      docsPosEnum = new PreDocsAndPositionsEnum();
    }
  }
  return docsPosEnum.reset(termEnum, liveDocs);
}
public final int nextPosition() throws IOException {
  if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
    // This field does not store positions, payloads
    return 0;
  }
  // perform lazy skips if necessary
  lazySkip();
  proxCount--;
  return position += readDeltaPosition();
}
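// Positions are likewise delta-coded within each document, which is why
// nextPosition() accumulates with "position +=". An illustrative decoder for one
// document's positions, assuming the simple no-payload layout (hypothetical helper,
// not the actual reader code):
static int[] readPositions(IndexInput proxStream, int freq) throws IOException {
  int[] positions = new int[freq];
  int position = 0;
  for (int i = 0; i < freq; i++) {
    position += proxStream.readVInt(); // e.g. positions 4, 10, 12 arrive as deltas 4, 6, 2
    positions[i] = position;
  }
  return positions;
}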
private void _decodeTerm(DataInput in, FieldInfo fieldInfo, IntBlockTermState termState) throws IOException {
  final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  final boolean fieldHasPayloads = fieldInfo.hasPayloads();

  if (termState.docFreq == 1) {
    termState.singletonDocID = in.readVInt();
  } else {
    termState.singletonDocID = -1;
    termState.docStartFP += in.readVLong();
  }
  if (fieldHasPositions) {
    termState.posStartFP += in.readVLong();
    if (termState.totalTermFreq > BLOCK_SIZE) {
      termState.lastPosBlockOffset = in.readVLong();
    } else {
      termState.lastPosBlockOffset = -1;
    }
    if ((fieldHasPayloads || fieldHasOffsets) && termState.totalTermFreq >= BLOCK_SIZE) {
      termState.payStartFP += in.readVLong();
    }
  }
  if (termState.docFreq > BLOCK_SIZE) {
    termState.skipOffset = in.readVLong();
  } else {
    termState.skipOffset = -1;
  }
}
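// Note the "+=" accumulations above: docStartFP, posStartFP and payStartFP are
// encoded as deltas from the previous term's file pointers. A hedged sketch of the
// writer-side half of that contract (hypothetical helper, not Lucene41PostingsWriter):
long lastDocStartFP = 0;
void writeDocStartFP(DataOutput out, long docStartFP) throws IOException {
  out.writeVLong(docStartFP - lastDocStartFP); // the reader accumulates these deltas back
  lastDocStartFP = docStartFP;
}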
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
                                             DocsAndPositionsEnum reuse, int flags) throws IOException {
  boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  boolean indexHasPayloads = fieldInfo.hasPayloads();

  if ((!indexHasOffsets || (flags & DocsAndPositionsEnum.FLAG_OFFSETS) == 0)
      && (!indexHasPayloads || (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) == 0)) {
    BlockDocsAndPositionsEnum docsAndPositionsEnum;
    if (reuse instanceof BlockDocsAndPositionsEnum) {
      docsAndPositionsEnum = (BlockDocsAndPositionsEnum) reuse;
      if (!docsAndPositionsEnum.canReuse(docIn, fieldInfo)) {
        docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo);
      }
    } else {
      docsAndPositionsEnum = new BlockDocsAndPositionsEnum(fieldInfo);
    }
    return docsAndPositionsEnum.reset(liveDocs, (IntBlockTermState) termState);
  } else {
    EverythingEnum everythingEnum;
    if (reuse instanceof EverythingEnum) {
      everythingEnum = (EverythingEnum) reuse;
      if (!everythingEnum.canReuse(docIn, fieldInfo)) {
        everythingEnum = new EverythingEnum(fieldInfo);
      }
    } else {
      everythingEnum = new EverythingEnum(fieldInfo);
    }
    return everythingEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
  }
}
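// Usage sketch: which branch runs above is driven entirely by the caller's flags.
// A consumer that asks for neither offsets nor payloads gets the leaner
// BlockDocsAndPositionsEnum; requesting either routes to EverythingEnum
// (hypothetical call site, assuming fieldInfo/termState/liveDocs are in scope):
DocsAndPositionsEnum lean = postingsReader.docsAndPositions(fieldInfo, termState, liveDocs, null, 0);
DocsAndPositionsEnum full = postingsReader.docsAndPositions(fieldInfo, termState, liveDocs, null,
    DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS);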
public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
  this.startDocIn = Lucene41PostingsReader.this.docIn;
  this.docIn = null;
  indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
  indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  indexHasPayloads = fieldInfo.hasPayloads();
  encoded = new byte[MAX_ENCODED_SIZE];
}
public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException {
  this.startDocIn = Lucene41PostingsReader.this.docIn;
  this.docIn = null;
  this.posIn = Lucene41PostingsReader.this.posIn.clone();
  encoded = new byte[MAX_ENCODED_SIZE];
  indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  indexHasPayloads = fieldInfo.hasPayloads();
}
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
  this.startDocIn = Lucene41PostingsReader.this.docIn;
  this.docIn = null;
  this.posIn = Lucene41PostingsReader.this.posIn.clone();
  this.payIn = Lucene41PostingsReader.this.payIn.clone();
  encoded = new byte[MAX_ENCODED_SIZE];
  indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  if (indexHasOffsets) {
    offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
    offsetLengthBuffer = new int[MAX_DATA_SIZE];
  } else {
    offsetStartDeltaBuffer = null;
    offsetLengthBuffer = null;
    startOffset = -1;
    endOffset = -1;
  }

  indexHasPayloads = fieldInfo.hasPayloads();
  if (indexHasPayloads) {
    payloadLengthBuffer = new int[MAX_DATA_SIZE];
    payloadBytes = new byte[128];
    payload = new BytesRef();
  } else {
    payloadLengthBuffer = null;
    payloadBytes = null;
    payload = null;
  }
}
private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, boolean isIndexed,
                                      boolean storeTermVector, boolean omitNorms, boolean storePayloads,
                                      IndexOptions indexOptions, DocValuesType docValues, DocValuesType normType) {
  FieldInfo fi = fieldInfo(name);
  if (fi == null) {
    // This field wasn't yet added to this in-RAM
    // segment's FieldInfo, so now we get a global
    // number for this field. If the field was seen
    // before then we'll get the same name and number,
    // else we'll allocate a new one:
    final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, docValues);
    fi = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector, omitNorms, storePayloads,
        indexOptions, docValues, normType, -1, null);
    assert !byName.containsKey(fi.name);
    globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
    byName.put(fi.name, fi);
  } else {
    fi.update(isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions);

    if (docValues != null) {
      // Only pay the synchronization cost if fi does not already have a DVType
      boolean updateGlobal = !fi.hasDocValues();
      if (updateGlobal) {
        // Must also update docValuesType map so it's
        // aware of this field's DocValueType. This will throw IllegalArgumentException if
        // an illegal type change was attempted.
        globalFieldNumbers.setDocValuesType(fi.number, name, docValues);
      }

      fi.setDocValuesType(docValues); // this will also perform the consistency check.
    }

    if (!fi.omitsNorms() && normType != null) {
      fi.setNormValueType(normType);
    }
  }
  return fi;
}
@Nightly
public void testTooManyTokens() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  doc.add(new Field("foo", new TokenStream() {
    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    long num = 0;

    @Override
    public boolean incrementToken() throws IOException {
      // use a long literal here: "Integer.MAX_VALUE + 1" would overflow int
      // arithmetic and the stopping guard would never fire
      if (num == Integer.MAX_VALUE + 1L) {
        return false;
      }
      clearAttributes();
      if (num == 0) {
        posIncAtt.setPositionIncrement(1);
      } else {
        posIncAtt.setPositionIncrement(0);
      }
      termAtt.append("a");
      num++;
      if (VERBOSE && num % 1000000 == 0) {
        System.out.println("indexed: " + num);
      }
      return true;
    }
  }, ft));

  try {
    iw.addDocument(doc);
    fail("didn't hit exception");
  } catch (IllegalArgumentException expected) {
    assertTrue(expected.getMessage().contains("too many tokens"));
  }
  iw.close();
  dir.close();
}
public void testOneSentence() throws Exception {
  Directory dir = newDirectory();
  // use simpleanalyzer for more natural tokenization (else "test." is a token)
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

  FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field body = new Field("body", "", offsetsType);
  Document doc = new Document();
  doc.add(body);

  body.setStringValue("This is a test.");
  iw.addDocument(doc);
  body.setStringValue("Test a one sentence document.");
  iw.addDocument(doc);

  IndexReader ir = iw.getReader();
  iw.close();

  IndexSearcher searcher = newSearcher(ir);
  PostingsHighlighter highlighter = new PostingsHighlighter();
  Query query = new TermQuery(new Term("body", "test"));
  TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
  assertEquals(2, topDocs.totalHits);
  String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
  assertEquals(2, snippets.length);
  assertEquals("This is a <b>test</b>.", snippets[0]);
  assertEquals("<b>Test</b> a one sentence document.", snippets[1]);

  ir.close();
  dir.close();
}
public void testMultipleFields() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

  FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field body = new Field("body", "", offsetsType);
  Field title = new Field("title", "", offsetsType);
  Document doc = new Document();
  doc.add(body);
  doc.add(title);

  body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
  title.setStringValue("I am hoping for the best.");
  iw.addDocument(doc);
  body.setStringValue("Highlighting the first term. Hope it works.");
  title.setStringValue("But best may not be good enough.");
  iw.addDocument(doc);

  IndexReader ir = iw.getReader();
  iw.close();

  IndexSearcher searcher = newSearcher(ir);
  PostingsHighlighter highlighter = new PostingsHighlighter();
  BooleanQuery query = new BooleanQuery();
  query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD);
  query.add(new TermQuery(new Term("title", "best")), BooleanClause.Occur.SHOULD);
  TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
  assertEquals(2, topDocs.totalHits);
  Map<String,String[]> snippets = highlighter.highlightFields(new String[] { "body", "title" }, query, searcher, topDocs);
  assertEquals(2, snippets.size());
  assertEquals("Just a test <b>highlighting</b> from postings. ", snippets.get("body")[0]);
  assertEquals("<b>Highlighting</b> the first term. ", snippets.get("body")[1]);
  assertEquals("I am hoping for the <b>best</b>.", snippets.get("title")[0]);
  assertEquals("But <b>best</b> may not be good enough.", snippets.get("title")[1]);

  ir.close();
  dir.close();
}
public void testMultiplePassages() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

  FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field body = new Field("body", "", offsetsType);
  Document doc = new Document();
  doc.add(body);

  body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
  iw.addDocument(doc);
  body.setStringValue("This test is another test. Not a good sentence. Test test test test.");
  iw.addDocument(doc);

  IndexReader ir = iw.getReader();
  iw.close();

  IndexSearcher searcher = newSearcher(ir);
  PostingsHighlighter highlighter = new PostingsHighlighter();
  Query query = new TermQuery(new Term("body", "test"));
  TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
  assertEquals(2, topDocs.totalHits);
  String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2);
  assertEquals(2, snippets.length);
  assertEquals("This is a <b>test</b>. Just a <b>test</b> highlighting from postings. ", snippets[0]);
  assertEquals("This <b>test</b> is another <b>test</b>. ... <b>Test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[1]);

  ir.close();
  dir.close();
}
public void testBuddhism() throws Exception {
  String text = "This eight-volume set brings together seminal papers in Buddhist studies from a vast " +
                "range of academic disciplines published over the last forty years. With a new introduction " +
                "by the editor, this collection is a unique and unrivalled research resource for both " +
                "student and scholar. Coverage includes: - Buddhist origins; early history of Buddhism in " +
                "South and Southeast Asia - early Buddhist Schools and Doctrinal History; Theravada Doctrine " +
                "- the Origins and nature of Mahayana Buddhism; some Mahayana religious topics - Abhidharma " +
                "and Madhyamaka - Yogacara, the Epistemological tradition, and Tathagatagarbha - Tantric " +
                "Buddhism (Including China and Japan); Buddhism in Nepal and Tibet - Buddhism in South and " +
                "Southeast Asia, and - Buddhism in China, East Asia, and Japan.";
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer);

  FieldType positionsType = new FieldType(TextField.TYPE_STORED);
  positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field body = new Field("body", text, positionsType);
  Document document = new Document();
  document.add(body);
  iw.addDocument(document);

  IndexReader ir = iw.getReader();
  iw.close();

  IndexSearcher searcher = newSearcher(ir);
  PhraseQuery query = new PhraseQuery();
  query.add(new Term("body", "buddhist"));
  query.add(new Term("body", "origins"));
  TopDocs topDocs = searcher.search(query, 10);
  assertEquals(1, topDocs.totalHits);
  PostingsHighlighter highlighter = new PostingsHighlighter();
  String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2);
  assertEquals(1, snippets.length);
  assertTrue(snippets[0].contains("<b>Buddhist</b> <b>origins</b>"));

  ir.close();
  dir.close();
}
public void test() throws IOException {
  assertTrue(dir != null);
  assertTrue(fieldInfos != null);
  IndexReader reader = DirectoryReader.open(dir);
  Document doc = reader.document(0);
  assertTrue(doc != null);
  assertTrue(doc.getField(DocHelper.TEXT_FIELD_1_KEY) != null);

  Field field = (Field) doc.getField(DocHelper.TEXT_FIELD_2_KEY);
  assertTrue(field != null);
  assertTrue(field.fieldType().storeTermVectors());
  assertFalse(field.fieldType().omitNorms());
  assertTrue(field.fieldType().indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

  field = (Field) doc.getField(DocHelper.TEXT_FIELD_3_KEY);
  assertTrue(field != null);
  assertFalse(field.fieldType().storeTermVectors());
  assertTrue(field.fieldType().omitNorms());
  assertTrue(field.fieldType().indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

  field = (Field) doc.getField(DocHelper.NO_TF_KEY);
  assertTrue(field != null);
  assertFalse(field.fieldType().storeTermVectors());
  assertFalse(field.fieldType().omitNorms());
  assertTrue(field.fieldType().indexOptions() == IndexOptions.DOCS_ONLY);

  DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(DocHelper.TEXT_FIELD_3_KEY);
  reader.document(0, visitor);
  final List<IndexableField> fields = visitor.getDocument().getFields();
  assertEquals(1, fields.size());
  assertEquals(DocHelper.TEXT_FIELD_3_KEY, fields.get(0).name());
  reader.close();
}