/**
 * @return the estimate for loading the entire term set into field data, or 0 if unavailable
 */
public long estimateStringFieldData() {
    try {
        LeafReader reader = context.reader();
        Terms terms = reader.terms(getFieldName());

        Fields fields = reader.fields();
        final Terms fieldTerms = fields.terms(getFieldName());

        if (fieldTerms instanceof FieldReader) {
            final Stats stats = ((FieldReader) fieldTerms).getStats();
            long totalTermBytes = stats.totalTermBytes;
            if (logger.isTraceEnabled()) {
                logger.trace("totalTermBytes: {}, terms.size(): {}, terms.getSumDocFreq(): {}",
                        totalTermBytes, terms.size(), terms.getSumDocFreq());
            }
            long totalBytes = totalTermBytes + (2 * terms.size()) + (4 * terms.getSumDocFreq());
            return totalBytes;
        }
    } catch (Exception e) {
        logger.warn("Unable to estimate memory overhead", e);
    }
    return 0;
}
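A rough worked example of the estimate formula above; every number here is assumed purely for illustration, not measured from a real segment:

// Hypothetical per-segment statistics, chosen only to show how the estimate is computed.
long totalTermBytes = 8_000_000L;  // UTF-8 bytes of all unique terms (stats.totalTermBytes)
long numTerms       = 1_000_000L;  // terms.size()
long sumDocFreq     = 5_000_000L;  // terms.getSumDocFreq()
// total term bytes, plus 2 bytes per unique term, plus 4 bytes per term-document pairing
long estimate = totalTermBytes + (2 * numTerms) + (4 * sumDocFreq); // 30,000,000 bytes (~28.6 MB)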
@Override
protected MultiTermVectorsResponse executeMultiTermVectors(MultiTermVectorsRequest mtvRequest) {
    try {
        MultiTermVectorsItemResponse[] responses = new MultiTermVectorsItemResponse[mtvRequest.size()];
        int i = 0;
        for (TermVectorsRequest request : mtvRequest) {
            TermVectorsResponse response = new TermVectorsResponse(request.index(), request.type(), request.id());
            response.setExists(true);
            Fields generatedFields;
            if (request.doc() != null) {
                generatedFields = generateFields(randomFields, request.doc().utf8ToString());
            } else {
                generatedFields = generateFields(
                        request.selectedFields().toArray(new String[request.selectedFields().size()]),
                        request.id());
            }
            EnumSet<TermVectorsRequest.Flag> flags =
                    EnumSet.of(TermVectorsRequest.Flag.Positions, TermVectorsRequest.Flag.Offsets);
            response.setFields(generatedFields, request.selectedFields(), flags, generatedFields);
            responses[i++] = new MultiTermVectorsItemResponse(response, null);
        }
        return new MultiTermVectorsResponse(responses);
    } catch (IOException ex) {
        throw new ElasticsearchException("boom", ex);
    }
}
public void testDuelESLucene() throws Exception {
    TestFieldSetting[] testFieldSettings = getFieldSettings();
    createIndexBasedOnFieldSettings("test", "alias", testFieldSettings);
    // we generate as many docs as we have shards
    TestDoc[] testDocs = generateTestDocs("test", testFieldSettings);
    DirectoryReader directoryReader = indexDocsWithLucene(testDocs);
    TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings);

    for (TestConfig test : testConfigs) {
        TermVectorsRequestBuilder request = getRequestForConfig(test);
        if (test.expectedException != null) {
            assertThrows(request, test.expectedException);
            continue;
        }
        TermVectorsResponse response = request.get();
        Fields luceneTermVectors = getTermVectorsFromLucene(directoryReader, test.doc);
        validateResponse(response, luceneTermVectors, test);
    }
}
@Override
public Fields get(int docID) throws IOException {
    if (tvx != null) {
        Fields fields = new TVFields(docID);
        if (fields.size() == 0) {
            // TODO: we can improve writer here, e.g. write 0 into
            // tvx file, so we know on first read from tvx that
            // this doc has no TVs
            return null;
        } else {
            return fields;
        }
    } else {
        return null;
    }
}
/**
 * Merges in the term vectors from the readers in
 * <code>mergeState</code>. The default implementation skips
 * over deleted documents, and uses {@link #startDocument(int)},
 * {@link #startField(FieldInfo, int, boolean, boolean, boolean)},
 * {@link #startTerm(BytesRef, int)}, {@link #addPosition(int, int, int, BytesRef)},
 * and {@link #finish(FieldInfos, int)},
 * returning the number of documents that were written.
 * Implementations can override this method for more sophisticated
 * merging (bulk-byte copying, etc.).
 */
public int merge(MergeState mergeState) throws IOException {
    int docCount = 0;
    for (int i = 0; i < mergeState.readers.size(); i++) {
        final AtomicReader reader = mergeState.readers.get(i);
        final int maxDoc = reader.maxDoc();
        final Bits liveDocs = reader.getLiveDocs();
        for (int docID = 0; docID < maxDoc; docID++) {
            if (liveDocs != null && !liveDocs.get(docID)) {
                // skip deleted docs
                continue;
            }
            // NOTE: it's very important to first assign to vectors then pass it to
            // termVectorsWriter.addAllDocVectors; see LUCENE-1282
            Fields vectors = reader.getTermVectors(docID);
            addAllDocVectors(vectors, mergeState);
            docCount++;
            mergeState.checkAbort.work(300);
        }
    }
    finish(mergeState.fieldInfos, docCount);
    return docCount;
}
public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
    super(indices);

    // build a search request with a query of all the terms
    final BoolQueryBuilder boolBuilder = boolQuery();
    for (String fieldName : termVectorsFields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = termVectorsFields.terms(fieldName);
        TermsEnum iterator = terms.iterator();
        while (iterator.next() != null) {
            String text = iterator.term().utf8ToString();
            boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
        }
    }
    // wrap a search request object
    this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}
@Override
public Fields fields() throws IOException {
    return new FilterFields(super.fields()) {
        @Override
        public Terms terms(String field) throws IOException {
            return super.terms(DelegatingAtomicReader.FIELD_NAME);
        }

        @Override
        public Iterator<String> iterator() {
            return Collections.singletonList(DelegatingAtomicReader.FIELD_NAME).iterator();
        }

        @Override
        public int size() {
            return 1;
        }
    };
}
/**
 * A convenience method that tries to first get a TermPositionVector for the
 * specified docId, then falls back to using the passed in
 * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
 * This is useful when you already have the document, but would prefer to use
 * the vector first.
 *
 * @param reader   The {@link org.apache.lucene.index.IndexReader} to use to try
 *                 and get the vector from
 * @param docId    The docId to retrieve.
 * @param field    The field to retrieve on the document
 * @param doc      The document to fall back on
 * @param analyzer The analyzer to use for creating the TokenStream if the
 *                 vector doesn't exist
 * @return The {@link org.apache.lucene.analysis.TokenStream} for the
 *         {@link org.apache.lucene.index.IndexableField} on the
 *         {@link org.apache.lucene.document.Document}
 * @throws IOException if there was an error loading
 */
public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Document doc,
        Analyzer analyzer) throws IOException {
    TokenStream ts = null;

    Fields vectors = reader.getTermVectors(docId);
    if (vectors != null) {
        Terms vector = vectors.terms(field);
        if (vector != null) {
            ts = getTokenStream(vector);
        }
    }

    // No token info stored so fall back to analyzing raw content
    if (ts == null) {
        ts = getTokenStream(doc, field, analyzer);
    }
    return ts;
}
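A minimal usage sketch for the method above, assuming it is called through Lucene's TokenSources helper and that searcher, analyzer, highlighter, and the field name "body" already exist in the caller's context (all of these names are assumptions, not taken from the snippet):

// Fetch the best available TokenStream for a hit and highlight it; the stream may come
// from a stored term vector or from re-analysis of the stored field, as described above.
Document doc = searcher.doc(docId);
TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), docId, "body", doc, analyzer);
String fragment = highlighter.getBestFragment(tokenStream, doc.get("body"));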
/**
 * A convenience method that tries a number of approaches to getting a token
 * stream. The cost of finding that there are no term vectors in the index is
 * minimal (1000 invocations still register 0 ms), so this "lazy" (flexible?)
 * approach to coding is probably acceptable.
 *
 * @return null if the field is not stored correctly
 * @throws IOException If there is a low-level I/O error
 */
public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        throws IOException {
    TokenStream ts = null;

    Fields vectors = reader.getTermVectors(docId);
    if (vectors != null) {
        Terms vector = vectors.terms(field);
        if (vector != null) {
            ts = getTokenStream(vector);
        }
    }

    // No token info stored so fall back to analyzing raw content
    if (ts == null) {
        ts = getTokenStream(reader, docId, field, analyzer);
    }
    return ts;
}
/**
 * Returns a {@link TokenStream} with positions and offsets constructed from
 * field termvectors. If the field has no termvectors, or positions or offsets
 * are not included in the termvector, return null.
 *
 * @param reader the {@link IndexReader} to retrieve term vectors from
 * @param docId  the document to retrieve termvectors for
 * @param field  the field to retrieve termvectors for
 * @return a {@link TokenStream}, or null if positions and offsets are not available
 * @throws IOException If there is a low-level I/O error
 */
public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, String field) throws IOException {
    Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        return null;
    }
    Terms vector = vectors.terms(field);
    if (vector == null) {
        return null;
    }
    if (!vector.hasPositions() || !vector.hasOffsets()) {
        return null;
    }
    return getTokenStream(vector);
}
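A small sketch of how a caller might combine the null-returning contract above with a re-analysis fallback; reader, docId, analyzer, and the field name "body" are assumed to exist in the caller's scope, and "body" is assumed to be a stored field:

TokenStream ts = TokenSources.getTokenStreamWithOffsets(reader, docId, "body");
if (ts == null) {
    // no term vector with both positions and offsets; re-analyze the stored text instead
    String storedText = reader.document(docId).get("body"); // assumes the field was stored
    ts = analyzer.tokenStream("body", new StringReader(storedText));
}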
@Override
public void createWeight(Map context, IndexSearcher searcher) throws IOException {
    long sumTotalTermFreq = 0;
    for (AtomicReaderContext readerContext : searcher.getTopReaderContext().leaves()) {
        Fields fields = readerContext.reader().fields();
        if (fields == null) continue;
        Terms terms = fields.terms(indexedField);
        if (terms == null) continue;
        long v = terms.getSumTotalTermFreq();
        if (v == -1) {
            sumTotalTermFreq = -1;
            break;
        } else {
            sumTotalTermFreq += v;
        }
    }
    final long ttf = sumTotalTermFreq;
    context.put(this, new LongDocValues(this) {
        @Override
        public long longVal(int doc) {
            return ttf;
        }
    });
}
/**
 * Fields api equivalency
 */
public void assertFieldsEquals(String info, IndexReader leftReader, Fields leftFields, Fields rightFields,
        boolean deep) throws IOException {
    // Fields could be null if there are no postings,
    // but then it must be null for both
    if (leftFields == null || rightFields == null) {
        assertNull(info, leftFields);
        assertNull(info, rightFields);
        return;
    }
    assertFieldStatisticsEquals(info, leftFields, rightFields);

    Iterator<String> leftEnum = leftFields.iterator();
    Iterator<String> rightEnum = rightFields.iterator();

    while (leftEnum.hasNext()) {
        String field = leftEnum.next();
        assertEquals(info, field, rightEnum.next());
        assertTermsEquals(info, leftReader, leftFields.terms(field), rightFields.terms(field), deep);
    }
    assertFalse(rightEnum.hasNext());
}
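A typical call site for the helper above, assuming leftReader and rightReader are two open readers expected to contain equivalent postings (both names are assumptions for illustration):

assertFieldsEquals("readers should expose identical postings",
        leftReader, MultiFields.getFields(leftReader), MultiFields.getFields(rightReader), true);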
/**
 * checks that norms are the same across all fields
 */
public void assertNormsEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException {
    Fields leftFields = MultiFields.getFields(leftReader);
    Fields rightFields = MultiFields.getFields(rightReader);
    // Fields could be null if there are no postings,
    // but then it must be null for both
    if (leftFields == null || rightFields == null) {
        assertNull(info, leftFields);
        assertNull(info, rightFields);
        return;
    }

    for (String field : leftFields) {
        NumericDocValues leftNorms = MultiDocValues.getNormValues(leftReader, field);
        NumericDocValues rightNorms = MultiDocValues.getNormValues(rightReader, field);
        if (leftNorms != null && rightNorms != null) {
            assertDocValuesEquals(info, leftReader.maxDoc(), leftNorms, rightNorms);
        } else {
            assertNull(info, leftNorms);
            assertNull(info, rightNorms);
        }
    }
}
private void testTermVectors() throws Exception {
    // check:
    int numDocs = reader.numDocs();
    long start = 0L;
    for (int docId = 0; docId < numDocs; docId++) {
        start = System.currentTimeMillis();
        Fields vectors = reader.getTermVectors(docId);
        timeElapsed += System.currentTimeMillis() - start;

        // verify vectors result
        verifyVectors(vectors, docId);

        start = System.currentTimeMillis();
        Terms vector = reader.getTermVectors(docId).terms("field");
        timeElapsed += System.currentTimeMillis() - start;

        verifyVector(vector.iterator(null), docId);
    }
}
/**
 * Returns the first document number containing the term <code>t</code>.
 * Returns -1 if no document was found.
 * This method is primarily intended for clients that want to fetch
 * documents using a unique identifier.
 *
 * @return the first document number containing the term
 */
public int getFirstMatch(Term t) throws IOException {
    Fields fields = atomicReader.fields();
    if (fields == null) return -1;
    Terms terms = fields.terms(t.field());
    if (terms == null) return -1;
    BytesRef termBytes = t.bytes();
    final TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(termBytes)) {
        return -1;
    }
    DocsEnum docs = termsEnum.docs(atomicReader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
    if (docs == null) return -1;
    int id = docs.nextDoc();
    return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
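A sketch of the primary-key style lookup the Javadoc above describes; the "id" field name, the example value, and the surrounding atomicReader are assumptions made for illustration:

int docId = getFirstMatch(new Term("id", "order-42"));
if (docId != -1) {
    // load the stored fields of the single matching document
    Document doc = atomicReader.document(docId);
}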
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
    Fields fields = MultiFields.getFields(r);
    if (fields == null) return -1;
    Terms terms = fields.terms(t.field());
    if (terms == null) return -1;
    BytesRef termBytes = t.bytes();
    final TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(termBytes)) {
        return -1;
    }
    DocsEnum docs = termsEnum.docs(MultiFields.getLiveDocs(r), null, DocsEnum.FLAG_NONE);
    int id = docs.nextDoc();
    if (id != DocIdSetIterator.NO_MORE_DOCS) {
        int next = docs.nextDoc();
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
    }
    return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
/**
 * @param reader the index reader to compute idf values from
 * @return map from term text to its inverse document frequency
 * @throws IOException if the terms cannot be read from the index
 */
public Map<String, Float> getIdfs(IndexReader reader) throws IOException {
    Fields fields = MultiFields.getFields(reader); // get the fields of the index
    for (String field : fields) {
        TermsEnum termEnum = MultiFields.getTerms(reader, field).iterator(null);
        BytesRef bytesRef;
        while ((bytesRef = termEnum.next()) != null) {
            // seekExact on the term just returned by next() always succeeds here
            if (termEnum.seekExact(bytesRef)) {
                String term = bytesRef.utf8ToString();
                float idf = tfidfSIM.idf(termEnum.docFreq(), reader.numDocs());
                inverseDocFreq.put(term, idf);
                System.out.println(term + " idf= " + idf);
            }
        }
    }
    return inverseDocFreq;
}
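A hedged usage sketch for the helper above, assuming it is called from within the same class, a Lucene 4.x FSDirectory.open(File) signature, and a purely illustrative index path:

try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/example-index")))) {
    Map<String, Float> idfs = getIdfs(reader);
    // returns null if the term never occurs in the index
    System.out.println("idf(lucene) = " + idfs.get("lucene"));
}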
/**
 * Returns the first document number containing the term <code>t</code>.
 * Returns -1 if no document was found.
 * This method is primarily intended for clients that want to fetch
 * documents using a unique identifier.
 *
 * @return the first document number containing the term
 */
public int getFirstMatch(Term t) throws IOException {
    Fields fields = atomicReader.fields();
    if (fields == null) return -1;
    Terms terms = fields.terms(t.field());
    if (terms == null) return -1;
    BytesRef termBytes = t.bytes();
    final TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(termBytes, false)) {
        return -1;
    }
    DocsEnum docs = termsEnum.docs(atomicReader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
    if (docs == null) return -1;
    int id = docs.nextDoc();
    return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
    Fields fields = MultiFields.getFields(r);
    if (fields == null) return -1;
    Terms terms = fields.terms(t.field());
    if (terms == null) return -1;
    BytesRef termBytes = t.bytes();
    final TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(termBytes, false)) {
        return -1;
    }
    DocsEnum docs = termsEnum.docs(MultiFields.getLiveDocs(r), null, DocsEnum.FLAG_NONE);
    int id = docs.nextDoc();
    if (id != DocIdSetIterator.NO_MORE_DOCS) {
        int next = docs.nextDoc();
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
    }
    return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
private void runOldMergeSortRowIdCheckAndDelete(boolean emitDeletes, IndexReader currentIndexReader,
        BlurPartitioner blurPartitioner, Text key, int numberOfShards, int shardId, Action action,
        AtomicReader atomicReader) throws IOException {
    MergeSortRowIdLookup lookup = new MergeSortRowIdLookup(currentIndexReader);
    Fields fields = atomicReader.fields();
    Terms terms = fields.terms(BlurConstants.ROW_ID);
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef ref = null;
        while ((ref = termsEnum.next()) != null) {
            key.set(ref.bytes, ref.offset, ref.length);
            int partition = blurPartitioner.getPartition(key, null, numberOfShards);
            if (shardId != partition) {
                throw new IOException("Index is corrupted, RowIds are found in the wrong shard, partition ["
                        + partition + "] does not match shard [" + shardId
                        + "], this can happen when rows are not hashed correctly.");
            }
            if (emitDeletes) {
                lookup.lookup(ref, action);
            }
        }
    }
}
private IterableRow getIterableRow(String rowId, IndexSearcherCloseable searcher) throws IOException {
    IndexReader indexReader = searcher.getIndexReader();
    BytesRef rowIdRef = new BytesRef(rowId);
    List<AtomicReaderTermsEnum> possibleRowIds = new ArrayList<AtomicReaderTermsEnum>();
    for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
        AtomicReader atomicReader = atomicReaderContext.reader();
        Fields fields = atomicReader.fields();
        if (fields == null) {
            continue;
        }
        Terms terms = fields.terms(BlurConstants.ROW_ID);
        if (terms == null) {
            continue;
        }
        TermsEnum termsEnum = terms.iterator(null);
        if (!termsEnum.seekExact(rowIdRef, true)) {
            continue;
        }
        // need atomic read as well...
        possibleRowIds.add(new AtomicReaderTermsEnum(atomicReader, termsEnum));
    }
    if (possibleRowIds.isEmpty()) {
        return null;
    }
    return new IterableRow(rowId, getRecords(possibleRowIds));
}
/**
 * Returns the first document number containing the term <code>t</code>. Returns -1 if no
 * document was found. This method is primarily intended for clients that want to fetch
 * documents using a unique identifier.
 *
 * @return the first document number containing the term
 */
public int getFirstMatch(Term t) throws IOException {
    Fields fields = atomicReader.fields();
    if (fields == null) return -1;
    Terms terms = fields.terms(t.field());
    if (terms == null) return -1;
    BytesRef termBytes = t.bytes();
    final TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(termBytes, false)) {
        return -1;
    }
    DocsEnum docs = termsEnum.docs(atomicReader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
    if (docs == null) return -1;
    int id = docs.nextDoc();
    return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}