/** * Iterates all terms in the given {@link TermsEnum} and associates each term's ordinal with the term's documents. * The caller must exhaust the returned {@link BytesRefIterator}, whose first returned value is associated with ordinal <tt>1</tt>, the second with ordinal <tt>2</tt>, and so on. * <p> * If the {@link TermsEnum} contains prefix-coded numerical values, it should be wrapped with either {@link #wrapNumeric32Bit(TermsEnum)} * or {@link #wrapNumeric64Bit(TermsEnum)}, depending on its precision. If the {@link TermsEnum} is not wrapped, the returned * {@link BytesRefIterator} will contain partial-precision terms rather than only full-precision terms. * </p> */ public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException { return new BytesRefIterator() { private PostingsEnum docsEnum = null; @Override public BytesRef next() throws IOException { BytesRef ref; if ((ref = termsEnum.next()) != null) { docsEnum = termsEnum.postings(docsEnum, PostingsEnum.NONE); nextOrdinal(); int docId; while ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { addDoc(docId); } } return ref; } }; }
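A hedged usage sketch of the contract described above; the builder instance that owns buildFromTerms, the leafReader, and the field name are illustrative stand-ins, and a numeric field's TermsEnum would first be wrapped as the Javadoc notes:

Terms terms = leafReader.terms("category");
if (terms != null) {
    BytesRefIterator values = builder.buildFromTerms(terms.iterator());
    // the iterator must be fully exhausted so every ordinal/document association is recorded
    for (BytesRef value = values.next(); value != null; value = values.next()) {
        // the first value corresponds to ordinal 1, the second to ordinal 2, and so on
    }
}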
public FreqTermsEnum(IndexReader reader, String field, boolean needDocFreq, boolean needTotalTermFreq, @Nullable Query filter, BigArrays bigArrays) throws IOException { super(reader, field, needTotalTermFreq ? PostingsEnum.FREQS : PostingsEnum.NONE, filter); this.bigArrays = bigArrays; this.needDocFreqs = needDocFreq; this.needTotalTermFreqs = needTotalTermFreq; if (needDocFreq) { termDocFreqs = bigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, false); } else { termDocFreqs = null; } if (needTotalTermFreq) { termsTotalFreqs = bigArrays.newLongArray(INITIAL_NUM_TERM_FREQS_CACHED, false); } else { termsTotalFreqs = null; } cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, bigArrays); }
private PostingsEnum writeTermWithDocsAndPos(TermsEnum iterator, PostingsEnum docsAndPosEnum, boolean positions, boolean offsets, boolean payloads) throws IOException { docsAndPosEnum = iterator.postings(docsAndPosEnum, PostingsEnum.ALL); // for each term (iterator next) in this field (field) // iterate over the docs (should only be one) int nextDoc = docsAndPosEnum.nextDoc(); assert nextDoc != DocIdSetIterator.NO_MORE_DOCS; final int freq = docsAndPosEnum.freq(); writeFreq(freq); for (int j = 0; j < freq; j++) { int curPos = docsAndPosEnum.nextPosition(); if (positions) { writePosition(curPos); } if (offsets) { writeOffsets(docsAndPosEnum.startOffset(), docsAndPosEnum.endOffset()); } if (payloads) { writePayload(docsAndPosEnum.getPayload()); } } nextDoc = docsAndPosEnum.nextDoc(); assert nextDoc == DocIdSetIterator.NO_MORE_DOCS; return docsAndPosEnum; }
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException { // start term, optimized writing BytesRef term = termIter.next(); spare.copyUTF8Bytes(term); builder.startObject(spare.toString()); buildTermStatistics(builder, termIter); // finally write the term vectors PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL); int termFreq = posEnum.freq(); builder.field(FieldStrings.TERM_FREQ, termFreq); initMemory(curTerms, termFreq); initValues(curTerms, posEnum, termFreq); buildValues(builder, curTerms, termFreq); buildScore(builder, boostAtt); builder.endObject(); }
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException { for (int j = 0; j < termFreq; j++) { int nextPos = posEnum.nextPosition(); if (curTerms.hasPositions()) { currentPositions[j] = nextPos; } if (curTerms.hasOffsets()) { currentStartOffset[j] = posEnum.startOffset(); currentEndOffset[j] = posEnum.endOffset(); } if (curTerms.hasPayloads()) { BytesRef curPayload = posEnum.getPayload(); if (curPayload != null) { currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length); } else { currentPayloads[j] = null; } } } }
public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) { this.postings = postings; this.position = position; nTerms = terms==null ? 0 : terms.length; if (nTerms>0) { if (terms.length==1) { this.terms = terms; } else { Term[] terms2 = new Term[terms.length]; System.arraycopy(terms, 0, terms2, 0, terms.length); Arrays.sort(terms2); this.terms = terms2; } } else { this.terms = null; } }
/** * Creates the TermsEnum (if not already created); must be called before any calls to getBackgroundFrequency. * @param context The aggregation context * @return The number of documents in the index (after the optional filter, if any, has been applied) */ public long prepareBackground(AggregationContext context) { if (termsEnum != null) { // already prepared - return return termsEnum.getNumDocs(); } SearchContext searchContext = context.searchContext(); IndexReader reader = searchContext.searcher().getIndexReader(); try { if (numberOfAggregatorsCreated == 1) { // Setup a termsEnum for sole use by one aggregator termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter); } else { // When we have more than one agg there is a possibility of duplicate term frequency lookups, // so use a TermsEnum that caches the results of all term lookups termsEnum = new FreqTermsEnum(reader, indexedFieldName, true, false, filter, searchContext.bigArrays()); } } catch (IOException e) { throw new ElasticsearchException("failed to build terms enumeration", e); } return termsEnum.getNumDocs(); }
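A sketch of the intended call order; getBackgroundFrequency(BytesRef) is assumed here to be the companion helper on the same class that seeks the prepared terms enum, and its exact signature is an assumption rather than something taken from the snippet above:

long supersetSize = prepareBackground(aggregationContext);        // must be called first
long supersetDf = getBackgroundFrequency(new BytesRef("error"));  // background doc frequency of one term
// supersetDf / supersetSize approximates the background probability of the term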
private NamedList<Object> buildEntryValue(long count, Term t, List<Entry<LeafReader, Bits>> leaves) throws IOException { NamedList<Object> entry = new NamedList<>(); entry.add("count", count); int i = -1; for (Entry<LeafReader, Bits> e : leaves) { PostingsEnum postings = e.getKey().postings(t, PostingsEnum.PAYLOADS); if (postings == null) { continue; /* term is absent from this segment */ } Bits liveDocs = e.getValue(); while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { if (liveDocs != null && !liveDocs.get(postings.docID())) { continue; } i++; NamedList<Object> documentEntry = new NamedList<>(); entry.add("doc" + i, documentEntry); for (int j = 0; j < postings.freq(); j++) { postings.nextPosition(); String extra = postings.getPayload().utf8ToString(); documentEntry.add("position" + j, extra); } } } return entry; }
/** * Gets the log-entropy weight of a term, i.e. 1 minus its normalized entropy (equivalently, 1 + sum(p log p)), * a function that favors terms that are focally distributed. * We use the definition of log-entropy weighting provided in * Martin and Berry (2007): * Entropy = 1 + sum ((Pij log2(Pij)) / log2(n)) * where Pij = frequency of term i in doc j / global frequency of term i * n = number of documents in collection * @param term the term whose entropy weight is wanted * Thanks to Vidya Vasuki for adding the hash table to * eliminate redundant calculation */ private float getEntropy(Term term) { if (termEntropy.containsKey(term)) return termEntropy.get(term); int gf = getGlobalTermFreq(term); double entropy = 0; try { PostingsEnum docsEnum = this.getDocsForTerm(term); while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { double p = docsEnum.freq(); //frequency in this document p = p / gf; //share of the term's global occurrences that fall in this document entropy += p * (Math.log(p) / Math.log(2)); //sum of Plog(P) } int n = this.getNumDocs(); double log2n = Math.log(n) / Math.log(2); entropy = entropy / log2n; } catch (IOException e) { logger.info("Couldn't get term entropy for term " + term.text()); } termEntropy.put(term, 1 + (float) entropy); return (float) (1 + entropy); }
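A quick numeric check of the weight above, using hypothetical values rather than anything read from an index: a term concentrated in a single document gets weight 1, while a term spread evenly over the whole collection gets weight 0, matching the claim that focally distributed terms are favored.

// n = 8 documents; weight = 1 + sum(p * log2 p) / log2 n
double log2n = Math.log(8) / Math.log(2);                                         // = 3
double focused = 1 + (1.0 * Math.log(1.0) / Math.log(2)) / log2n;                 // p = 1 in one doc   -> 1.0
double spread = 1 + 8 * ((1.0 / 8) * (Math.log(1.0 / 8) / Math.log(2))) / log2n;  // p = 1/8 in all docs -> 0.0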
public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{ List<String> textList = new ArrayList<String>(); BytesRef text; PostingsEnum postings = null; while ((text = te.next()) != null) { postings = te.postings(postings, PostingsEnum.FREQS); int iterDoc = postings.advance(docid); if (iterDoc == docid) { textList.add(text.utf8ToString()); } } StringBuilder buf = new StringBuilder(); for (String s : textList) { buf.append(s+" "); } return buf.toString(); }
@Override public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { if ((flags & PostingsEnum.POSITIONS) != 0) { thrower.maybeThrow(Flags.DocsAndPositionsEnum); } else { thrower.maybeThrow(Flags.DocsEnum); } return super.postings(reuse, flags); }
private FilterableTermsEnum getTermsEnum(String field) throws IOException { if (termsEnum != null) { return termsEnum; } IndexReader reader = context.searcher().getIndexReader(); if (numberOfAggregatorsCreated > 1) { termsEnum = new FreqTermsEnum(reader, field, true, false, filter, context.bigArrays()); } else { termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter); } return termsEnum; }
/** * Returns a per-segment DocIdSet containing the matching docs for the specified slice. */ private DocIdSet build(LeafReader reader) throws IOException { final DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc()); final Terms terms = reader.terms(getField()); if (terms == null) { return builder.build(); /* field has no terms in this segment */ } final TermsEnum te = terms.iterator(); PostingsEnum docsEnum = null; for (BytesRef term = te.next(); term != null; term = te.next()) { int hashCode = term.hashCode(); if (contains(hashCode)) { docsEnum = te.postings(docsEnum, PostingsEnum.NONE); builder.add(docsEnum); } } return builder.build(); }
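A minimal sketch of how such a per-segment set might be consumed; the leafReaderContext variable is illustrative:

DocIdSet docIdSet = build(leafReaderContext.reader());
DocIdSetIterator it = docIdSet.iterator();
if (it != null) {  // an empty set may expose a null iterator
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        // doc is a segment-local id whose term hashed into this slice
    }
}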
private int convertToLuceneFlags(int flags) { int lucenePositionsFlags = PostingsEnum.NONE; lucenePositionsFlags |= (flags & IndexLookup.FLAG_FREQUENCIES) > 0 ? PostingsEnum.FREQS : 0x0; lucenePositionsFlags |= (flags & IndexLookup.FLAG_POSITIONS) > 0 ? PostingsEnum.POSITIONS : 0x0; lucenePositionsFlags |= (flags & IndexLookup.FLAG_PAYLOADS) > 0 ? PostingsEnum.PAYLOADS : 0x0; lucenePositionsFlags |= (flags & IndexLookup.FLAG_OFFSETS) > 0 ? PostingsEnum.OFFSETS : 0x0; return lucenePositionsFlags; }
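For example, a hypothetical call requesting positions and payloads for a term the enum is already positioned on (the IndexLookup flag names are those used in the snippet above):

int luceneFlags = convertToLuceneFlags(IndexLookup.FLAG_POSITIONS | IndexLookup.FLAG_PAYLOADS);
PostingsEnum postings = termsEnum.postings(null, luceneFlags);  // POSITIONS | PAYLOADS on the Lucene side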
private PostingsEnum getPostings(int luceneFlags, LeafReader reader) throws IOException { assert identifier.field() != null; assert identifier.bytes() != null; final Fields fields = reader.fields(); PostingsEnum newPostings = null; if (fields != null) { final Terms terms = fields.terms(identifier.field()); if (terms != null) { TermsEnum termsEnum = terms.iterator(); if (termsEnum.seekExact(identifier.bytes())) { newPostings = termsEnum.postings(postings, luceneFlags); final Bits liveDocs = reader.getLiveDocs(); if (liveDocs != null) { newPostings = new FilterPostingsEnum(newPostings) { private int doNext(int d) throws IOException { for (; d != NO_MORE_DOCS; d = super.nextDoc()) { if (liveDocs.get(d)) { return d; } } return NO_MORE_DOCS; } @Override public int nextDoc() throws IOException { return doNext(super.nextDoc()); } @Override public int advance(int target) throws IOException { return doNext(super.advance(target)); } }; } } } } return newPostings; }
/** * Adds the terms and frequencies found in vector to the Map termFreqMap. * * @param termFreqMap a Map of terms and their frequencies * @param vector Terms and their frequencies for a doc/field * @param fieldName Optional field name, used when checking for skip terms */ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } if (isSkipTerm(fieldName, term)) { continue; } final PostingsEnum docs = termsEnum.postings(null); int freq = 0; while(docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { freq += docs.freq(); } // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
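A hedged usage sketch, assuming term vectors were stored for the field; the reader variable, document id, and field name are illustrative:

Terms vector = reader.getTermVector(docId, "body");  // null if no term vector was stored for this doc/field
if (vector != null) {
    Map<String, Int> termFreqMap = new HashMap<>();
    addTermFrequencies(termFreqMap, vector, "body");  // Int is the counter class used by the method above
}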
private PostingsEnum reset(int[] positions, int[] startOffsets, int[] endOffsets, BytesRefBuilder[] payloads, int freq) { curPos = -1; doc = -1; this.hasPositions = positions != null; this.hasOffsets = startOffsets != null; this.hasPayloads = payloads != null; this.freq = freq; this.startOffsets = startOffsets; this.endOffsets = endOffsets; this.payloads = payloads; this.positions = positions; return this; }
private PostingsEnum writeTermWithDocsOnly(TermsEnum iterator, PostingsEnum docsEnum) throws IOException { docsEnum = iterator.postings(docsEnum); int nextDoc = docsEnum.nextDoc(); assert nextDoc != DocIdSetIterator.NO_MORE_DOCS; writeFreq(docsEnum.freq()); nextDoc = docsEnum.nextDoc(); assert nextDoc == DocIdSetIterator.NO_MORE_DOCS; return docsEnum; }
public void testDefaultPositionIncrementGap() throws IOException { String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") .startObject("properties").startObject("field").field("type", "text").endObject().endObject() .endObject().endObject().string(); DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE, false); assertEquals(mapping, mapper.mappingSource().toString()); ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder() .startObject() .array("field", new String[] {"a", "b"}) .endObject() .bytes()); IndexableField[] fields = doc.rootDoc().getFields("field"); assertEquals(2, fields.length); assertEquals("a", fields[0].stringValue()); assertEquals("b", fields[1].stringValue()); IndexShard shard = indexService.getShard(0); shard.index(new Engine.Index(new Term("_uid", doc.uid() ), doc)); shard.refresh("test"); try (Engine.Searcher searcher = shard.acquireSearcher("test")) { LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader(); TermsEnum terms = leaf.terms("field").iterator(); assertTrue(terms.seekExact(new BytesRef("b"))); PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS); assertEquals(0, postings.nextDoc()); assertEquals(TextFieldMapper.Defaults.POSITION_INCREMENT_GAP + 1, postings.nextPosition()); } }
public void testPositionIncrementGap() throws IOException { final int positionIncrementGap = randomIntBetween(1, 1000); String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") .startObject("properties").startObject("field") .field("type", "text") .field("position_increment_gap", positionIncrementGap) .endObject().endObject() .endObject().endObject().string(); DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE, false); assertEquals(mapping, mapper.mappingSource().toString()); ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder() .startObject() .array("field", new String[] {"a", "b"}) .endObject() .bytes()); IndexableField[] fields = doc.rootDoc().getFields("field"); assertEquals(2, fields.length); assertEquals("a", fields[0].stringValue()); assertEquals("b", fields[1].stringValue()); IndexShard shard = indexService.getShard(0); shard.index(new Engine.Index(new Term("_uid", doc.uid()), doc)); shard.refresh("test"); try (Engine.Searcher searcher = shard.acquireSearcher("test")) { LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader(); TermsEnum terms = leaf.terms("field").iterator(); assertTrue(terms.seekExact(new BytesRef("b"))); PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS); assertEquals(0, postings.nextDoc()); assertEquals(positionIncrementGap + 1, postings.nextPosition()); } }
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException { String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; int[] freq = {1, 1, 1, 1, 1, 1, 1, 2}; int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}}; int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}}; int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}}; Terms terms = fields.terms(fieldName); assertThat(terms.size(), equalTo(8L)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, notNullValue()); // do not test ttf or doc frequency, because here we have many // shards and do not know how documents are distributed PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); if (withPayloads) { assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); } } } assertThat(iterator.next(), nullValue()); }
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException { Terms terms0 = fields0.terms(fieldName); Terms terms1 = fields1.terms(fieldName); assertThat(terms0, notNullValue()); assertThat(terms1, notNullValue()); assertThat(terms0.size(), equalTo(terms1.size())); TermsEnum iter0 = terms0.iterator(); TermsEnum iter1 = terms1.iterator(); for (int i = 0; i < terms0.size(); i++) { BytesRef next0 = iter0.next(); assertThat(next0, notNullValue()); BytesRef next1 = iter1.next(); assertThat(next1, notNullValue()); // compare field value String string0 = next0.utf8ToString(); String string1 = next1.utf8ToString(); assertThat("expected: " + string0, string0, equalTo(string1)); // compare df and ttf assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq())); assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq())); // compare freq and docs PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL); PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL); assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc())); assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq())); // compare position, start offsets and end offsets for (int j = 0; j < docsAndPositions0.freq(); j++) { assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition())); assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset())); assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset())); } } assertThat(iter0.next(), nullValue()); assertThat(iter1.next(), nullValue()); }
/** * Constructs an {@link AugmentedTermScorer}. * * @param weight * The weight of the <code>Term</code> in the query. * @param mainTerm * An iterator over the documents matching the main <code>Term</code>. * @param similarPostings * A list of <code>PostingsEnumWeightTuple</code> (term iterator, weight) pairs * @param docScorer * The <code>Similarity.SimScorer</code> implementation * to be used for score computations. */ public AugmentedTermScorer(Weight weight, PostingsEnum mainTerm, List<PostingsEnumWeightTuple> similarPostings, Similarity.SimScorer docScorer) { super(weight); this.postings = new PostingsEnumWeightTuple[similarPostings.size() + 1]; this.postings[0] = new PostingsEnumWeightTuple(mainTerm, 1f); for (int i = 0; i < similarPostings.size(); i++) { this.postings[i + 1] = similarPostings.get(i); } this.iterator = new MultiDocIdSetIterator(this.postings); this.docScorer = docScorer; }
public FieldFeatureTFIDFExtractor(PostingsEnum pe, int numDocs, int docFreq){ this.pe = pe; assert numDocs >= docFreq; this.numDocs = numDocs + 1; this.docFreq = docFreq <= 0 ? 1 : docFreq; idf = (float)Math.log((double)this.numDocs/(double)this.docFreq); }
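As a worked example of the smoothing above (illustrative numbers): with numDocs = 999 and docFreq = 10 the stored numDocs becomes 1000, so idf = ln(1000 / 10) = ln(100) ≈ 4.61; with docFreq = 0 the denominator is clamped to 1, giving idf = ln(1000) ≈ 6.91.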
static double[] getFeatures(IndexReader ir, String fieldName, BytesRef rawPhrase, int docId, int docSize, int numDocs, boolean inc) throws IOException { PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase); int ret = de.advance(docId); if(ret == PostingsEnum.NO_MORE_DOCS){ throw new RuntimeException("no more docs..."); } else{ int freq = de.freq(); if(freq < 2) return null; PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase); int ret2 = pe.advance(docId); if(ret2 == PostingsEnum.NO_MORE_DOCS){ throw new RuntimeException("no more docs..."); } else{ double[] features = new double[2]; int pos = pe.nextPosition(); int docFreq = ir.docFreq(new Term(fieldName, rawPhrase)); if(inc){ docFreq++; numDocs++; } features[0] = Commons.calcTfIdf(freq, docSize, docFreq, numDocs); features[1] = Commons.calcFirstOccurrence(pos, docSize); return features; } } }
private void processTermVectorsFields(Vectorizer vectorizer, Fields termVectorsFields) throws IOException { for (String fieldName : termVectorsFields) { TermsEnum termsEnum = termVectorsFields.terms(fieldName).iterator(); while (termsEnum.next() != null) { Term term = new Term(fieldName, termsEnum.term()); TermStatistics termStatistics = new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq()); PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL); postings.nextDoc(); /* advance to the single document backing this term vector before reading freq() */ int freq = postings.freq(); vectorizer.add(term, termStatistics, freq); } } }
/** Prints the terms indexed under the given fields with full postings information. */ public static void printFieldTermsWithInfo(LeafReader reader, String... fields) throws IOException { for (final String field : fields) { System.out.println(format("Terms for field [%s], with positional info:", field)); final TermsEnum te = reader.terms(field).iterator(); BytesRef scratch; PostingsEnum postings = null; while ((scratch = te.next()) != null) { System.out.println(format(" %s", scratch.utf8ToString())); postings = te.postings(postings, PostingsEnum.ALL); for (postings.nextDoc(); postings.docID() != DocIdSetIterator.NO_MORE_DOCS; postings.nextDoc()) { final Map<Integer, BytesRef> positions = Maps.newTreeMap(); boolean addedPayload = false; for (int i = 0; i < postings.freq(); i++) { final int pos = postings.nextPosition(); final BytesRef payload = postings.getPayload(); if (payload != null) { positions.put(pos, BytesRef.deepCopyOf(payload)); addedPayload = true; } else { positions.put(pos, null); } } if (addedPayload) { System.out.println(format(" doc=%d, freq=%d", postings.docID(), postings.freq())); for (final Entry<Integer, BytesRef> e : positions.entrySet()) { System.out.println(format(" pos=%d, payload=%s", e.getKey(), e.getValue())); } } else { System.out.println(format(" doc=%d, freq=%d, pos=%s", postings.docID(), postings.freq(), positions.keySet())); } } } } }
public static void printAnnotations(LeafReader reader, Term term) throws IOException { System.out.println("Annotations for " + term); final ByteArrayDataInput in = new ByteArrayDataInput(); final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS); for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) { final int freq = postings.freq(); System.out.println(" doc=" + docID + ", freq=" + freq); for (int i = 0; i < freq; i++) { postings.nextPosition(); final BytesRef payload = postings.getPayload(); in.reset(payload.bytes, payload.offset, payload.length); System.out.println(" start=" + in.readVInt() + ", length=" + in.readVInt()); } } }
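The reader above expects each payload to carry two VInts (start, then length). A minimal index-time sketch that would produce such payloads, assuming the annotation span coincides with the token's offsets; the filter class itself is hypothetical:

final class AnnotationPayloadFilter extends TokenFilter {
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final byte[] buffer = new byte[10];  // two VInts need at most 10 bytes
    private final ByteArrayDataOutput out = new ByteArrayDataOutput();

    AnnotationPayloadFilter(TokenStream in) {
        super(in);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        out.reset(buffer);
        out.writeVInt(offsetAtt.startOffset());                              // start
        out.writeVInt(offsetAtt.endOffset() - offsetAtt.startOffset());      // length
        payloadAtt.setPayload(new BytesRef(buffer, 0, out.getPosition()));
        return true;
    }
}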
private Map<Integer,String> getTermVectorWithException(String field, String id) throws IOException { TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id) .setOffsets(false).setPositions(true).setFieldStatistics(false) .setTermStatistics(false) .setSelectedFields(field). execute().actionGet(); Map<Integer,String> map = new HashMap<>(); Terms terms = response.getFields().terms(field); if (terms==null){ return map; } TermsEnum iterator = terms.iterator(); PostingsEnum postings = null; for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) { String term = termBytes.utf8ToString(); postings = iterator.postings(postings, PostingsEnum.ALL); //there can only be one doc since we are getting with id. get the doc and the position postings.nextDoc(); int tf = postings.freq(); for (int i = 0; i < tf; i++) { int pos = postings.nextPosition(); map.put(pos,term); } } return map; }
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException { List<MWESentenceContext> result = new ArrayList<>(); TermsEnum tiRef= termVectorLookup.iterator(); BytesRef luceneTerm = tiRef.next(); while (luceneTerm != null) { if (luceneTerm.length == 0) { luceneTerm = tiRef.next(); continue; } String tString = luceneTerm.utf8ToString(); if(!allCandidates.contains(tString)) { luceneTerm=tiRef.next(); continue; } PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL); //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS); int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV if (doc != PostingsEnum.NO_MORE_DOCS) { int totalOccurrence = postingsEnum.freq(); for (int i = 0; i < totalOccurrence; i++) { postingsEnum.nextPosition(); int start = postingsEnum.startOffset(); int end = postingsEnum.endOffset(); BytesRef payload=postingsEnum.getPayload(); int sentenceId=-1; if(payload!=null){ sentenceId=new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId(); } result.add(new MWESentenceContext(tString,sentenceId, start, end)); } } luceneTerm = tiRef.next(); } Collections.sort(result); return result; }
private void executeNeedleTests(Analyzer analyzer) throws Exception { String needle = getNeedle(analyzer); int numFieldValues = 23; Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues); IndexReader reader = DirectoryReader.open(directory); LeafReaderContext ctx = reader.leaves().get(0); LeafReader r = ctx.reader(); PostingsEnum dpe = r.postings(new Term(FIELD, needle), PostingsEnum.ALL); int numTests = 0; try { while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int frq = dpe.freq(); int advanced = 0; String[] fieldValues = r.document(dpe.docID()).getValues(FIELD); while (++advanced < frq) { dpe.nextPosition(); String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(), dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | "); assertEquals(needle, rebuilt); numTests++; } } } finally { reader.close(); directory.close(); } assertEquals("number of tests", numFieldValues - 1, numTests); }
private Document getDoc(String s, IndexReader reader) throws IOException { //TODO: normalize s? BytesRef bytesRef = new BytesRef(s); PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, SyntacticSynsConfig.getSynsTargetFieldName(), bytesRef); if (docsEnum == null) { //couldn't find search term return null; } int i = 0; int tmpDocID = docsEnum.nextDoc(); int docID = -1; while (tmpDocID != PostingsEnum.NO_MORE_DOCS) { docID = tmpDocID; tmpDocID = docsEnum.nextDoc(); i++; } if (i > 1) { //TODO: log or do something "there should only be one key term!" } if (docID > -1) { System.out.println(docID); return reader.document(docID); } return null; }