@Override
public void writeTo(final StreamOutput out) throws IOException {
    out.writeVInt(termStatistics.size());
    for (ObjectObjectCursor<Term, TermStatistics> c : termStatistics()) {
        Term term = c.key;
        out.writeString(term.field());
        out.writeBytesRef(term.bytes());
        TermStatistics stats = c.value;
        out.writeBytesRef(stats.term());
        out.writeVLong(stats.docFreq());
        out.writeVLong(DfsSearchResult.addOne(stats.totalTermFreq()));
    }
    DfsSearchResult.writeFieldStats(out, fieldStatistics);
    out.writeVLong(maxDoc);
}
public static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOException {
    int termsStatsSize = in.readVInt();
    final TermStatistics[] termStatistics;
    if (termsStatsSize == 0) {
        termStatistics = EMPTY_TERM_STATS;
    } else {
        termStatistics = new TermStatistics[termsStatsSize];
        assert terms.length == termsStatsSize;
        for (int i = 0; i < termStatistics.length; i++) {
            BytesRef term = terms[i].bytes();
            final long docFreq = in.readVLong();
            assert docFreq >= 0;
            final long totalTermFreq = subOne(in.readVLong());
            termStatistics[i] = new TermStatistics(term, docFreq, totalTermFreq);
        }
    }
    return termStatistics;
}
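// The addOne/subOne pair used above and in writeTo exists because Lucene
// reports a missing statistic as -1, which cannot be written as a
// variable-length (non-negative) long. A minimal sketch of that convention,
// assuming the helpers do nothing beyond the shift the surrounding asserts
// suggest:
static long addOne(long value) {
    assert value + 1 >= 0;   // -1 ("not available") is the smallest legal input
    return value + 1;        // shift into non-negative range for writeVLong
}

static long subOne(long value) {
    assert value >= 0;       // wire values are always non-negative
    return value - 1;        // restore the original value, including -1
}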
protected SeqSpanWeight(SeqSpanQuery query, IndexSearcher searcher) throws IOException {
    super(query);
    this.selfQuery = query;
    this.similarity = searcher.getSimilarity(needsScores);
    this.positions = selfQuery.getPositions();
    this.terms = selfQuery.getTerms();
    this.field = terms[0].field();
    if (positions.length < 2) {
        throw new IllegalStateException("PhraseWeight does not support less than 2 terms, call rewrite first");
    } else if (positions[0] != 0) {
        throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
    }
    final IndexReaderContext context = searcher.getTopReaderContext();
    states = new TermContext[terms.length];
    TermStatistics[] termStats = new TermStatistics[terms.length];
    for (int i = 0; i < terms.length; i++) {
        final Term term = terms[i];
        states[i] = TermContext.build(context, term);
        termStats[i] = searcher.termStatistics(term, states[i]);
    }
    stats = similarity.computeWeight(searcher.collectionStatistics(terms[0].field()), termStats);
}
@Override
public void writeTo(final StreamOutput out) throws IOException {
    out.writeVInt(termStatistics.size());
    for (ObjectObjectCursor<Term, TermStatistics> c : termStatistics()) {
        Term term = (Term) c.key;
        out.writeString(term.field());
        out.writeBytesRef(term.bytes());
        TermStatistics stats = (TermStatistics) c.value;
        out.writeBytesRef(stats.term());
        out.writeVLong(stats.docFreq());
        out.writeVLong(DfsSearchResult.addOne(stats.totalTermFreq()));
    }
    DfsSearchResult.writeFieldStats(out, fieldStatistics);
    out.writeVLong(maxDoc);
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    float N, n, idf, adl;
    idf = 1.0f;
    N = collectionStats.maxDoc();
    adl = collectionStats.sumTotalTermFreq() / N;
    if (termStats.length == 1) {
        n = termStats[0].docFreq();
        idf = log(N / n);
    } else {
        // phrase: accumulate idf over all terms (note: this adds on top of the
        // initial 1.0f, whereas the single-term branch overwrites it)
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf += log(N / n);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf, adl);
}
/**
 * Computes a score factor for a phrase.
 *
 * <p>
 * The default implementation sums the idf factor for each term in the
 * phrase.
 *
 * @param collectionStats
 *          collection-level statistics
 * @param termStats
 *          term-level statistics for the terms in the phrase
 * @return an Explanation object that includes both an idf score factor for the
 *         phrase and an explanation for each term.
 */
public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics[] termStats) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
    for (final TermStatistics stat : termStats) {
        final long docFreq = stat.docFreq();
        final float termIdf = idf(docFreq, max);
        exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + docFreq + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    exp.setValue(idf);
    return exp;
}
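// A stand-alone check of the summation above, as a sketch. The idf() helper
// here is an assumption, implemented with the classic Lucene formula
// idf = 1 + log(maxDocs / (docFreq + 1)); the real method may differ.
public class IdfSumExample {
    static float idf(long docFreq, long maxDocs) {
        return (float) (Math.log((double) maxDocs / (docFreq + 1)) + 1.0);
    }

    public static void main(String[] args) {
        final long maxDocs = 1_000_000;
        final long[] docFreqs = {1_000, 50_000}; // two terms of a phrase
        float idfSum = 0.0f;
        for (long df : docFreqs) {
            idfSum += idf(df, maxDocs); // same accumulation as idfExplain
        }
        System.out.println("phrase idf = " + idfSum); // ~7.9 + ~4.0
    }
}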
@Override
public void readFrom(StreamInput in) throws IOException {
    int size = in.readVInt();
    termStatistics = HppcMaps.newMap(size);
    for (int i = 0; i < size; i++) {
        Term term = new Term(in.readString(), in.readBytesRef());
        TermStatistics stats = new TermStatistics(in.readBytesRef(), in.readVLong(),
                DfsSearchResult.subOne(in.readVLong()));
        termStatistics.put(term, stats);
    }
    fieldStatistics = DfsSearchResult.readFieldStats(in);
    maxDoc = in.readVLong();
}
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
    int docFreq = (int) termStatistics.docFreq();
    assert (docFreq >= -1); // -1 means the statistic is not available
    writePotentiallyNegativeVInt(docFreq);
    long ttf = termStatistics.totalTermFreq();
    assert (ttf >= -1); // -1 means the statistic is not available
    writePotentiallyNegativeVLong(ttf);
}
public void testFailPhaseOnException() throws IOException {
    AtomicArray<DfsSearchResult> results = new AtomicArray<>(2);
    AtomicReference<AtomicArray<QuerySearchResultProvider>> responseRef = new AtomicReference<>();
    results.set(0, new DfsSearchResult(1, new SearchShardTarget("node1", new Index("test", "na"), 0)));
    results.set(1, new DfsSearchResult(2, new SearchShardTarget("node2", new Index("test", "na"), 0)));
    results.get(0).termsStatistics(new Term[0], new TermStatistics[0]);
    results.get(1).termsStatistics(new Term[0], new TermStatistics[0]);
    SearchPhaseController controller = new SearchPhaseController(Settings.EMPTY, BigArrays.NON_RECYCLING_INSTANCE, null);
    SearchTransportService searchTransportService = new SearchTransportService(
            Settings.builder().put("search.remote.connect", false).build(), null, null) {
        @Override
        public void sendExecuteQuery(Transport.Connection connection, QuerySearchRequest request, SearchTask task,
                                     ActionListener<QuerySearchResult> listener) {
            if (request.id() == 1) {
                QuerySearchResult queryResult = new QuerySearchResult(123,
                        new SearchShardTarget("node1", new Index("test", "na"), 0));
                queryResult.topDocs(new TopDocs(1, new ScoreDoc[] {new ScoreDoc(42, 1.0F)}, 2.0F), new DocValueFormat[0]);
                queryResult.size(2); // the size of the result set
                listener.onResponse(queryResult);
            } else if (request.id() == 2) {
                throw new UncheckedIOException(new MockDirectoryWrapper.FakeIOException());
            } else {
                fail("no such request ID: " + request.id());
            }
        }
    };
    MockSearchPhaseContext mockSearchPhaseContext = new MockSearchPhaseContext(2);
    mockSearchPhaseContext.searchTransport = searchTransportService;
    DfsQueryPhase phase = new DfsQueryPhase(results, controller,
            (response) -> new SearchPhase("test") {
                @Override
                public void run() throws IOException {
                    responseRef.set(response.results);
                }
            }, mockSearchPhaseContext);
    assertEquals("dfs_query", phase.getName());
    expectThrows(UncheckedIOException.class, () -> phase.run());
    assertTrue(mockSearchPhaseContext.releasedSearchContexts.isEmpty()); // phase execution will clean up on the contexts
}
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    BasicStats[] stats = new BasicStats[termStats.length];
    for (int i = 0; i < termStats.length; i++) {
        stats[i] = newStats(collectionStats.field(), queryBoost);
        fillBasicStats(stats[i], collectionStats, termStats[i]);
    }
    return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
}
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
 *  Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    // #positions(field) must be >= #positions(term)
    assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
    long numberOfDocuments = collectionStats.maxDoc();

    long docFreq = termStats.docFreq();
    long totalTermFreq = termStats.totalTermFreq();

    // codec does not supply totalTermFreq: substitute docFreq
    if (totalTermFreq == -1) {
        totalTermFreq = docFreq;
    }

    final long numberOfFieldTokens;
    final float avgFieldLength;

    long sumTotalTermFreq = collectionStats.sumTotalTermFreq();

    if (sumTotalTermFreq <= 0) {
        // field does not exist;
        // we have to provide something if the codec doesn't supply these measures,
        // or if someone omitted frequencies for the field... negative values cause
        // NaN/Inf for some scorers.
        numberOfFieldTokens = docFreq;
        avgFieldLength = 1;
    } else {
        numberOfFieldTokens = sumTotalTermFreq;
        avgFieldLength = (float) numberOfFieldTokens / numberOfDocuments;
    }

    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    stats.setNumberOfDocuments(numberOfDocuments);
    stats.setNumberOfFieldTokens(numberOfFieldTokens);
    stats.setAvgFieldLength(avgFieldLength);
    stats.setDocFreq(docFreq);
    stats.setTotalTermFreq(totalTermFreq);
}
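// A concrete walk-through of the two fallbacks above, as a sketch with
// made-up numbers and no Lucene types; both statistics are "missing" (-1):
public class FillBasicStatsFallbacks {
    public static void main(String[] args) {
        long docFreq = 42;

        long totalTermFreq = -1;            // codec did not record it
        if (totalTermFreq == -1) {
            totalTermFreq = docFreq;        // substitute docFreq -> 42
        }

        long sumTotalTermFreq = -1;         // field-level stat missing too
        long numberOfFieldTokens;
        float avgFieldLength;
        if (sumTotalTermFreq <= 0) {
            numberOfFieldTokens = docFreq;  // -> 42
            avgFieldLength = 1;             // keeps scorers away from NaN/Inf
        } else {
            numberOfFieldTokens = sumTotalTermFreq;
            avgFieldLength = (float) numberOfFieldTokens / 1_000; // e.g. 1,000 docs
        }
        System.out.println(totalTermFreq + " " + numberOfFieldTokens + " " + avgFieldLength);
    }
}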
/**
 * Computes a score factor for a phrase.
 *
 * <p>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explanation object that includes both an idf
 *         score factor for the phrase and an explanation
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, max);
        exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    exp.setValue(idf);
    return exp;
}
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);

    // compute freq-independent part of bm25 equation across all norm values
    float[] cache = new float[256];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte) i) / avgdl);
    }
    return new BM25Stats(collectionStats.field(), idf, queryBoost, avgdl, cache);
}
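// Sketch (an assumption, not the shipped scorer): how a matching scorer would
// consume the cache built above. Each document's encoded length byte indexes
// the table, so the k1*((1-b) + b*dl/avgdl) part never has to be recomputed
// per hit; only the term-frequency saturation remains.
float tfNorm(float freq, byte normByte, float[] cache, float k1) {
    float norm = cache[normByte & 0xFF];    // k1*((1-b) + b*dl/avgdl)
    return freq * (k1 + 1) / (freq + norm); // BM25 term-frequency factor
}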
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    PerFieldSimWeight weight = new PerFieldSimWeight();
    weight.delegate = get(collectionStats.field());
    weight.delegateWeight = weight.delegate.computeWeight(queryBoost, collectionStats, termStats);
    return weight;
}
@Override
public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    SimWeight[] subStats = new SimWeight[sims.length];
    for (int i = 0; i < subStats.length; i++) {
        subStats[i] = sims[i].computeWeight(queryBoost, collectionStats, termStats);
    }
    return new MultiStats(subStats);
}
/**
 * Computes the collection probability of the current term in addition to the
 * usual statistics.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    super.fillBasicStats(stats, collectionStats, termStats);
    LMStats lmStats = (LMStats) stats;
    lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
}
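// The collection model is external to this snippet. Conceptually it estimates
// how probable the term is in the collection as a whole; a sketch of that
// idea (the +1 terms guard against division by zero and are an assumption
// about the exact smoothing the real model uses):
float computeProbability(BasicStats stats) {
    return (stats.getTotalTermFreq() + 1F) / (stats.getNumberOfFieldTokens() + 1F);
}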
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    final Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    return new IDFStats(collectionStats.field(), idf, queryBoost);
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);
    return new BM25StatsFixed(collectionStats.field(), k1, b, idf, avgdl);
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long max = collectionStats.maxDoc();
    final float idf = idfManager.getIDF(termStats.term().utf8ToString());
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
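// idfManager is external to this snippet; a minimal hypothetical stand-in,
// assuming it is just a lookup of precomputed idf values by term text (the
// real class may load these from a corpus or a file):
import java.util.Map;

class IDFManager {
    private final Map<String, Float> idfByTerm;
    private final float defaultIdf;

    IDFManager(Map<String, Float> idfByTerm, float defaultIdf) {
        this.idfByTerm = idfByTerm;
        this.defaultIdf = defaultIdf;
    }

    float getIDF(String term) {
        // fall back to a default for terms with no precomputed value
        return idfByTerm.getOrDefault(term, defaultIdf);
    }
}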
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);

    // compute freq-independent part of bm25 equation across all norm values
    float[] cache = new float[256];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte) i) / avgdl);
    }
    return new BM25Stats(collectionStats.field(), idf, avgdl, cache);
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    long N, n;
    float idf_, avdl;
    idf_ = 1.0f;

    N = collectionStats.docCount();
    if (N == -1) {
        N = collectionStats.maxDoc();
    }
    // cast before dividing: N and sumTotalTermFreq are longs, so without the
    // cast the average document length would be truncated by integer division
    avdl = (float) collectionStats.sumTotalTermFreq() / N;

    if (termStats.length == 1) {
        n = termStats[0].docFreq();
        idf_ = idf(n, N);
    } else {
        /* computation for a phrase */
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf_ += idf(n, N);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
@Override
public final SimWeight computeWeight(final float queryBoost, final CollectionStatistics collectionStats,
        final TermStatistics... termStats) {
    final Explanation idf = termStats.length == 1
            ? this.idfExplain(collectionStats, termStats[0])
            : this.idfExplain(collectionStats, termStats);
    return new IDFStats(collectionStats.field(), idf, queryBoost);
}
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    if (termStats.length == 1) {
        return new SimpleScore(boost, collectionStats, termStats[0]);
    } else {
        return new SimpleScore(boost, collectionStats, termStats);
    }
}
SimpleScore(float boost, CollectionStatistics collectionStats, TermStatistics[] termStats) {
    float total = 0.0f;
    List<Explanation> scores = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        String description = String.format("simple score for (%s:%s)",
                collectionStats.field(), stat.term().utf8ToString());
        scores.add(Explanation.match(1.0f, description));
        total += 1.0f;
    }
    this.score = Explanation.match(total, "total score, sum of:", scores);
    this.boost = Explanation.match(boost, "boost");
}
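// For two terms in a field "body", the explanation tree built above would
// render along these lines (the shape follows Lucene's Explanation
// toString(); the exact text below is illustrative, not captured output):
//
// 2.0 = total score, sum of:
//   1.0 = simple score for (body:foo)
//   1.0 = simple score for (body:bar)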
@Override
public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return new SimWeight() {
        @Override
        public void normalize(float queryNorm, float topLevelBoost) {
        }

        @Override
        public float getValueForNormalization() {
            return 0;
        }
    };
}
private void processTermVectorsFields(Vectorizer vectorizer, Fields termVectorsFields) throws IOException {
    for (String fieldName : termVectorsFields) {
        TermsEnum termsEnum = termVectorsFields.terms(fieldName).iterator();
        while (termsEnum.next() != null) {
            Term term = new Term(fieldName, termsEnum.term());
            TermStatistics termStatistics = new TermStatistics(termsEnum.term(), termsEnum.docFreq(),
                    termsEnum.totalTermFreq());
            int freq = termsEnum.postings(null, null, PostingsEnum.ALL).freq();
            vectorizer.add(term, termStatistics, freq);
        }
    }
}
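// A caution rather than a bug report: in general Lucene usage a PostingsEnum
// starts unpositioned and freq() is only defined after advancing it. If the
// term-vector enum used above does not position itself implicitly, the safer
// pattern would be:
PostingsEnum postings = termsEnum.postings(null, null, PostingsEnum.ALL);
if (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) { // position on the single term-vector doc
    int freq = postings.freq();
    vectorizer.add(term, termStatistics, freq);
}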