private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException { // start term, optimized writing BytesRef term = termIter.next(); spare.copyUTF8Bytes(term); builder.startObject(spare.toString()); buildTermStatistics(builder, termIter); // finally write the term vectors PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL); int termFreq = posEnum.freq(); builder.field(FieldStrings.TERM_FREQ, termFreq); initMemory(curTerms, termFreq); initValues(curTerms, posEnum, termFreq); buildValues(builder, curTerms, termFreq); buildScore(builder, boostAtt); builder.endObject(); }
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException { String fieldName = fieldIter.next(); builder.startObject(fieldName); Terms curTerms = theFields.terms(fieldName); // write field statistics buildFieldStatistics(builder, curTerms); builder.startObject(FieldStrings.TERMS); TermsEnum termIter = curTerms.iterator(); BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class); for (int i = 0; i < curTerms.size(); i++) { buildTerm(builder, spare, curTerms, termIter, boostAtt); } builder.endObject(); builder.endObject(); }
/**
 * Writes the {@code FieldStrings.SCORE} field with the boost held by {@code boostAtt}.
 * Nothing is written when {@code hasScores} is false.
 *
 * @param builder  the content builder receiving the output
 * @param boostAtt attribute whose current boost is emitted as the score
 * @throws IOException if writing to the builder fails
 */
private void buildScore(XContentBuilder builder, BoostAttribute boostAtt) throws IOException {
    if (!hasScores) {
        return;
    }
    builder.field(FieldStrings.SCORE, boostAtt.getBoost());
}
/** * Provide spelling corrections based on several parameters. * * @param term The term to suggest spelling corrections for * @param numSug The maximum number of spelling corrections * @param ir The index reader to fetch the candidate spelling corrections from * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included * @param editDistance The maximum edit distance candidates are allowed to have * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included * @param spare a chars scratch * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. * @throws IOException If I/O related errors occur */ protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException { AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); Terms terms = MultiFields.getTerms(ir, term.field()); if (terms == null) { return Collections.emptyList(); } FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>(); BytesRef queryTerm = new BytesRef(term.text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class); while ((candidateTerm = e.next()) != null) { final float boost = boostAtt.getBoost(); // ignore uncompetitive hits if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) continue; // ignore exact match of the same term if (queryTerm.bytesEquals(candidateTerm)) continue; int df = e.docFreq(); // check docFreq if required if (df <= docfreq) continue; final float score; final String termAsString; if (distance == INTERNAL_LEVENSHTEIN) { // delay 
creating strings until the end termAsString = null; // undo FuzzyTermsEnum's scale factor for a real scaled lev score score = boost / e.getScaleFactor() + e.getMinSimilarity(); } else { spare.copyUTF8Bytes(candidateTerm); termAsString = spare.toString(); score = distance.getDistance(term.text(), termAsString); } if (score < accuracy) continue; // add new entry in PQ st.term = BytesRef.deepCopyOf(candidateTerm); st.boost = boost; st.docfreq = df; st.termAsString = termAsString; st.score = score; stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); } return stQueue; }
/** * Provide spelling corrections based on several parameters. * * @param term The term to suggest spelling corrections for * @param numSug The maximum number of spelling corrections * @param ir The index reader to fetch the candidate spelling corrections from * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included * @param editDistance The maximum edit distance candidates are allowed to have * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included * @param spare a chars scratch * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. * @throws IOException If I/O related errors occur */ protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRef spare) throws IOException { AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); Terms terms = MultiFields.getTerms(ir, term.field()); if (terms == null) { return Collections.emptyList(); } FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(); BytesRef queryTerm = new BytesRef(term.text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class); while ((candidateTerm = e.next()) != null) { final float boost = boostAtt.getBoost(); // ignore uncompetitive hits if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) continue; // ignore exact match of the same term if (queryTerm.bytesEquals(candidateTerm)) continue; int df = e.docFreq(); // check docFreq if required if (df <= docfreq) continue; final float score; final String termAsString; if (distance == INTERNAL_LEVENSHTEIN) { // delay 
creating strings until the end termAsString = null; // undo FuzzyTermsEnum's scale factor for a real scaled lev score score = boost / e.getScaleFactor() + e.getMinSimilarity(); } else { UnicodeUtil.UTF8toUTF16(candidateTerm, spare); termAsString = spare.toString(); score = distance.getDistance(term.text(), termAsString); } if (score < accuracy) continue; // add new entry in PQ st.term = BytesRef.deepCopyOf(candidateTerm); st.boost = boost; st.docfreq = df; st.termAsString = termAsString; st.score = score; stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); } return stQueue; }