Example source code for the Java class org.apache.lucene.index.Terms
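
This page collects example usages of org.apache.lucene.index.Terms drawn from the Elasticsearch, elasticsearch_my, and lams projects. Most snippets share one core pattern: obtain the Terms of a single field in a single segment, then walk it with a TermsEnum. The sketch below shows that pattern in isolation. It assumes the Lucene 5.x-era API used by the Elasticsearch snippets (the lams snippets use the older Lucene 4.x form terms.iterator(null)); reader and field stand in for whatever index and field are at hand.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public final class TermsDump {
    /** Prints every term of the given field, segment by segment. */
    static void dumpTerms(IndexReader reader, String field) throws IOException {
        for (LeafReaderContext leaf : reader.leaves()) {
            Terms terms = leaf.reader().terms(field); // null when the segment has no such field
            if (terms == null) {
                continue;
            }
            TermsEnum te = terms.iterator();
            for (BytesRef term = te.next(); term != null; term = te.next()) {
                System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
            }
        }
    }
}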

Project: Elasticsearch    File: TermVectorsResponse.java
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
Project: elasticsearch_my    File: PagedBytesIndexFieldData.java
/**
 * @return the estimate for loading the entire term set into field data, or 0 if unavailable
 */
public long estimateStringFieldData() {
    try {
        LeafReader reader = context.reader();
        // reader.terms(field) is shorthand for reader.fields().terms(field), so a single lookup suffices
        final Terms terms = reader.terms(getFieldName());

        if (terms instanceof FieldReader) {
            final Stats stats = ((FieldReader) terms).getStats();
            long totalTermBytes = stats.totalTermBytes;
            if (logger.isTraceEnabled()) {
                logger.trace("totalTermBytes: {}, terms.size(): {}, terms.getSumDocFreq(): {}",
                        totalTermBytes, terms.size(), terms.getSumDocFreq());
            }
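            // Heuristic: raw term bytes plus roughly 2 bytes of overhead per term
            // and 4 bytes per term-document pair (sumDocFreq counts those pairs).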
            long totalBytes = totalTermBytes + (2 * terms.size()) + (4 * terms.getSumDocFreq());
            return totalBytes;
        }
    } catch (Exception e) {
        logger.warn("Unable to estimate memory overhead", e);
    }
    return 0;
}
Project: elasticsearch_my    File: AbstractIndexOrdinalsFieldData.java
protected TermsEnum filter(Terms terms, TermsEnum iterator, LeafReader reader) throws IOException {
    if (iterator == null) {
        return null;
    }
    int docCount = terms.getDocCount();
    if (docCount == -1) {
        docCount = reader.maxDoc();
    }
    if (docCount >= minSegmentSize) {
        final int minFreq = minFrequency > 1.0
                ? (int) minFrequency
                : (int)(docCount * minFrequency);
        final int maxFreq = maxFrequency > 1.0
                ? (int) maxFrequency
                : (int)(docCount * maxFrequency);
        if (minFreq > 1 || maxFreq < docCount) {
            iterator = new FrequencyFilter(iterator, minFreq, maxFreq);
        }
    }
    return iterator;
}
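
In filter() above, minFrequency and maxFrequency carry two meanings: values greater than 1.0 are absolute document counts, while values up to 1.0 are fractions of the segment's docCount. A worked illustration with made-up numbers:

// Illustration only; the numbers are invented.
int docCount = 1000;
double minFrequency = 0.01;   // <= 1.0, so treated as a fraction of docCount
double maxFrequency = 500.0;  // > 1.0, so treated as an absolute count
int minFreq = minFrequency > 1.0 ? (int) minFrequency : (int) (docCount * minFrequency); // 10
int maxFreq = maxFrequency > 1.0 ? (int) maxFrequency : (int) (docCount * maxFrequency); // 500
// The FrequencyFilter would then drop terms occurring in fewer than 10 or more than 500 documents.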
Project: Elasticsearch    File: DfsOnlyRequest.java
public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
    super(indices);

    // build a search request with a query of all the terms
    final BoolQueryBuilder boolBuilder = boolQuery();
    for (String fieldName : termVectorsFields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = termVectorsFields.terms(fieldName);
        TermsEnum iterator = terms.iterator();
        while (iterator.next() != null) {
            String text = iterator.term().utf8ToString();
            boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
        }
    }
    // wrap a search request object
    this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}
Project: elasticsearch_my    File: AllTermQuery.java
@Override
public Query rewrite(IndexReader reader) throws IOException {
    Query rewritten = super.rewrite(reader);
    if (rewritten != this) {
        return rewritten;
    }
    boolean hasPayloads = false;
    for (LeafReaderContext context : reader.leaves()) {
        final Terms terms = context.reader().terms(term.field());
        if (terms != null) {
            if (terms.hasPayloads()) {
                hasPayloads = true;
                break;
            }
        }
    }
    // if the field has no terms we could return a MatchNoDocsQuery, but this would break the unified
    // highlighter, which rewrites queries with an empty reader.
    if (hasPayloads == false) {
        return new TermQuery(term);
    }
    return this;
}
Project: Elasticsearch    File: VersionFieldUpgrader.java
static CodecReader wrap(CodecReader reader) throws IOException {
    final FieldInfos fieldInfos = reader.getFieldInfos();
    final FieldInfo versionInfo = fieldInfos.fieldInfo(VersionFieldMapper.NAME);
    if (versionInfo != null && versionInfo.getDocValuesType() != DocValuesType.NONE) {
        // the reader is a recent one, it has versions and they are stored
        // in a numeric doc values field
        return reader;
    }
    // The segment is an old one, look at the _uid field
    final Terms terms = reader.terms(UidFieldMapper.NAME);
    if (terms == null || !terms.hasPayloads()) {
        // The segment doesn't have an _uid field or doesn't have payloads
        // don't try to do anything clever. If any other segment has versions
        // all versions of this segment will be initialized to 0
        return reader;
    }
    // convert _uid payloads -> _version docvalues
    return new VersionFieldUpgrader(reader);
}
Project: Elasticsearch    File: TermVectorsResponse.java
private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException {
    long sumDocFreq = curTerms.getSumDocFreq();
    int docCount = curTerms.getDocCount();
    long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq();
    if (docCount > 0) {
        assert (sumDocFreq > 0) : "docCount > 0 but sumDocFreq is not!";
        assert (sumTotalTermFrequencies > 0) : "docCount > 0 but sumTotalTermFrequencies is not!";
        builder.startObject(FieldStrings.FIELD_STATISTICS);
        builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq);
        builder.field(FieldStrings.DOC_COUNT, docCount);
        builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
        builder.endObject();
    } else if (docCount == -1) { // this should only be -1 if the field
        // statistics were not requested at all. In
        // this case all 3 values should be -1
        assert (sumDocFreq == -1) : "docCount was -1 but sumDocFreq was not!";
        assert (sumTotalTermFrequencies == -1) : "docCount was -1 but sumTotalTermFrequencies was not!";
    } else {
        throw new IllegalStateException(
                "Something is wrong with the field statistics of the term vector request: Values are " + "\n"
                        + FieldStrings.SUM_DOC_FREQ + " " + sumDocFreq + "\n" + FieldStrings.DOC_COUNT + " " + docCount + "\n"
                        + FieldStrings.SUM_TTF + " " + sumTotalTermFrequencies);
    }
}
Project: elasticsearch_my    File: TermVectorsResponse.java
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
Project: elasticsearch_my    File: TermVectorsResponse.java
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
Project: elasticsearch_my    File: TermVectorsResponse.java
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
Project: elasticsearch_my    File: TermVectorsResponse.java
private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException {
    long sumDocFreq = curTerms.getSumDocFreq();
    int docCount = curTerms.getDocCount();
    long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq();
    if (docCount > 0) {
        assert (sumDocFreq > 0) : "docCount > 0 but sumDocFreq is not!";
        assert (sumTotalTermFrequencies > 0) : "docCount > 0 but sumTotalTermFrequencies is not!";
        builder.startObject(FieldStrings.FIELD_STATISTICS);
        builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq);
        builder.field(FieldStrings.DOC_COUNT, docCount);
        builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
        builder.endObject();
    } else if (docCount == -1) { // this should only be -1 if the field
        // statistics were not requested at all. In
        // this case all 3 values should be -1
        assert (sumDocFreq == -1) : "docCount was -1 but sumDocFreq was not!";
        assert (sumTotalTermFrequencies == -1) : "docCount was -1 but sumTotalTermFrequencies was not!";
    } else {
        throw new IllegalStateException(
                "Something is wrong with the field statistics of the term vector request: Values are " + "\n"
                        + FieldStrings.SUM_DOC_FREQ + " " + sumDocFreq + "\n" + FieldStrings.DOC_COUNT + " " + docCount + "\n"
                        + FieldStrings.SUM_TTF + " " + sumTotalTermFrequencies);
    }
}
Project: Elasticsearch    File: TermVectorsResponse.java
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
Project: lams    File: Lucene3xTermVectorsReader.java
@Override
public Terms terms(String field) throws IOException {
  final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
  if (fieldInfo == null) {
    // No such field
    return null;
  }

  final Integer fieldIndex = fieldNumberToIndex.get(fieldInfo.number);
  if (fieldIndex == null) {
    // Term vectors were not indexed for this field
    return null;
  }

  return new TVTerms(fieldFPs[fieldIndex]);
}
Project: Elasticsearch    File: TransportFieldStatsTransportAction.java
@Override
protected FieldStatsShardResponse shardOperation(FieldStatsShardRequest request) {
    ShardId shardId = request.shardId();
    Map<String, FieldStats> fieldStats = new HashMap<>();
    IndexService indexServices = indicesService.indexServiceSafe(shardId.getIndex());
    MapperService mapperService = indexServices.mapperService();
    IndexShard shard = indexServices.shardSafe(shardId.id());
    try (Engine.Searcher searcher = shard.acquireSearcher("fieldstats")) {
        for (String field : request.getFields()) {
            MappedFieldType fieldType = mapperService.fullName(field);
            if (fieldType != null) {
                IndexReader reader = searcher.reader();
                Terms terms = MultiFields.getTerms(reader, field);
                if (terms != null) {
                    fieldStats.put(field, fieldType.stats(terms, reader.maxDoc()));
                }
            } else {
                throw new IllegalArgumentException("field [" + field + "] doesn't exist");
            }
        }
    } catch (IOException e) {
        throw ExceptionsHelper.convertToElastic(e);
    }
    return new FieldStatsShardResponse(shardId, fieldStats);
}
Project: lams    File: QueryAutoStopWordAnalyzer.java
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Terms with a document frequency above this value are treated as stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;

  for (String field : fields) {
    Set<String> stopWords = new HashSet<>();
    Terms terms = MultiFields.getTerms(indexReader, field);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (terms != null) {
      TermsEnum te = terms.iterator(null);
      BytesRef text;
      while ((text = te.next()) != null) {
        if (te.docFreq() > maxDocFreq) {
          spare.copyUTF8Bytes(text);
          stopWords.add(spare.toString());
        }
      }
    }
    stopWordsPerField.put(field, stopWords);
  }
}
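
A minimal usage sketch for the constructor above. The Directory, field name, and threshold are invented for illustration, and the no-arg StandardAnalyzer constructor assumes Lucene 4.8 or later:

Analyzer delegate = new StandardAnalyzer();
try (IndexReader reader = DirectoryReader.open(directory)) { // directory is a placeholder
    Analyzer analyzer = new QueryAutoStopWordAnalyzer(
            delegate, reader, Collections.singleton("body"), 100);
    // "body" terms occurring in more than 100 documents are now dropped from query token streams
    try (TokenStream ts = analyzer.tokenStream("body", "the quick brown fox")) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        ts.end();
    }
}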
Project: lams    File: CompiledAutomaton.java
/** Return a {@link TermsEnum} intersecting the provided {@link Terms}
 *  with the terms accepted by this automaton. */
public TermsEnum getTermsEnum(Terms terms) throws IOException {
  switch(type) {
  case NONE:
    return TermsEnum.EMPTY;
  case ALL:
    return terms.iterator(null);
  case SINGLE:
    return new SingleTermsEnum(terms.iterator(null), term);
  case PREFIX:
    // TODO: this is very likely faster than .intersect,
    // but we should test and maybe cutover
    return new PrefixTermsEnum(terms.iterator(null), term);
  case NORMAL:
    return terms.intersect(this, null);
  default:
    // unreachable
    throw new RuntimeException("unhandled case");
  }
}
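
The NORMAL branch above is the one an automaton-based query (for example a regexp) normally takes. A sketch of driving this method directly, using the same Lucene 4.x-era APIs; reader and the field name are placeholders:

Terms terms = MultiFields.getTerms(reader, "title");
if (terms != null) {
    CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("lu.*").toAutomaton());
    TermsEnum matching = automaton.getTermsEnum(terms);
    for (BytesRef term = matching.next(); term != null; term = matching.next()) {
        System.out.println(term.utf8ToString());
    }
}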
Project: lams    File: SrndTermQuery.java
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  /* check term presence in the index here, for symmetry with other SimpleTerm implementations */
  Terms terms = MultiFields.getTerms(reader, fieldName);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator(null);

    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getTermText()));
    if (status == TermsEnum.SeekStatus.FOUND) {
      mtv.visitMatchingTerm(getLuceneTerm(fieldName));
    }
  }
}
Project: Elasticsearch    File: TermVectorsResponse.java
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
Project: Elasticsearch    File: GeoPointArrayIndexFieldData.java
@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicGeoPointFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker(CircuitBreaker.FIELDDATA));
    if (terms == null) {
        data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    return (Version.indexCreated(indexSettings).before(Version.V_2_2_0)) ?
        loadLegacyFieldData(reader, estimator, terms, data) : loadFieldData22(reader, estimator, terms, data);
}
Project: elasticsearch_my    File: PercolatorFieldMapper.java
Query createCandidateQuery(IndexReader indexReader) throws IOException {
    List<BytesRef> extractedTerms = new ArrayList<>();
    LeafReader reader = indexReader.leaves().get(0).reader();
    Fields fields = reader.fields();
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            continue;
        }

        BytesRef fieldBr = new BytesRef(field);
        TermsEnum tenum = terms.iterator();
        for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
            BytesRefBuilder builder = new BytesRefBuilder();
            builder.append(fieldBr);
            builder.append(FIELD_VALUE_SEPARATOR);
            builder.append(term);
            extractedTerms.add(builder.toBytesRef());
        }
    }
    Query extractionSuccess = new TermInSetQuery(queryTermsField.name(), extractedTerms);
    // include extractionResultField:failed, because docs with this term have no extractedTermsField
    // and otherwise we would fail to return these docs. Docs that failed query term extraction
    // always need to be verified by MemoryIndex:
    Query extractionFailure = new TermQuery(new Term(extractionResultField.name(), EXTRACTION_FAILED));

    return new BooleanQuery.Builder()
            .add(extractionSuccess, Occur.SHOULD)
            .add(extractionFailure, Occur.SHOULD)
            .build();
}
Project: elasticsearch_my    File: CompletionFieldStats.java
/**
 * Returns total in-heap bytes used by all suggesters.  This method has CPU cost <code>O(numIndexedFields)</code>.
 *
 * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns will break out its in-heap bytes
 * separately in the returned {@link CompletionStats}
 */
public static CompletionStats completionStats(IndexReader indexReader, String ... fieldNamePatterns) {
    long sizeInBytes = 0;
    ObjectLongHashMap<String> completionFields = null;
    if (fieldNamePatterns != null && fieldNamePatterns.length > 0) {
        completionFields = new ObjectLongHashMap<>(fieldNamePatterns.length);
    }
    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
        LeafReader atomicReader = atomicReaderContext.reader();
        try {
            Fields fields = atomicReader.fields();
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                if (terms instanceof CompletionTerms) {
                    // TODO: currently we load up the suggester for reporting its size
                    long fstSize = ((CompletionTerms) terms).suggester().ramBytesUsed();
                    if (fieldNamePatterns != null && fieldNamePatterns.length > 0 && Regex.simpleMatch(fieldNamePatterns, fieldName)) {
                        completionFields.addTo(fieldName, fstSize);
                    }
                    sizeInBytes += fstSize;
                }
            }
        } catch (IOException ioe) {
            throw new ElasticsearchException(ioe);
        }
    }
    return new CompletionStats(sizeInBytes, completionFields == null ? null : new FieldMemoryStats(completionFields));
}
Project: elasticsearch_my    File: LinearInterpolatingScorer.java
public LinearInterpolatingScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator,
        double trigramLambda, double bigramLambda, double unigramLambda) throws IOException {
    super(reader, terms, field, realWordLikelyhood, separator);
    double sum = unigramLambda + bigramLambda + trigramLambda;
    this.unigramLambda = unigramLambda / sum;
    this.bigramLambda = bigramLambda / sum;
    this.trigramLambda = trigramLambda / sum;
}
Project: Elasticsearch    File: ShortFieldMapper.java
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    long minValue = NumericUtils.getMinInt(terms);
    long maxValue = NumericUtils.getMaxInt(terms);
    return new FieldStats.Long(
        maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
Project: Elasticsearch    File: TermVectorsResponse.java
private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        currentPositions = ArrayUtil.grow(currentPositions, termFreq);
    }
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}
Project: elasticsearch_my    File: TermsSliceQuery.java
/**
 * Returns a per-segment DocIdSet containing the matching docs for the specified slice.
 */
private DocIdSet build(LeafReader reader) throws IOException {
    final DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
    final Terms terms = reader.terms(getField());
    final TermsEnum te = terms.iterator();
    PostingsEnum docsEnum = null;
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        int hashCode = term.hashCode();
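        // contains(hashCode) decides whether this term's hash falls into the slice this query covers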
        if (contains(hashCode)) {
            docsEnum = te.postings(docsEnum, PostingsEnum.NONE);
            builder.add(docsEnum);
        }
    }
    return builder.build();
}
Project: Elasticsearch    File: LongFieldMapper.java
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    long minValue = NumericUtils.getMinLong(terms);
    long maxValue = NumericUtils.getMaxLong(terms);
    return new FieldStats.Long(
        maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
Project: elasticsearch_my    File: PagedBytesIndexFieldData.java
/**
 * Determine whether the BlockTreeTermsReader.FieldReader can be used
 * for estimating the field data, adding the estimate to the circuit
 * breaker if it can, otherwise wrapping the terms in a
 * RamAccountingTermsEnum to be estimated on a per-term basis.
 *
 * @param terms terms to be estimated
 * @return A possibly wrapped TermsEnum for the terms
 */
@Override
public TermsEnum beforeLoad(Terms terms) throws IOException {
    LeafReader reader = context.reader();

    TermsEnum iterator = terms.iterator();
    TermsEnum filteredIterator = filter(terms, iterator, reader);
    final boolean filtered = iterator != filteredIterator;
    iterator = filteredIterator;

    if (filtered) {
        if (logger.isTraceEnabled()) {
            logger.trace("Filter exists, can't circuit break normally, using RamAccountingTermsEnum");
        }
        return new RamAccountingTermsEnum(iterator, breaker, this, this.fieldName);
    } else {
        estimatedBytes = this.estimateStringFieldData();
        // If we weren't able to estimate, wrap in the RamAccountingTermsEnum
        if (estimatedBytes == 0) {
            iterator = new RamAccountingTermsEnum(iterator, breaker, this, this.fieldName);
        } else {
            breaker.addEstimateBytesAndMaybeBreak(estimatedBytes, fieldName);
        }

        return iterator;
    }
}
Project: elasticsearch_my    File: XMoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField fields[] = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector, fieldName);
        }
    }

    return createQueue(termFreqMap);
}
Project: elasticsearch_my    File: XMoreLikeThis.java
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }

        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }

        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
Project: elasticsearch_my    File: MultiPhrasePrefixQuery.java
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }

        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }

        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }

            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
Project: Elasticsearch    File: DoubleFieldMapper.java
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    double minValue = NumericUtils.sortableLongToDouble(NumericUtils.getMinLong(terms));
    double maxValue = NumericUtils.sortableLongToDouble(NumericUtils.getMaxLong(terms));
    return new FieldStats.Double(
        maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
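
The sortableLongToDouble calls above reflect how doubles are indexed in this Lucene version: encoded as order-preserving longs. A tiny round-trip illustration (the value is arbitrary):

double value = 3.14;
long sortable = NumericUtils.doubleToSortableLong(value);
double roundTripped = NumericUtils.sortableLongToDouble(sortable);
assert roundTripped == value; // the encoding preserves both the value and its sort order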
Project: elasticsearch_my    File: TermVectorsWriter.java
private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
    long sttf = topLevelTerms.getSumTotalTermFreq();
    assert (sttf >= -1);
    writePotentiallyNegativeVLong(sttf);
    long sdf = topLevelTerms.getSumDocFreq();
    assert (sdf >= -1);
    writePotentiallyNegativeVLong(sdf);
    int dc = topLevelTerms.getDocCount();
    assert (dc >= -1);
    writePotentiallyNegativeVInt(dc);
}
Project: elasticsearch_my    File: TermVectorsResponse.java
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
Project: elasticsearch_my    File: TermVectorsResponse.java
private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        currentPositions = ArrayUtil.grow(currentPositions, termFreq);
    }
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}
Project: Elasticsearch    File: ByteFieldMapper.java
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    long minValue = NumericUtils.getMinInt(terms);
    long maxValue = NumericUtils.getMaxInt(terms);
    return new FieldStats.Long(
        maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
Project: elasticsearch_my    File: GetTermVectorsIT.java
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException {
    String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
    int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
    int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
    int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
    int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};

    Terms terms = fields.terms(fieldName);
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
Project: elasticsearch_my    File: GetTermVectorsIT.java
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    Terms terms0 = fields0.terms(fieldName);
    Terms terms1 = fields1.terms(fieldName);
    assertThat(terms0, notNullValue());
    assertThat(terms1, notNullValue());
    assertThat(terms0.size(), equalTo(terms1.size()));

    TermsEnum iter0 = terms0.iterator();
    TermsEnum iter1 = terms1.iterator();
    for (int i = 0; i < terms0.size(); i++) {
        BytesRef next0 = iter0.next();
        assertThat(next0, notNullValue());
        BytesRef next1 = iter1.next();
        assertThat(next1, notNullValue());

        // compare field value
        String string0 = next0.utf8ToString();
        String string1 = next1.utf8ToString();
        assertThat("expected: " + string0, string0, equalTo(string1));

        // compare df and ttf
        assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq()));
        assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq()));

        // compare freq and docs
        PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL);
        PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL);
        assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc()));
        assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq()));

        // compare position, start offsets and end offsets
        for (int j = 0; j < docsAndPositions0.freq(); j++) {
            assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition()));
            assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset()));
            assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset()));
        }
    }
    assertThat(iter0.next(), nullValue());
    assertThat(iter1.next(), nullValue());
}
Project: elasticsearch_my    File: GetTermVectorsIT.java
public void testArtificialNoDoc() throws IOException {
    // setup indices
    Settings.Builder settings = Settings.builder()
            .put(indexSettings())
            .put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test")
            .setSettings(settings)
            .addMapping("type1", "field1", "type=text"));
    ensureGreen();

    // request tvs from artificial document
    String text = "the quick brown fox jumps over the lazy dog";
    TermVectorsResponse resp = client().prepareTermVectors()
            .setIndex("test")
            .setType("type1")
            .setDoc(jsonBuilder()
                    .startObject()
                    .field("field1", text)
                    .endObject())
            .setOffsets(true)
            .setPositions(true)
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .get();
    assertThat(resp.isExists(), equalTo(true));
    checkBrownFoxTermVector(resp.getFields(), "field1", false);

    // Since the index is empty, all of the artificial document's "term_statistics" should be 0 or absent
    Terms terms = resp.getFields().terms("field1");
    assertEquals("sumDocFreq should be 0 for a non-existing field!", 0, terms.getSumDocFreq());
    assertEquals("sumTotalTermFreq should be 0 for a non-existing field!", 0, terms.getSumTotalTermFreq());
    TermsEnum termsEnum = terms.iterator(); // we're guaranteed to receive terms for that field
    while (termsEnum.next() != null) {
        String term = termsEnum.term().utf8ToString();
        assertEquals("term [" + term + "] does not exist in the index; ttf should be 0!", 0, termsEnum.totalTermFreq());
    }
}
Project: elasticsearch_my    File: GetTermVectorsIT.java
private void checkBestTerms(Terms terms, List<String> expectedTerms) throws IOException {
    final TermsEnum termsEnum = terms.iterator();
    List<String> bestTerms = new ArrayList<>();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        bestTerms.add(text.utf8ToString());
    }
    Collections.sort(expectedTerms);
    Collections.sort(bestTerms);
    assertArrayEquals(expectedTerms.toArray(), bestTerms.toArray());
}
Project: lams    File: CompressingTermVectorsReader.java
@Override
public Terms terms(String field) throws IOException {
  final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
  if (fieldInfo == null) {
    return null;
  }
  int idx = -1;
  for (int i = 0; i < fieldNumOffs.length; ++i) {
    if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
      idx = i;
      break;
    }
  }

  if (idx == -1 || numTerms[idx] == 0) {
    // no term
    return null;
  }
  int fieldOff = 0, fieldLen = -1;
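  // walk the per-field lengths to locate this field's slice of the shared term bytes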
  for (int i = 0; i < fieldNumOffs.length; ++i) {
    if (i < idx) {
      fieldOff += fieldLengths[i];
    } else {
      fieldLen = fieldLengths[i];
      break;
    }
  }
  assert fieldLen >= 0;
  return new TVTerms(numTerms[idx], fieldFlags[idx],
      prefixLengths[idx], suffixLengths[idx], termFreqs[idx],
      positionIndex[idx], positions[idx], startOffsets[idx], lengths[idx],
      payloadIndex[idx], payloadBytes,
      new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
}