private void duelRun(PercolateQuery.QueryStore queryStore, MemoryIndex memoryIndex, IndexSearcher shardSearcher) throws IOException {
    // The percolate query and the equivalent (but slower) control query must return identical hits, scores and explanations.
    boolean requireScore = randomBoolean();
    IndexSearcher percolateSearcher = memoryIndex.createSearcher();
    Query percolateQuery = fieldType.percolateQuery("type", queryStore, new BytesArray("{}"), percolateSearcher);
    Query query = requireScore ? percolateQuery : new ConstantScoreQuery(percolateQuery);
    TopDocs topDocs = shardSearcher.search(query, 10);

    Query controlQuery = new ControlQuery(memoryIndex, queryStore);
    controlQuery = requireScore ? controlQuery : new ConstantScoreQuery(controlQuery);
    TopDocs controlTopDocs = shardSearcher.search(controlQuery, 10);

    assertThat(topDocs.totalHits, equalTo(controlTopDocs.totalHits));
    assertThat(topDocs.scoreDocs.length, equalTo(controlTopDocs.scoreDocs.length));
    for (int j = 0; j < topDocs.scoreDocs.length; j++) {
        assertThat(topDocs.scoreDocs[j].doc, equalTo(controlTopDocs.scoreDocs[j].doc));
        assertThat(topDocs.scoreDocs[j].score, equalTo(controlTopDocs.scoreDocs[j].score));
        if (requireScore) {
            Explanation explain1 = shardSearcher.explain(query, topDocs.scoreDocs[j].doc);
            Explanation explain2 = shardSearcher.explain(controlQuery, controlTopDocs.scoreDocs[j].doc);
            assertThat(explain1.isMatch(), equalTo(explain2.isMatch()));
            assertThat(explain1.getValue(), equalTo(explain2.getValue()));
        }
    }
}
private Fields generateTermVectors(Collection<GetField> getFields, boolean withOffsets,
                                   @Nullable Map<String, String> perFieldAnalyzer, Set<String> fields) throws IOException {
    /* store document in memory index */
    MemoryIndex index = new MemoryIndex(withOffsets);
    for (GetField getField : getFields) {
        String field = getField.getName();
        if (fields.contains(field) == false) {
            // some fields are returned even when not asked for, eg. _timestamp
            continue;
        }
        Analyzer analyzer = getAnalyzerAtField(field, perFieldAnalyzer);
        for (Object text : getField.getValues()) {
            index.addField(field, text.toString(), analyzer);
        }
    }
    /* and read vectors from it */
    return MultiFields.getFields(index.createSearcher().getIndexReader());
}
MemoryIndex indexDoc(ParseContext.Document d, Analyzer analyzer, MemoryIndex memoryIndex) {
    for (IndexableField field : d.getFields()) {
        if (field.fieldType().indexOptions() == IndexOptions.NONE && field.name().equals(UidFieldMapper.NAME)) {
            continue;
        }
        try {
            // TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
            // like the indexer does
            try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
                if (tokenStream != null) {
                    memoryIndex.addField(field.name(), tokenStream, field.boost());
                }
            }
        } catch (IOException e) {
            throw new ElasticsearchException("Failed to create token stream", e);
        }
    }
    return memoryIndex;
}
@Override
public void prepare(PercolateContext context, ParsedDocument parsedDocument) {
    MemoryIndex memoryIndex = cache.get();
    for (IndexableField field : parsedDocument.rootDoc().getFields()) {
        if (field.fieldType().indexOptions() == IndexOptions.NONE && field.name().equals(UidFieldMapper.NAME)) {
            continue;
        }
        try {
            Analyzer analyzer = context.mapperService().documentMapper(parsedDocument.type()).mappers().indexAnalyzer();
            // TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
            // like the indexer does
            try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
                if (tokenStream != null) {
                    memoryIndex.addField(field.name(), tokenStream, field.boost());
                }
            }
        } catch (Exception e) {
            throw new ElasticsearchException("Failed to create token stream for [" + field.name() + "]", e);
        }
    }
    context.initialize(new DocEngineSearcher(memoryIndex), parsedDocument);
}
public static void main(String[] args) throws ParseException {
    Analyzer analyzer = new StandardAnalyzer();
    MemoryIndex index = new MemoryIndex();

    // Index a single "document", held as a map of field name to field value.
    Map<String, String> event = new HashMap<String, String>();
    event.put("content", "Readings about Salmons and other select Alaska fishing Manuals");
    event.put("author", "Tales of James");
    for (Entry<String, String> entry : event.entrySet()) {
        index.addField(entry.getKey(), entry.getValue(), analyzer);
    }

    // Run a query against the in-memory document; a score above zero means it matched.
    QueryParser parser = new QueryParser("content", analyzer);
    Query query = parser.parse("+author:james +salmon~ +fish* manual~");
    float score = index.search(query);
    if (score > 0.0f) {
        System.out.println("it's a match");
    } else {
        System.out.println("no match found");
    }
    System.out.println("indexData=" + index.toString());
}
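// A minimal sketch of the same one-off match expressed through the MemoryIndex.fromDocument(...) convenience
// factory that several snippets below rely on. It assumes Lucene 6+ with the lucene-memory and lucene-queryparser
// modules on the classpath; the class name, field names and query string are illustrative, not from the original code.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class FromDocumentExample {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer();

        // Build a regular Lucene Document instead of adding fields one by one.
        Document doc = new Document();
        doc.add(new TextField("content", "Readings about Salmons and other select Alaska fishing Manuals", Store.NO));
        doc.add(new TextField("author", "Tales of James", Store.NO));

        // fromDocument analyzes every field of the document into a single in-memory index.
        MemoryIndex index = MemoryIndex.fromDocument(doc, analyzer);

        Query query = new QueryParser("content", analyzer).parse("+author:james +salmon~ +fish* manual~");
        float score = index.search(query);
        System.out.println(score > 0.0f ? "it's a match" : "no match found");
    }
}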
private void bestScoreClassification(Client client, IndexDocument document, LanguageEnum lang, MemoryIndex index)
        throws ParseException, SearchLibException, SyntaxError, IOException {
    // Keep only the classifier item whose query scores highest against the in-memory document.
    ClassifierItem selectedItem = null;
    float maxScore = 0;
    for (ClassifierItem item : valueSet) {
        float score = item.score(client, lang, index);
        if (score > maxScore) {
            selectedItem = item;
            maxScore = score;
        }
    }
    if (selectedItem != null) {
        document.add(getFieldName(), selectedItem.getValue(), selectedItem.getBoost());
        if (scoreFieldName != null && scoreFieldName.length() > 0) {
            document.addString(scoreFieldName, scoreFormat.format(maxScore));
        }
    } else {
        if (defaultValue != null && defaultValue.length() > 0)
            document.add(fieldName, defaultValue, 1.0F);
    }
}
public void classification(Client client, IndexDocument document)
        throws SearchLibException, ParseException, SyntaxError, IOException {
    rwl.r.lock();
    try {
        MemoryIndex index = new MemoryIndex();
        LanguageEnum lang = document.getLang();
        Analyzer analyzer = client.getSchema().getIndexPerFieldAnalyzer(lang);
        for (FieldContent fieldContent : document) {
            String fieldName = fieldContent.getField();
            String concatValues = fieldContent.getMergedValues(" ");
            index.addField(fieldName, concatValues, analyzer);
        }
        if (method == ClassificationMethodEnum.MULTIVALUED)
            multivaluedClassification(client, document, lang, index);
        else if (method == ClassificationMethodEnum.BESTSCORE)
            bestScoreClassification(client, document, lang, index);
    } finally {
        rwl.r.unlock();
    }
}
public void testCreateCandidateQuery() throws Exception {
    addQueryMapping();

    MemoryIndex memoryIndex = new MemoryIndex(false);
    memoryIndex.addField("field1", "the quick brown fox jumps over the lazy dog", new WhitespaceAnalyzer());
    memoryIndex.addField("field2", "some more text", new WhitespaceAnalyzer());
    memoryIndex.addField("_field3", "unhide me", new WhitespaceAnalyzer());
    memoryIndex.addField("field4", "123", new WhitespaceAnalyzer());
    memoryIndex.addField(new LongPoint("number_field", 10L), new WhitespaceAnalyzer());

    IndexReader indexReader = memoryIndex.createSearcher().getIndexReader();

    BooleanQuery candidateQuery = (BooleanQuery) fieldType.createCandidateQuery(indexReader);
    assertEquals(2, candidateQuery.clauses().size());
    assertEquals(Occur.SHOULD, candidateQuery.clauses().get(0).getOccur());
    TermInSetQuery termsQuery = (TermInSetQuery) candidateQuery.clauses().get(0).getQuery();

    PrefixCodedTerms terms = termsQuery.getTermData();
    assertThat(terms.size(), equalTo(14L));
    PrefixCodedTerms.TermIterator termIterator = terms.iterator();
    assertTermIterator(termIterator, "_field3\u0000me", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "_field3\u0000unhide", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000brown", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000dog", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000fox", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000jumps", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000lazy", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000over", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000quick", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field1\u0000the", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field2\u0000more", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field2\u0000some", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field2\u0000text", fieldType.queryTermsField.name());
    assertTermIterator(termIterator, "field4\u0000123", fieldType.queryTermsField.name());

    assertEquals(Occur.SHOULD, candidateQuery.clauses().get(1).getOccur());
    assertEquals(new TermQuery(new Term(fieldType.extractionResultField.name(), EXTRACTION_FAILED)),
            candidateQuery.clauses().get(1).getQuery());
}
@Override
protected Query rewrite(Query query) throws IOException {
    // TemplateQueryBuilder adds some optimization if the template and query builder have boosts / query names that wraps
    // the actual QueryBuilder that comes from the template into a BooleanQueryBuilder to give it an outer boost / name
    // this causes some queries to be not exactly equal but equivalent such that we need to rewrite them before comparing.
    if (query != null) {
        MemoryIndex idx = new MemoryIndex();
        return idx.createSearcher().rewrite(query);
    }
    return new MatchAllDocsQuery(); // null == *:*
}
@Override
protected Query rewrite(Query query) throws IOException {
    // WrapperQueryBuilder adds some optimization if the wrapper and query builder have boosts / query names that wraps
    // the actual QueryBuilder that comes from the binary blob into a BooleanQueryBuilder to give it an outer boost / name
    // this causes some queries to be not exactly equal but equivalent such that we need to rewrite them before comparing.
    if (query != null) {
        MemoryIndex idx = new MemoryIndex();
        return idx.createSearcher().rewrite(query);
    }
    return new MatchAllDocsQuery(); // null == *:*
}
/**
 * Here we could go overboard and use a pre-generated indexed random document for a given Item,
 * but for now we'd prefer to simply return the id as the content of the document and that for
 * every field.
 */
private static Fields generateFields(String[] fieldNames, String text) throws IOException {
    MemoryIndex index = new MemoryIndex();
    for (String fieldName : fieldNames) {
        index.addField(fieldName, text, new WhitespaceAnalyzer());
    }
    return MultiFields.getFields(index.createSearcher().getIndexReader());
}
@Inject
public PercolatorService(Settings settings, IndexNameExpressionResolver indexNameExpressionResolver, IndicesService indicesService,
                         PageCacheRecycler pageCacheRecycler, BigArrays bigArrays, HighlightPhase highlightPhase,
                         ClusterService clusterService, AggregationPhase aggregationPhase, ScriptService scriptService,
                         MappingUpdatedAction mappingUpdatedAction) {
    super(settings);
    this.indexNameExpressionResolver = indexNameExpressionResolver;
    this.parseFieldMatcher = new ParseFieldMatcher(settings);
    this.indicesService = indicesService;
    this.pageCacheRecycler = pageCacheRecycler;
    this.bigArrays = bigArrays;
    this.clusterService = clusterService;
    this.highlightPhase = highlightPhase;
    this.aggregationPhase = aggregationPhase;
    this.scriptService = scriptService;
    this.mappingUpdatedAction = mappingUpdatedAction;
    this.sortParseElement = new SortParseElement();

    final long maxReuseBytes = settings.getAsBytesSize("indices.memory.memory_index.size_per_thread",
            new ByteSizeValue(1, ByteSizeUnit.MB)).bytes();
    cache = new CloseableThreadLocal<MemoryIndex>() {
        @Override
        protected MemoryIndex initialValue() {
            // TODO: should we expose payloads as an option? should offsets be turned on always?
            return new ExtendedMemoryIndex(true, false, maxReuseBytes);
        }
    };
    single = new SingleDocumentPercolatorIndex(cache);
    multi = new MultiDocumentPercolatorIndex(cache);

    percolatorTypes = new IntObjectHashMap<>(6);
    percolatorTypes.put(countPercolator.id(), countPercolator);
    percolatorTypes.put(queryCountPercolator.id(), queryCountPercolator);
    percolatorTypes.put(matchPercolator.id(), matchPercolator);
    percolatorTypes.put(queryPercolator.id(), queryPercolator);
    percolatorTypes.put(scoringPercolator.id(), scoringPercolator);
    percolatorTypes.put(topMatchingPercolator.id(), topMatchingPercolator);
}
public void buildTermVector(int docid) throws IOException {
    // Re-analyze the stored "content" field of a document into a MemoryIndex and print its term statistics.
    Set<String> fieldList = new HashSet<>();
    fieldList.add("content");
    Document doc = reader.document(docid, fieldList);

    MemoryIndex mi = MemoryIndex.fromDocument(doc, new StandardAnalyzer());
    IndexReader mr = mi.createSearcher().getIndexReader();
    Terms t = mr.leaves().get(0).reader().terms("content");
    if ((t != null) && (t.size() > 0)) {
        TermsEnum te = t.iterator();
        BytesRef term = null;
        System.out.println(t.size());
        while ((term = te.next()) != null) {
            System.out.println("BytesRef: " + term.utf8ToString());
            System.out.println("docFreq: " + te.docFreq());
            System.out.println("totalTermFreq: " + te.totalTermFreq());
        }
    }
}
@Test
public void testRewrite() throws IOException {
    MemoryIndex memoryIndex = new MemoryIndex();
    TaggedQuery taggedQuery = new TaggedQuery(new TermQuery(new Term("field", "value")), "tag");
    Query rewrittenQuery = taggedQuery.rewrite(memoryIndex.createSearcher().getTopReaderContext().reader());
    assertTrue(rewrittenQuery instanceof TermQuery);
    assertEquals("field", ((TermQuery) rewrittenQuery).getTerm().field());
    assertEquals("value", ((TermQuery) rewrittenQuery).getTerm().text());
}
protected static void search(MemoryIndex index, Query query) {
    float score = index.search(query);
    if (score > 0.0f) {
        System.out.println("it's a match for " + query);
    } else {
        System.out.println("no match found for " + query);
    }
}
public static void main(String[] args) throws ParseException, IOException {
    MemoryIndex index = new MemoryIndex();
    Analyzer analyzer = new StandardAnalyzer();

    // A StringField is indexed as a single untokenized term, so only the exact full value matches.
    StringField field3 = new StringField(AUTHOR, FULL_NAME, Store.YES);
    index.addField(field3, analyzer);

    Query query = new TermQuery(new Term(AUTHOR, FULL_NAME));
    search(index, query);
    query = new TermQuery(new Term(AUTHOR, FIRST_NAME));
    search(index, query);
    query = new TermQuery(new Term(AUTHOR, LAST_NAME));
    search(index, query);
}
private void multivaluedClassification(Client client, IndexDocument document, LanguageEnum lang, MemoryIndex index)
        throws ParseException, SearchLibException, SyntaxError, IOException {
    boolean setDefaultValue = defaultValue != null && defaultValue.length() > 0;
    for (ClassifierItem item : valueSet) {
        float score = item.score(client, lang, index);
        if (score > 0.0f) {
            document.add(fieldName, item.getValue(), item.getBoost());
            if (scoreFieldName != null && scoreFieldName.length() > 0)
                document.addString(scoreFieldName, Float.toString(score));
            setDefaultValue = false;
        }
    }
    if (setDefaultValue)
        document.add(fieldName, defaultValue, 1.0F);
}
protected final float score(Client client, LanguageEnum lang, MemoryIndex index)
        throws ParseException, SearchLibException, SyntaxError, IOException {
    // Cache the parsed query per language so it is only built once, then score it against the in-memory document.
    Query qry = queryMap.get(lang);
    if (qry == null) {
        AbstractSearchRequest searchRequest = getSearchRequest(client, lang);
        qry = searchRequest.getQuery();
        queryMap.put(lang, qry);
    }
    return index.search(qry);
}
public final double searchScore(final String fieldName, final CompiledAnalyzer analyzer, final Query query) {
    searchScore = 0;
    if (query == null || analyzer == null)
        return 0;
    MemoryIndex index = new MemoryIndex();
    index.addField(fieldName, originalText, analyzer);
    searchScore = index.search(query);
    return searchScore;
}
public void testDuelSpecificQueries() throws Exception {
    List<ParseContext.Document> documents = new ArrayList<>();

    CommonTermsQuery commonTermsQuery = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128);
    commonTermsQuery.add(new Term("field", "quick"));
    commonTermsQuery.add(new Term("field", "brown"));
    commonTermsQuery.add(new Term("field", "fox"));
    addQuery(commonTermsQuery, documents);

    BlendedTermQuery blendedTermQuery = BlendedTermQuery.booleanBlendedQuery(
            new Term[]{new Term("field", "quick"), new Term("field", "brown"), new Term("field", "fox")}, false);
    addQuery(blendedTermQuery, documents);

    SpanNearQuery spanNearQuery = new SpanNearQuery.Builder("field", true)
            .addClause(new SpanTermQuery(new Term("field", "quick")))
            .addClause(new SpanTermQuery(new Term("field", "brown")))
            .addClause(new SpanTermQuery(new Term("field", "fox")))
            .build();
    addQuery(spanNearQuery, documents);

    SpanNearQuery spanNearQuery2 = new SpanNearQuery.Builder("field", true)
            .addClause(new SpanTermQuery(new Term("field", "the")))
            .addClause(new SpanTermQuery(new Term("field", "lazy")))
            .addClause(new SpanTermQuery(new Term("field", "doc")))
            .build();
    SpanOrQuery spanOrQuery = new SpanOrQuery(spanNearQuery, spanNearQuery2);
    addQuery(spanOrQuery, documents);

    SpanNotQuery spanNotQuery = new SpanNotQuery(spanNearQuery, spanNearQuery);
    addQuery(spanNotQuery, documents);

    long lowerLong = randomIntBetween(0, 256);
    long upperLong = lowerLong + randomIntBetween(0, 32);
    addQuery(LongPoint.newRangeQuery("long_field", lowerLong, upperLong), documents);

    indexWriter.addDocuments(documents);
    indexWriter.close();

    directoryReader = DirectoryReader.open(directory);
    IndexSearcher shardSearcher = newSearcher(directoryReader);
    // Disable query cache, because ControlQuery cannot be cached...
    shardSearcher.setQueryCache(null);

    Document document = new Document();
    document.add(new TextField("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO));
    long randomLong = randomIntBetween((int) lowerLong, (int) upperLong);
    document.add(new LongPoint("long_field", randomLong));
    MemoryIndex memoryIndex = MemoryIndex.fromDocument(document, new WhitespaceAnalyzer());
    duelRun(queryStore, memoryIndex, shardSearcher);
}
private ControlQuery(MemoryIndex memoryIndex, PercolateQuery.QueryStore queryStore) {
    this.memoryIndex = memoryIndex;
    this.queryStore = queryStore;
}
MultiDocumentPercolatorIndex(CloseableThreadLocal<MemoryIndex> cache) { this.cache = cache; }
private DocSearcher(IndexSearcher searcher, MemoryIndex rootDocMemoryIndex) {
    super("percolate", searcher);
    this.rootDocMemoryIndex = rootDocMemoryIndex;
}
SingleDocumentPercolatorIndex(CloseableThreadLocal<MemoryIndex> cache) { this.cache = cache; }
public DocEngineSearcher(MemoryIndex memoryIndex) {
    super("percolate", memoryIndex.createSearcher());
    this.memoryIndex = memoryIndex;
}
public Map<String, Map<String, List<Integer>>> buildTermVectorWithPosition(int docid, Set<String> fields) throws IOException {
    // Re-analyze the stored fields of a document into a MemoryIndex and collect, per field,
    // every term together with the positions at which it occurs.
    Map<String, Map<String, List<Integer>>> fieldToTermVector = new HashMap<>();
    Document doc = reader.document(docid, fields);
    MemoryIndex mi = MemoryIndex.fromDocument(doc, new StandardAnalyzer());
    IndexReader mr = mi.createSearcher().getIndexReader();
    for (LeafReaderContext leafContext : mr.leaves()) {
        LeafReader leaf = leafContext.reader();
        for (String field : fields) {
            Map<String, List<Integer>> termToPositions = new HashMap<>();
            Terms t = leaf.terms(field);
            if (t != null) {
                fieldToTermVector.put(field, termToPositions);
                TermsEnum tenum = t.iterator();
                BytesRef termBytes = null;
                PostingsEnum postings = null;
                while ((termBytes = tenum.next()) != null) {
                    List<Integer> positions = new ArrayList<>();
                    termToPositions.put(termBytes.utf8ToString(), positions);
                    postings = tenum.postings(postings);
                    postings.advance(0);
                    for (int i = 0; i < postings.freq(); i++) {
                        positions.add(postings.nextPosition());
                    }
                }
            }
        }
    }
    return fieldToTermVector;
}