@Override
public String highlight(String locale, boolean useStopWords, String query, String content,
        String pre, String post, int preview) {
    Analyzer analyzer = termAnalyzers.findAnalyzer(locale, useStopWords);
    QueryParser parser = new QueryParser(defaultField, analyzer);
    String summary = null;
    try {
        // Wrap matched terms in the caller-supplied pre/post tags.
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(pre, post);
        Highlighter hg = new Highlighter(formatter, new QueryTermScorer(parser.parse(query)));
        hg.setMaxDocCharsToAnalyze(preview);
        hg.setTextFragmenter(new SimpleFragmenter(100));
        // Re-analyze the stored content; no term vectors are required.
        TokenStream tokens = TokenSources.getTokenStream(defaultField, content, analyzer);
        summary = hg.getBestFragments(tokens, content, 4, " ... ");
    } catch (InvalidTokenOffsetsException | IOException | ParseException ex) {
        LOG.error("Failed to highlight", ex);
    }
    return StringUtils.isBlank(summary) ? null : summary;
}
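The method above leans on surrounding state (termAnalyzers, defaultField, LOG). A minimal, self-contained sketch of the same parse-then-highlight flow, assuming Lucene 5.x-era packages and a plain StandardAnalyzer in place of the locale-aware analyzer lookup:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        QueryParser parser = new QueryParser("content", analyzer);

        Highlighter hg = new Highlighter(
                new SimpleHTMLFormatter("<em>", "</em>"),
                new QueryTermScorer(parser.parse("lucene")));
        hg.setTextFragmenter(new SimpleFragmenter(100));

        String content = "Apache Lucene is a full-text search library written in Java.";
        // Re-analyze the raw text; no index or term vectors are needed here.
        TokenStream tokens = analyzer.tokenStream("content", content);
        System.out.println(hg.getBestFragments(tokens, content, 4, " ... "));
    }
}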
public void testHits() throws Exception {
    IndexSearcher searcher = new IndexSearcher(TestUtil.getBookIndexDirectory());
    TermQuery query = new TermQuery(new Term("title", "action"));
    TopDocs hits = searcher.search(query, 10);

    QueryScorer scorer = new QueryScorer(query, "title");
    Highlighter highlighter = new Highlighter(scorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));

    Analyzer analyzer = new SimpleAnalyzer();
    for (ScoreDoc sd : hits.scoreDocs) {
        StoredDocument doc = searcher.doc(sd.doc);
        String title = doc.get("title");
        // Prefer the stored term vector if available; otherwise re-analyze the stored value.
        TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
                sd.doc, "title", doc, analyzer);
        String fragment = highlighter.getBestFragment(stream, title);
        LOGGER.info(fragment);
    }
}
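SimpleSpanFragmenter sizes fragments around the matched spans; the scorer-only constructor used above defaults to 100-character fragments. The size can be set explicitly with the two-argument constructor (a minor variant, not part of the original test):

// Produce fragments of roughly 200 characters centered on the matched spans.
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 200));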
/**
 * Performs highlighting for a given query and a given document.
 *
 * @param indexSearcher the IndexSearcher performing the query
 * @param query the Tripod LuceneQuery
 * @param scoreDoc the Lucene ScoreDoc
 * @param doc the Lucene Document
 * @param highlighter the Highlighter to use
 * @param result the QueryResult to add the highlights to
 * @throws IOException if an I/O error occurs performing the highlighting
 * @throws InvalidTokenOffsetsException if the highlighter encounters invalid token offsets
 */
protected void performHighlighting(final IndexSearcher indexSearcher, final Query query,
        final ScoreDoc scoreDoc, final Document doc, final Highlighter highlighter,
        final QR result) throws IOException, InvalidTokenOffsetsException {

    if (query.getHighlightFields() == null || query.getHighlightFields().isEmpty()) {
        return;
    }

    final List<Highlight> highlights = new ArrayList<>();
    final List<String> hlFieldNames = getHighlightFieldNames(query, doc);

    // process each field to highlight on
    for (String hlField : hlFieldNames) {
        final String text = doc.get(hlField);
        if (StringUtils.isEmpty(text)) {
            continue;
        }

        final List<String> snippets = new ArrayList<>();
        final Fields tvFields = indexSearcher.getIndexReader().getTermVectors(scoreDoc.doc);
        final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;

        // get the snippets for the given field
        final TokenStream tokenStream = TokenSources.getTokenStream(hlField, tvFields, text, analyzer, maxStartOffset);
        final TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, false, 10);
        for (TextFragment textFragment : textFragments) {
            if (textFragment != null && textFragment.getScore() > 0) {
                snippets.add(textFragment.toString());
            }
        }

        // if we have snippets then add a highlight result to the QueryResult
        if (!snippets.isEmpty()) {
            highlights.add(new Highlight(hlField, snippets));
        }
    }

    result.setHighlights(highlights);
}
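The getHighlightFieldNames helper is not part of this excerpt. A plausible sketch, assuming the Tripod query exposes its requested highlight fields and that a "*" entry means every stored field on the document (both of these are assumptions, not the project's confirmed behavior):

// Hypothetical sketch of the helper referenced above.
protected List<String> getHighlightFieldNames(final Query query, final Document doc) {
    final List<String> requested = query.getHighlightFields();
    if (requested.contains("*")) {
        // Expand the wildcard to the distinct stored field names on this document.
        final List<String> names = new ArrayList<>();
        for (IndexableField field : doc.getFields()) {
            if (!names.contains(field.name())) {
                names.add(field.name());
            }
        }
        return names;
    }
    return requested;
}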
@Override
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q) {
    highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
    highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    return new BenchmarkHighlighter() {
        @Override
        public int doHighlight(IndexReader reader, int doc, String field, Document document,
                Analyzer analyzer, String text) throws Exception {
            TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
            return frag != null ? frag.length : 0;
        }
    };
}
@Override
public BenchmarkHighlighter getBenchmarkHighlighter(Query q) {
    highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
    return new BenchmarkHighlighter() {
        @Override
        public int doHighlight(IndexReader reader, int doc, String field, Document document,
                Analyzer analyzer, String text) throws Exception {
            TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
            // Unlike the variant above, this one also accumulates a running total.
            numHighlightedResults += frag != null ? frag.length : 0;
            return frag != null ? frag.length : 0;
        }
    };
}
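These two getBenchmarkHighlighter variants differ only in that the second accumulates numHighlightedResults. A hypothetical driver loop, to show how the returned callback might be exercised; the real Lucene benchmark ReadTask wiring is not shown in this excerpt, and the reader, searcher, analyzer, query, and field variables are assumed to be in scope:

// Hypothetical driver (call from a method that declares `throws Exception`,
// since doHighlight does).
BenchmarkHighlighter bh = getBenchmarkHighlighter(query);
int totalFragments = 0;
for (ScoreDoc sd : searcher.search(query, 10).scoreDocs) {
    Document d = searcher.doc(sd.doc);
    String text = d.get(field);
    if (text != null) {
        totalFragments += bh.doHighlight(reader, sd.doc, field, d, analyzer, text);
    }
}
System.out.println("fragments highlighted: " + totalFragments);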
private String searchData(String key) throws IOException, ParseException, InvalidTokenOffsetsException {
    Directory directory = FSDirectory.open(new File(filePath));
    IndexSearcher indexSearcher = new IndexSearcher(directory);
    QueryParser queryParser = new QueryParser(Version.LUCENE_31, "foods",
            new SmartChineseAnalyzer(Version.LUCENE_31, true));
    //queryParser.setDefaultOperator(Operator.AND);
    Query query = queryParser.parse(key);
    // IndexSearcher.search never returns null, so the original `if (docs != null)`
    // branch was dead code and has been dropped.
    TopDocs docs = indexSearcher.search(query, 10);

    QueryScorer queryScorer = new QueryScorer(query, "foods");
    Highlighter highlighter = new Highlighter(queryScorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer));

    List<SearchResult> searchResults = new ArrayList<SearchResult>();
    for (ScoreDoc scoreDoc : docs.scoreDocs) {
        Document doc = indexSearcher.doc(scoreDoc.doc);
        TokenStream tokenStream = TokenSources.getAnyTokenStream(
                indexSearcher.getIndexReader(), scoreDoc.doc, "foods", doc,
                new SmartChineseAnalyzer(Version.LUCENE_31, true));
        SearchResult searchResult = new SearchResult();
        searchResult.setRestaurantId(Long.valueOf(doc.get("id")));
        searchResult.setRestaurantName(doc.get("restaurant_name"));
        searchResult.setKey(key);
        // getBestFragment(...) returns null when nothing in "foods" matched;
        // the original split(" ") directly on the result, risking an NPE.
        // Requires java.util.Collections.
        String fragment = highlighter.getBestFragment(tokenStream, doc.get("foods"));
        searchResult.setFoods(fragment != null
                ? Arrays.asList(fragment.split(" "))
                : Collections.<String>emptyList());
        searchResults.add(searchResult);
    }
    indexSearcher.close();
    directory.close();
    return new Gson().toJson(searchResults);
}
/**
 * NOTE: This method will not preserve the correct field types.
 *
 * @param preTag the tag inserted before each highlighted term (e.g. "<b>")
 * @param postTag the tag inserted after each highlighted term (e.g. "</b>")
 */
public static Document highlight(int docId, Document document, Query query, FieldManager fieldManager,
        IndexReader reader, String preTag, String postTag) throws IOException, InvalidTokenOffsetsException {

    String fieldLessFieldName = fieldManager.getFieldLessFieldName();
    Query fixedQuery = fixSuperQuery(query, null, fieldLessFieldName);
    Analyzer analyzer = fieldManager.getAnalyzerForQuery();
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(preTag, postTag);

    Document result = new Document();
    for (IndexableField f : document) {
        String name = f.name();
        // Pass through fields that should never be highlighted.
        if (fieldLessFieldName.equals(name) || FIELDS_NOT_TO_HIGHLIGHT.contains(name)) {
            result.add(f);
            continue;
        }
        String text = f.stringValue();
        Number numericValue = f.numericValue();

        // Field-less (catch-all) indexed fields need the query re-targeted at this field.
        Query fieldFixedQuery;
        if (fieldManager.isFieldLessIndexed(name)) {
            fieldFixedQuery = fixSuperQuery(query, name, fieldLessFieldName);
        } else {
            fieldFixedQuery = fixedQuery;
        }

        if (numericValue != null) {
            // Numeric values cannot be fragment-highlighted; wrap the whole value.
            if (shouldNumberBeHighlighted(name, numericValue, fieldFixedQuery)) {
                String numberHighlight = preTag + text + postTag;
                result.add(new StringField(name, numberHighlight, Store.YES));
            }
        } else {
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(fieldFixedQuery, name));
            TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, name, analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            for (int j = 0; j < frag.length; j++) {
                if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                    result.add(new StringField(name, frag[j].toString(), Store.YES));
                }
            }
        }
    }
    return result;
}
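A hedged usage sketch for the method above, assuming the surrounding Blur-style fieldManager and an already-open IndexReader; the variable names here are illustrative, not from the original source:

// Hypothetical usage: highlight the stored fields of each hit.
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs hits = searcher.search(query, 10);
for (ScoreDoc sd : hits.scoreDocs) {
    Document stored = searcher.doc(sd.doc);
    Document highlighted = highlight(sd.doc, stored, query, fieldManager, reader, "<b>", "</b>");
    for (IndexableField field : highlighted) {
        System.out.println(field.name() + ": " + field.stringValue());
    }
}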
public static void main(String[] args) throws Exception {
    if (args.length != 0) {
        QUERY = args[0];
    }
    // Wrap Paoding as an Analyzer conforming to Lucene's contract
    Analyzer analyzer = new PaodingAnalyzer();
    // Read the text.txt file in this class's directory
    String content = ContentReader.readText(English.class);
    // Standard Lucene indexing and search code follows
    Directory ramDir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(ramDir, analyzer);
    Document doc = new Document();
    Field fd = new Field(FIELD_NAME, content, Field.Store.YES,
            Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(fd);
    writer.addDocument(doc);
    writer.optimize();
    writer.close();

    IndexReader reader = IndexReader.open(ramDir);
    String queryString = QUERY;
    QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
    Query query = parser.parse(queryString);
    Searcher searcher = new IndexSearcher(ramDir);
    query = query.rewrite(reader);
    System.out.println("Searching for: " + query.toString(FIELD_NAME));
    Hits hits = searcher.search(query);

    BoldFormatter formatter = new BoldFormatter();
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(50));
    for (int i = 0; i < hits.length(); i++) {
        String text = hits.doc(i).get(FIELD_NAME);
        int maxNumFragmentsRequired = 5;
        String fragmentSeparator = "...";
        // Rebuild the token stream from the stored term vector (positions + offsets).
        TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(hits.id(i), FIELD_NAME);
        TokenStream tokenStream = TokenSources.getTokenStream(tpv);
        String result = highlighter.getBestFragments(tokenStream, text,
                maxNumFragmentsRequired, fragmentSeparator);
        System.out.println("\n" + result);
    }
    reader.close();
}
public static void main(String[] args) throws Exception {
    if (args.length != 0) {
        QUERY = args[0];
    }
    // Wrap Paoding as an Analyzer conforming to Lucene's contract
    Analyzer analyzer = new PaodingAnalyzer();
    // Read the text.txt file in this class's directory
    String content = ContentReader.readText(Chinese.class);
    // Standard Lucene indexing and search code follows
    // (identical to the English example above except for the content source)
    Directory ramDir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(ramDir, analyzer);
    Document doc = new Document();
    Field fd = new Field(FIELD_NAME, content, Field.Store.YES,
            Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(fd);
    writer.addDocument(doc);
    writer.optimize();
    writer.close();

    IndexReader reader = IndexReader.open(ramDir);
    String queryString = QUERY;
    QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
    Query query = parser.parse(queryString);
    Searcher searcher = new IndexSearcher(ramDir);
    query = query.rewrite(reader);
    System.out.println("Searching for: " + query.toString(FIELD_NAME));
    Hits hits = searcher.search(query);

    BoldFormatter formatter = new BoldFormatter();
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(50));
    for (int i = 0; i < hits.length(); i++) {
        String text = hits.doc(i).get(FIELD_NAME);
        int maxNumFragmentsRequired = 5;
        String fragmentSeparator = "...";
        // Rebuild the token stream from the stored term vector (positions + offsets).
        TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(hits.id(i), FIELD_NAME);
        TokenStream tokenStream = TokenSources.getTokenStream(tpv);
        String result = highlighter.getBestFragments(tokenStream, text,
                maxNumFragmentsRequired, fragmentSeparator);
        System.out.println("\n" + result);
    }
    reader.close();
}
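The two Paoding examples above target the long-deprecated Lucene 2.x API (Hits, Searcher, writer.optimize(), Field.Index.TOKENIZED, TermPositionVector). A sketch of the same index-then-highlight flow against the Lucene 5.x-era API, with StandardAnalyzer standing in for PaodingAnalyzer (an assumption, since Paoding predates these APIs) and re-analysis of the stored text in place of the term-vector rebuild:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class ModernHighlight {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(); // stand-in for PaodingAnalyzer
        Directory dir = new RAMDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            Document doc = new Document();
            doc.add(new TextField("content",
                    "The quick brown fox jumps over the lazy dog", Field.Store.YES));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new QueryParser("content", analyzer).parse("fox");
            TopDocs hits = searcher.search(query, 10); // Hits is gone; use TopDocs
            Highlighter highlighter = new Highlighter(new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(50));
            for (ScoreDoc sd : hits.scoreDocs) {
                String text = searcher.doc(sd.doc).get("content");
                // Re-analyze the stored text instead of reading back a term vector.
                TokenStream ts = analyzer.tokenStream("content", text);
                System.out.println(highlighter.getBestFragments(ts, text, 5, "..."));
            }
        }
    }
}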