/**
 * Fills {@code vectors} with one {@link SnippetVector} per offset of every
 * term present in the term vector, timing each lookup phase under
 * {@code parentTimer}.
 *
 * @param termVector  positions/offsets source for the document field
 * @param terms       terms to look up; the vector index passed to each
 *                    SnippetVector is the term's position in this array
 * @param vectors     output collection, appended to in place
 * @param parentTimer parent for all per-phase timers
 * @throws SearchLibException propagated from the term-vector lookups
 */
private static final void populate(final TermPositionVector termVector,
		final String[] terms, final Collection<SnippetVector> vectors,
		Timer parentTimer) throws SearchLibException {
	final Timer lookupTimer = new Timer(parentTimer, "indexesOf");
	final int[] termIndexes = termVector.indexesOf(terms, 0, terms.length);
	lookupTimer.end(null);
	for (int termPos = 0; termPos < termIndexes.length; termPos++) {
		final int termId = termIndexes[termPos];
		final Timer termTimer = new Timer(parentTimer, "term "
				+ terms[termPos]);
		// A termId of -1 means the term does not occur in this document.
		if (termId != -1) {
			Timer phase = new Timer(termTimer, "getOffsets");
			final TermVectorOffsetInfo[] offsets = termVector
					.getOffsets(termId);
			phase.end(null);
			phase = new Timer(termTimer, "getTermPositions");
			final int[] positions = termVector.getTermPositions(termId);
			phase.end(null);
			phase = new Timer(termTimer, "SnippetVector");
			// offsets and positions are parallel arrays for this term.
			for (int k = 0; k < offsets.length; k++)
				vectors.add(new SnippetVector(termPos, offsets[k],
						positions[k]));
			phase.end(null);
		}
		termTimer.end(null);
	}
}
/**
 * Extracts the snippet vectors for one document field and returns an
 * iterator over them, or {@code null} when there are no query terms or no
 * term-position vector could be obtained.
 *
 * The pipeline is: fetch/rebuild the term position vector, populate raw
 * snippet vectors, drop vectors included in others, mark the ones matching
 * the snippet queries, and finally drop the non-matching ones. Each stage
 * is timed under {@code parentTimer}.
 *
 * @param expiration deadline (ms) forwarded to the query check
 */
final static Iterator<SnippetVector> extractTermVectorIterator(
		final int docId, final ReaderInterface reader,
		final SnippetQueries snippetQueries, final String fieldName,
		List<FieldValueItem> values, CompiledAnalyzer analyzer,
		final Timer parentTimer, final long expiration) throws IOException,
		ParseException, SyntaxError, SearchLibException {
	if (ArrayUtils.isEmpty(snippetQueries.terms))
		return null;
	Timer timer = new Timer(parentTimer, "getTermPositionVector " + fieldName);
	final TermPositionVector termVector = getTermPositionVector(
			snippetQueries.terms, reader, docId, fieldName, values, analyzer,
			timer);
	timer.end(null);
	if (termVector == null)
		return null;
	Collection<SnippetVector> vectors = new ArrayList<SnippetVector>();
	timer = new Timer(parentTimer, "populate");
	populate(termVector, snippetQueries.terms, vectors, timer);
	timer.end(null);
	timer = new Timer(parentTimer, "removeIncludes");
	vectors = removeIncludes(vectors);
	timer.end(null);
	timer = new Timer(parentTimer, "checkQueries");
	snippetQueries.checkQueries(vectors, timer, expiration);
	timer.end(null);
	timer = new Timer(parentTimer, "removeNonQuery");
	vectors = removeNonQuery(vectors);
	timer.end(null);
	return vectors.iterator();
}
/**
 * Returns a {@link TermPositionVector} for the given document field.
 *
 * Prefers the vector stored in the index; when the index only has a plain
 * frequency vector (or none) and an analyzer is available, re-analyzes the
 * stored field values to rebuild positions and offsets. Returns
 * {@code null} when neither path is possible.
 */
private static final TermPositionVector getTermPositionVector(
		final String[] terms, final ReaderInterface readerInterface,
		final int docId, final String field, List<FieldValueItem> values,
		CompiledAnalyzer analyzer, Timer timer) throws IOException,
		SearchLibException, ParseException, SyntaxError {
	// Fast path: the index already stores positions/offsets for this field.
	// (instanceof is false for null, so the null check is implied.)
	final TermFreqVector freqVector = readerInterface.getTermFreqVector(
			docId, field);
	if (freqVector instanceof TermPositionVector)
		return (TermPositionVector) freqVector;
	// Without an analyzer we cannot rebuild positions from stored values.
	if (analyzer == null)
		return null;
	final SnippetTermPositionVector stpv = new SnippetTermPositionVector(
			field, terms);
	int positionOffset = 0;
	int characterOffset = 0;
	final List<TokenTerm> tokenTerms = new ArrayList<TokenTerm>();
	for (FieldValueItem fieldValueItem : values) {
		final String text = fieldValueItem.value;
		if (text == null)
			continue;
		analyzer.populate(text, tokenTerms);
		positionOffset = stpv.addCollection(tokenTerms, characterOffset,
				positionOffset);
		// +1 accounts for the separator between concatenated field values.
		characterOffset += text.length() + 1;
		tokenTerms.clear();
	}
	stpv.compile();
	return stpv;
}
/**
 * Converts a Lucene term vector into a list of {@link IndexTerm}.
 *
 * Dispatches to the position-aware builder when the vector also carries
 * positions/offsets, otherwise to the frequency-only builder.
 *
 * @return the built list, or {@code null} when the vector or its term
 *         array is absent
 */
public final static List<IndexTerm> toList(TermFreqVector termVector) {
	if (termVector == null)
		return null;
	final String[] terms = termVector.getTerms();
	if (terms == null)
		return null;
	final int[] frequencies = termVector.getTermFrequencies();
	final List<IndexTerm> result = new ArrayList<IndexTerm>(terms.length);
	if (termVector instanceof TermPositionVector)
		toListPosition((TermPositionVector) termVector, terms, frequencies,
				result);
	else
		toListFreq(termVector, terms, frequencies, result);
	return result;
}
/**
 * Appends one {@link IndexTerm} per term, carrying frequency, positions
 * and offsets taken from the same index in the parallel arrays/vector.
 */
private final static void toListPosition(TermPositionVector termVector,
		String[] terms, int[] frequencies, List<IndexTerm> indexTerms) {
	// terms, frequencies and the vector's per-index data are parallel.
	for (int idx = 0; idx < terms.length; idx++)
		indexTerms.add(new IndexTerm(terms[idx], frequencies[idx],
				termVector.getTermPositions(idx), termVector.getOffsets(idx)));
}
public static void main(String[] args) throws Exception { if (args.length != 0) { QUERY = args[0]; } // 将庖丁封装成符合Lucene要求的Analyzer规范 Analyzer analyzer = new PaodingAnalyzer(); //读取本类目录下的text.txt文件 String content = ContentReader.readText(English.class); //接下来是标准的Lucene建立索引和检索的代码 Directory ramDir = new RAMDirectory(); IndexWriter writer = new IndexWriter(ramDir, analyzer); Document doc = new Document(); Field fd = new Field(FIELD_NAME, content, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(fd); writer.addDocument(doc); writer.optimize(); writer.close(); IndexReader reader = IndexReader.open(ramDir); String queryString = QUERY; QueryParser parser = new QueryParser(FIELD_NAME, analyzer); Query query = parser.parse(queryString); Searcher searcher = new IndexSearcher(ramDir); query = query.rewrite(reader); System.out.println("Searching for: " + query.toString(FIELD_NAME)); Hits hits = searcher.search(query); BoldFormatter formatter = new BoldFormatter(); Highlighter highlighter = new Highlighter(formatter, new QueryScorer( query)); highlighter.setTextFragmenter(new SimpleFragmenter(50)); for (int i = 0; i < hits.length(); i++) { String text = hits.doc(i).get(FIELD_NAME); int maxNumFragmentsRequired = 5; String fragmentSeparator = "..."; TermPositionVector tpv = (TermPositionVector) reader .getTermFreqVector(hits.id(i), FIELD_NAME); TokenStream tokenStream = TokenSources.getTokenStream(tpv); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); System.out.println("\n" + result); } reader.close(); }
public static void main(String[] args) throws Exception { if (args.length != 0) { QUERY = args[0]; } // 将庖丁封装成符合Lucene要求的Analyzer规范 Analyzer analyzer = new PaodingAnalyzer(); //读取本类目录下的text.txt文件 String content = ContentReader.readText(Chinese.class); //接下来是标准的Lucene建立索引和检索的代码 Directory ramDir = new RAMDirectory(); IndexWriter writer = new IndexWriter(ramDir, analyzer); Document doc = new Document(); Field fd = new Field(FIELD_NAME, content, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(fd); writer.addDocument(doc); writer.optimize(); writer.close(); IndexReader reader = IndexReader.open(ramDir); String queryString = QUERY; QueryParser parser = new QueryParser(FIELD_NAME, analyzer); Query query = parser.parse(queryString); Searcher searcher = new IndexSearcher(ramDir); query = query.rewrite(reader); System.out.println("Searching for: " + query.toString(FIELD_NAME)); Hits hits = searcher.search(query); BoldFormatter formatter = new BoldFormatter(); Highlighter highlighter = new Highlighter(formatter, new QueryScorer( query)); highlighter.setTextFragmenter(new SimpleFragmenter(50)); for (int i = 0; i < hits.length(); i++) { String text = hits.doc(i).get(FIELD_NAME); int maxNumFragmentsRequired = 5; String fragmentSeparator = "..."; TermPositionVector tpv = (TermPositionVector) reader .getTermFreqVector(hits.id(i), FIELD_NAME); TokenStream tokenStream = TokenSources.getTokenStream(tpv); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); System.out.println("\n" + result); } reader.close(); }