我正在寻找一种在Lucene中执行查询自动完成/建议的方法。我已经在Google上搜索了一些,并玩了一些,但是我看到的所有示例似乎都在Solr中设置了过滤器。我们不使用Solr,也不打算在不久的将来使用Solr,而且Solr显然无论如何都只是围绕Lucene,所以我想一定有办法做到这一点!
我已经研究过使用EdgeNGramFilter,但我意识到我必须在索引字段上运行过滤器并取出令牌,然后将其与输入的Query进行比较…我只是在努力使两者之间建立连接这两个代码有点复杂,因此非常感谢帮助!
为了清楚我在寻找什么(我意识到我并不太清楚),我正在寻找一种解决方案,其中在搜索术语时会返回建议查询的列表。在搜索字段中输入“ inter”时,它将返回建议查询的列表,例如“ internet”,“ international”等。
基于@Alexandre Victoor的回答,我在contrib程序包中(并使用其中包含的LuceneDictionary)编写了一个基于Lucene Spellchecker的小类,它完全可以实现我想要的功能。
这允许从具有单个字段的单个源索引重新索引,并提供术语建议。结果将按照原始索引中带有该术语的匹配文档的数量进行排序,因此,较流行的术语会首先出现。似乎工作得很好:)
import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.ISOLatin1AccentFilter; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.spell.LuceneDictionary; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; /** * Search term auto-completer, works for single terms (so use on the last term * of the query). * <p> * Returns more popular terms first. * * @author Mat Mannion, M.Mannion@warwick.ac.uk */ public final class Autocompleter { private static final String GRAMMED_WORDS_FIELD = "words"; private static final String SOURCE_WORD_FIELD = "sourceWord"; private static final String COUNT_FIELD = "count"; private static final String[] ENGLISH_STOP_WORDS = { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "i", "if", "in", "into", "is", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; private final Directory autoCompleteDirectory; private IndexReader autoCompleteReader; private IndexSearcher autoCompleteSearcher; public Autocompleter(String autoCompleteDir) throws IOException { this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir, null); reOpenReader(); } public List<String> suggestTermsFor(String term) throws IOException { // get the top 5 terms for query Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, term)); Sort sort = new Sort(COUNT_FIELD, true); TopDocs docs = autoCompleteSearcher.search(query, null, 5, sort); List<String> suggestions = new ArrayList<String>(); for (ScoreDoc doc : docs.scoreDocs) { suggestions.add(autoCompleteReader.document(doc.doc).get( SOURCE_WORD_FIELD)); } return suggestions; } @SuppressWarnings("unchecked") public void reIndex(Directory sourceDirectory, String fieldToAutocomplete) throws CorruptIndexException, IOException { // build a dictionary (from the spell package) IndexReader sourceReader = IndexReader.open(sourceDirectory); LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete); // code from // org.apache.lucene.search.spell.SpellChecker.indexDictionary( // Dictionary) IndexReader.unlock(autoCompleteDirectory); // use a custom analyzer so we can do EdgeNGramFiltering IndexWriter writer = new IndexWriter(autoCompleteDirectory, new Analyzer() { public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); result = new ISOLatin1AccentFilter(result); result = new StopFilter(result, ENGLISH_STOP_WORDS); result = new EdgeNGramTokenFilter( result, Side.FRONT,1, 20); return result; } }, true); writer.setMergeFactor(300); writer.setMaxBufferedDocs(150); // go through every word, storing the original word (incl. n-grams) // and the number of times it occurs Map<String, Integer> wordsMap = new HashMap<String, Integer>(); Iterator<String> iter = (Iterator<String>) dict.getWordsIterator(); while (iter.hasNext()) { String word = iter.next(); int len = word.length(); if (len < 3) { continue; // too short we bail but "too long" is fine... } if (wordsMap.containsKey(word)) { throw new IllegalStateException( "This should never happen in Lucene 2.3.2"); // wordsMap.put(word, wordsMap.get(word) + 1); } else { // use the number of documents this word appears in wordsMap.put(word, sourceReader.docFreq(new Term( fieldToAutocomplete, word))); } } for (String word : wordsMap.keySet()) { // ok index the word Document doc = new Document(); doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.TOKENIZED)); // grammed doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO, Field.Index.UN_TOKENIZED)); // count writer.addDocument(doc); } sourceReader.close(); // close writer writer.optimize(); writer.close(); // re-open our reader reOpenReader(); } private void reOpenReader() throws CorruptIndexException, IOException { if (autoCompleteReader == null) { autoCompleteReader = IndexReader.open(autoCompleteDirectory); } else { autoCompleteReader.reopen(); } autoCompleteSearcher = new IndexSearcher(autoCompleteReader); } public static void main(String[] args) throws Exception { Autocompleter autocomplete = new Autocompleter("/index/autocomplete"); // run this to re-index from the current index, shouldn't need to do // this very often // autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null), // "content"); String term = "steve"; System.out.println(autocomplete.suggestTermsFor(term)); // prints [steve, steven, stevens, stevenson, stevenage] } }