/**
 * Opens the entity index and the document index and prepares one searcher for each.
 *
 * <p>Every index is opened exactly once: the reader stored in the reader field is the same
 * instance that backs the corresponding searcher.
 *
 * @param fuzzy stored in the {@code fuzzy} field
 * @param standardSeacher when {@code false}, the entity searcher is switched to
 *     {@link BM25Similarity}; when {@code true}, no similarity is set on it
 * @param withDescription stored in the {@code withDescription} field
 */
public CollectiveTestApproach(boolean fuzzy, boolean standardSeacher, boolean withDescription) {
    this.fuzzy = fuzzy;
    this.withDescription = withDescription;
    File entityIndexDir = new File(entIndexDirectory);
    File documentIndexDir = new File(docIndexDirectory);
    try {
        Directory entityDir = FSDirectory.open(entityIndexDir);
        Directory documentDir = FSDirectory.open(documentIndexDir);

        // Share a single reader between the field and the searcher instead of opening
        // each index twice.
        entIReader = DirectoryReader.open(entityDir);
        entISearcher = new IndexSearcher(entIReader);
        docIReader = DirectoryReader.open(documentDir);
        docISearcher = new IndexSearcher(docIReader);

        // Only the entity searcher is affected by the similarity switch.
        if (!standardSeacher) {
            entISearcher.setSimilarity(new BM25Similarity());
        }
    } catch (IOException e) {
        // Failure is only reported on stderr; fields not yet assigned stay unset.
        e.printStackTrace();
    }
}
/**
 * Creates a provider whose similarity is a {@link BM25Similarity} configured from settings.
 *
 * <p>Reads {@code k1} (default 1.2), {@code b} (default 0.75) and {@code discount_overlaps}
 * (default {@code true}). The boolean is read through the lenient pre-ES6 accessor, using the
 * index-created version taken from {@code indexSettings}.
 *
 * @param name provider name, passed to the superclass
 * @param settings similarity settings
 * @param indexSettings index settings used to determine the index-created version
 */
public BM25SimilarityProvider(String name, Settings settings, Settings indexSettings) {
    super(name);

    final float k1 = settings.getAsFloat("k1", 1.2f);
    final float b = settings.getAsFloat("b", 0.75f);

    final DeprecationLogger deprecationLogger =
            new DeprecationLogger(ESLoggerFactory.getLogger(getClass()));
    final boolean discountOverlaps = settings.getAsBooleanLenientForPreEs6Indices(
            Version.indexCreated(indexSettings),
            "discount_overlaps",
            true,
            deprecationLogger);

    final BM25Similarity bm25 = new BM25Similarity(k1, b);
    bm25.setDiscountOverlaps(discountOverlaps);
    this.similarity = bm25;
}
/**
 * Creates a provider whose similarity is a {@link BM25Similarity} configured from settings.
 *
 * <p>Reads {@code k1} (default 1.2), {@code b} (default 0.75) and {@code discount_overlaps}
 * (default {@code true}).
 *
 * @param name provider name, passed to the superclass
 * @param settings similarity settings
 */
@Inject
public BM25SimilarityProvider(@Assisted String name, @Assisted Settings settings) {
    super(name);

    final float k1 = settings.getAsFloat("k1", 1.2f);
    final float b = settings.getAsFloat("b", 0.75f);
    final boolean discountOverlaps = settings.getAsBoolean("discount_overlaps", true);

    final BM25Similarity bm25 = new BM25Similarity(k1, b);
    bm25.setDiscountOverlaps(discountOverlaps);
    this.similarity = bm25;
}
private void openReader() throws IOException { this.hitsReader = DirectoryReader.open(FSDirectory.open(this.hitsDirectory)); this.hitsSearcher = new IndexSearcher(this.hitsReader); this.hitsSearcher.setSimilarity(new BM25Similarity(0.0f, 0.0f)); // simple idf scoring //searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f)); // k1, b //searcher.setSimilarity(new LMDirichletSimilarity(200f)); // mu //searcher.setSimilarity(new LMJelinekMercerSimilarity(0.5f)); // lambda }
@Before public void setupIndex() throws IOException { dirUnderTest = newDirectory(); List<Similarity> sims = Arrays.asList( new ClassicSimilarity(), new SweetSpotSimilarity(), // extends Classic new BM25Similarity(), new LMDirichletSimilarity(), new BooleanSimilarity(), new LMJelinekMercerSimilarity(0.2F), new AxiomaticF3LOG(0.5F, 10), new DFISimilarity(new IndependenceChiSquared()), new DFRSimilarity(new BasicModelBE(), new AfterEffectB(), new NormalizationH1()), new IBSimilarity(new DistributionLL(), new LambdaDF(), new NormalizationH3()) ); similarity = sims.get(random().nextInt(sims.size())); indexWriterUnderTest = new RandomIndexWriter(random(), dirUnderTest, newIndexWriterConfig().setSimilarity(similarity)); for (int i = 0; i < docs.length; i++) { Document doc = new Document(); doc.add(newStringField("id", "" + i, Field.Store.YES)); doc.add(newField("field", docs[i], Store.YES)); indexWriterUnderTest.addDocument(doc); } indexWriterUnderTest.commit(); indexWriterUnderTest.forceMerge(1); indexWriterUnderTest.flush(); indexReaderUnderTest = indexWriterUnderTest.getReader(); searcherUnderTest = newSearcher(indexReaderUnderTest); searcherUnderTest.setSimilarity(similarity); }
/**
 * Opens an existing index for appending and prepares BM25 scoring for re-ranking.
 *
 * @param indexLocation file-system path of the index; the writer is opened in APPEND mode
 * @throws IOException if the directory or the writer cannot be opened
 */
public LuceneReRank(String indexLocation) throws IOException {
    dir = FSDirectory.open(new File(indexLocation));

    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER.getVersion(), ANALYZER);
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    writer = new IndexWriter(dir, config);

    // NOTE: Leo mentioned that lucene's bm25 calculation could be not accurate
    similarity = new BM25Similarity(1.0f, 0.75f); // k1, b

    fieldToLoad.add(RE_RANK_OFFSET);
}
/** BM25 with explicit parameters: checks the class and the configured k1 and b. */
public void testParameters() throws Exception {
    final float tolerance = 0.01f;

    final Similarity configured = getSimilarity("text_params");
    assertEquals(BM25Similarity.class, configured.getClass());

    final BM25Similarity bm25 = (BM25Similarity) configured;
    assertEquals(1.2f, bm25.getK1(), tolerance);
    assertEquals(0.76f, bm25.getB(), tolerance);
}
@JsonIgnore public Similarity getScoringAlgorithmObject() { if(this.scoringAlgorithm == null || this.scoringAlgorithm.isEmpty() || this.scoringAlgorithm.equals(DEFAULT_SCORING_ALGORITHM) || this.scoringAlgorithm.equalsIgnoreCase("tfidf") || this.scoringAlgorithm.equalsIgnoreCase("vectorspace")) { // vector-space model return new DefaultSimilarity(); } else if(this.scoringAlgorithm.equalsIgnoreCase("bm25")) { // bm25 probability model return new BM25Similarity(); } return new DefaultSimilarity(); }
/**
 * Feature 5: cos(BM25) * sim(t_d, q)
 *
 * <p>Builds a fuzzy query for the object's text against the {@code title} field, scored with
 * a fresh {@link BM25Similarity}.
 *
 * @param dataObject entity whose text is used as the query term
 * @return the fuzzy query on the {@code title} field
 */
private Query createFeature5(EntityObject dataObject) {
    final Term titleTerm = new Term("title", dataObject.getText());
    return new LearnToRankFuzzyQuery(titleTerm, new BM25Similarity());
}
/**
 * Feature 6: cos(BM25) * sim(a_d, q)
 *
 * <p>Builds a fuzzy query for the object's text against the {@code description} field, scored
 * with a fresh {@link BM25Similarity}.
 *
 * @param dataObject entity whose text is used as the query term
 * @return the fuzzy query on the {@code description} field
 */
private Query createFeature6(EntityObject dataObject) {
    final Term descriptionTerm = new Term("description", dataObject.getText());
    return new LearnToRankFuzzyQuery(descriptionTerm, new BM25Similarity());
}
/** * Feature 7: cos(BM25) * sim(t_d, q_c) * * @param dataObject * @return */ private Query createFeature7(EntityObject dataObject) { String sentence = dataObject.getContext(); String[] split = sentence.split(" "); LTRBooleanQuery bq = new LTRBooleanQuery(); BM25Similarity bm25 = new BM25Similarity(); for (int i = 0; i < split.length; i++) { // LearnToRankFuzzyQuery fq = new LearnToRankFuzzyQuery(new Term( // "title", split[i]), bm25); LearnToRankTermQuery fq = new LearnToRankTermQuery(new Term( "title", split[i]), bm25); bq.add(fq, Occur.SHOULD); } return bq; }
/** * Feature 8: cos(BM25) * sim(a_d, q_c) * * @param dataObject * @return */ private Query createFeature8(EntityObject dataObject) { String sentence = dataObject.getContext(); String[] split = sentence.split(" "); LTRBooleanQuery bq = new LTRBooleanQuery(); BM25Similarity bm25 = new BM25Similarity(); for (int i = 0; i < split.length; i++) { // LearnToRankFuzzyQuery fq = new LearnToRankFuzzyQuery(new Term( // "description", usePorterStemmer(split[i])), bm25); LearnToRankTermQuery fq = new LearnToRankTermQuery(new Term( "description", split[i]), bm25); bq.add(fq, Occur.SHOULD); } return bq; }
/**
 * Feature 6: cos(BM25) * sim(a_d, q)
 *
 * <p>Builds a fuzzy query for the object's text against the {@code abstract} field, scored
 * with a fresh {@link BM25Similarity}.
 *
 * @param dataObject entity whose text is used as the query term
 * @return the fuzzy query on the {@code abstract} field
 */
private Query createFeature6(EntityObject dataObject) {
    final Term abstractTerm = new Term("abstract", dataObject.getText());
    return new LearnToRankFuzzyQuery(abstractTerm, new BM25Similarity());
}
/**
 * Feature 7: cos(BM25) * sim(t_d, q_c)
 *
 * <p>Splits the object's context on single spaces and adds one optional (SHOULD) fuzzy query
 * per token against the {@code title} field. All clauses share one {@link BM25Similarity}.
 *
 * @param dataObject entity whose context sentence supplies the tokens
 * @return boolean query over the context tokens
 */
private Query createFeature7(EntityObject dataObject) {
    final String[] tokens = dataObject.getContext().split(" ");
    final LTRBooleanQuery contextQuery = new LTRBooleanQuery();
    final BM25Similarity bm25 = new BM25Similarity();
    for (String token : tokens) {
        contextQuery.add(new LearnToRankFuzzyQuery(new Term("title", token), bm25), Occur.SHOULD);
    }
    return contextQuery;
}
/**
 * Feature 8: cos(BM25) * sim(a_d, q_c)
 *
 * <p>Splits the object's context on single spaces and adds one optional (SHOULD) fuzzy query
 * per token against the {@code abstract} field. All clauses share one {@link BM25Similarity}.
 *
 * @param dataObject entity whose context sentence supplies the tokens
 * @return boolean query over the context tokens
 */
private Query createFeature8(EntityObject dataObject) {
    final String[] tokens = dataObject.getContext().split(" ");
    final LTRBooleanQuery contextQuery = new LTRBooleanQuery();
    final BM25Similarity bm25 = new BM25Similarity();
    for (String token : tokens) {
        contextQuery.add(
                new LearnToRankFuzzyQuery(new Term("abstract", token), bm25), Occur.SHOULD);
    }
    return contextQuery;
}
public Lucene(Path path) throws IOException { /* Setup Lucene */ Directory dir = FSDirectory.open(path); // here we are using a standard analyzer, there are a lot of analyzers available to our use. Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); //this mode by default overwrites the previous index, not a very good option in real usage iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); iwc.setSimilarity(new BM25Similarity()); index = new IndexWriter(dir, iwc); }
/**
 * Configures the baseline from a {@code .properties} file.
 *
 * <p>Properties read: {@code task}, {@code freqfile}, {@code tokenizer}, {@code result},
 * {@code normalization}, {@code index} and {@code similarity}. When {@code similarity} is
 * {@code LMDirichlet} the {@code mu} property is required; when it is {@code BM25} the
 * {@code k1} and {@code b} properties are required. Any other value, including a missing
 * property, selects {@link DefaultSimilarity}.
 *
 * @param propatiesPath path to the .properties configuration file
 * @throws IOException if the configuration file cannot be read or the index directory cannot
 *     be opened
 */
public SpokendocBaseline(String propatiesPath) throws IOException {
    Properties conf = new Properties();
    FileInputStream fis = new FileInputStream(new File(propatiesPath));
    try {
        conf.load(fis);
    } finally {
        // Close the stream even when loading fails; nothing below needs it.
        fis.close();
    }

    this.analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    this.task = conf.getProperty("task");
    this.freqfilePath = conf.getProperty("freqfile");
    this.tokenizerPath = conf.getProperty("tokenizer");
    this.resultPath = conf.getProperty("result");
    // Boolean.valueOf treats a missing property as false, like the deprecated constructor did.
    this.normalization = Boolean.valueOf(conf.getProperty("normalization"));

    // Keeps the index in memory; for testing:
    //this.directory = new RAMDirectory();
    // MMapDirectory: apparently reads go through memory while writes go to the file system.
    String indexPath = conf.getProperty("index");
    this.indexDirectory = MMapDirectory.open(new File(indexPath));

    // Constant-first comparison so a missing "similarity" property selects the default
    // similarity instead of throwing a NullPointerException.
    String selectedSimilarity = conf.getProperty("similarity");
    if ("LMDirichlet".equals(selectedSimilarity)) {
        float mu = Float.parseFloat(conf.getProperty("mu"));
        this.similarity = new LMDirichletSimilarity(mu);
    } else if ("BM25".equals(selectedSimilarity)) {
        float k1 = Float.parseFloat(conf.getProperty("k1"));
        float b = Float.parseFloat(conf.getProperty("b"));
        this.similarity = new BM25Similarity(k1, b);
    } else {
        this.similarity = new DefaultSimilarity();
    }
}
/**
 * Builds a per-field similarity wrapper for one query.
 *
 * <p>For each field, the similarity type comes from, in increasing priority: BM25 as the
 * fallback, the field's analyzer settings in the index configuration, and the per-field
 * override carried by the query.
 *
 * @param queryWithFilters query supplying the optional per-field similarity overrides
 * @return wrapper resolving a Lucene similarity per field name
 */
private PerFieldSimilarityWrapper getSimilarity(final QueryWithFilters queryWithFilters) {
    return new PerFieldSimilarityWrapper() {
        @Override
        public Similarity get(String name) {
            final AnalyzerSettings fieldSettings = indexConfig.getAnalyzerSettingsForIndexField(name);
            AnalyzerSettings.Similarity effective = (fieldSettings == null)
                    ? AnalyzerSettings.Similarity.BM25
                    : fieldSettings.getSimilarity();

            // A query-level override wins over the index configuration.
            final AnalyzerSettings.Similarity override = queryWithFilters.getFieldSimilarityOverride(name);
            if (override != null) {
                effective = override;
            }

            if (AnalyzerSettings.Similarity.TFIDF.equals(effective)) {
                return new ClassicSimilarity();
            }
            if (AnalyzerSettings.Similarity.BM25.equals(effective)) {
                return new BM25Similarity();
            }
            if (AnalyzerSettings.Similarity.CONSTANT.equals(effective)) {
                return new ConstantSimilarity();
            }
            if (AnalyzerSettings.Similarity.TF.equals(effective)) {
                return new TFSimilarity();
            }
            throw new RuntimeException("Unknown similarity type <" + effective + ">");
        }
    };
}
/**
 * Creates the similarity described by a JSON settings object.
 *
 * <p>Supported {@code type} values:
 * <ul>
 *   <li>{@code BM25Similarity}: optional numeric {@code k1} and {@code b}. Each parameter is
 *       honored independently; a missing one falls back to 1.2 ({@code k1}) or 0.75
 *       ({@code b}).</li>
 *   <li>{@code TermFrequencySimilarity}: no parameters.</li>
 * </ul>
 *
 * @param similarity JSON object with a string {@code type} member
 * @return the configured similarity, or {@code null} when the type is not recognized
 */
private static Similarity getSimilarity(JsonObject similarity) {
    switch (similarity.getString("type")) {
        case "BM25Similarity": {
            JsonNumber k1 = similarity.getJsonNumber("k1");
            JsonNumber b = similarity.getJsonNumber("b");
            if (k1 == null && b == null) {
                return new BM25Similarity();
            }
            // Previously a lone k1 or b was silently ignored; now the given value is used
            // and only the missing one takes its default.
            final float defaultK1 = 1.2f;
            final float defaultB = 0.75f;
            return new BM25Similarity(
                    k1 != null ? (float) k1.doubleValue() : defaultK1,
                    b != null ? (float) b.doubleValue() : defaultB);
        }
        case "TermFrequencySimilarity":
            return new TermFrequencySimilarity();
        default:
            return null;
    }
}
/** A BM25Similarity configured without parameters must use b = 0.75 and k1 = 1.2. */
@Test
public void testBM25Similarity() throws Exception {
    final LuceneSettings settings = new LuceneSettings();
    settings.updateSettings(new StringReader("{\"similarity\": {\"type\": \"BM25Similarity\"}}"));

    assertEquals(BM25Similarity.class, settings.similarity.getClass());
    final BM25Similarity bm25 = (BM25Similarity) settings.similarity;
    assertEquals(0.75f, bm25.getB(), 0);
    assertEquals(1.2f, bm25.getK1(), 0);
}
/** A BM25Similarity configured with explicit k1 and b must report exactly those values. */
@Test
public void testBM25SimilarityWithKAndB() throws Exception {
    final LuceneSettings settings = new LuceneSettings();
    settings.updateSettings(new StringReader(
            "{\"similarity\": {\"type\": \"BM25Similarity\", \"k1\": 1.0, \"b\": 0.5}}"));

    assertEquals(BM25Similarity.class, settings.similarity.getClass());
    final BM25Similarity bm25 = (BM25Similarity) settings.similarity;
    assertEquals(0.5f, bm25.getB(), 0);
    assertEquals(1.0f, bm25.getK1(), 0);
}
/**
 * Installs a randomly chosen similarity (BM25 or Classic) on the searcher.
 *
 * @param searcher searcher to configure
 * @return the same searcher instance, for chaining
 */
public IndexSearcher setSimilarity(IndexSearcher searcher) {
    if (random().nextBoolean()) {
        searcher.setSimilarity(new BM25Similarity());
    } else {
        searcher.setSimilarity(new ClassicSimilarity());
    }
    return searcher;
}
/** With empty settings and no custom similarities, the default similarity must be BM25. */
public void testDefaultSimilarity() {
    final IndexSettings indexSettings =
            IndexSettingsModule.newIndexSettings("test", Settings.builder().build());
    final SimilarityService similarityService =
            new SimilarityService(indexSettings, Collections.emptyMap());

    assertThat(similarityService.getDefaultSimilarity(), instanceOf(BM25Similarity.class));
}
/**
 * Checks the built-in similarity names: "classic" and "BM25" resolve to their Lucene
 * classes, while "default" resolves to nothing.
 */
public void testResolveDefaultSimilarities() {
    final SimilarityService service = createIndex("foo").similarityService();

    assertThat(service.getSimilarity("classic").get(), instanceOf(ClassicSimilarity.class));
    assertThat(service.getSimilarity("BM25").get(), instanceOf(BM25Similarity.class));
    // "default" is not a registered name, so the lookup itself yields null.
    assertThat(service.getSimilarity("default"), equalTo(null));
}
/**
 * Assembles the learn-to-rank query for one disambiguation request.
 *
 * <p>Nine feature clauses are added in a fixed order and then given fixed weights by
 * position:
 * <ol>
 *   <li>fuzzy match of the selected text on {@code Label} (fuzzy label similarity, MUST)</li>
 *   <li>selected text on {@code Description} (default similarity)</li>
 *   <li>context on {@code Label} (default similarity)</li>
 *   <li>context on {@code Description} (default similarity)</li>
 *   <li>fuzzy match of the selected text on {@code Label} (BM25)</li>
 *   <li>context on {@code Label} (BM25)</li>
 *   <li>context on {@code Description} (BM25)</li>
 *   <li>prior from the knowledge base feature definition</li>
 *   <li>sense prior for the selected text</li>
 * </ol>
 *
 * @param dpo request carrying the selected text and its context
 * @param kb knowledge base supplying the feature definition for the prior features
 * @return the weighted learn-to-rank query
 */
private Query createQuery(EntityDisambiguationDPO dpo, EntityCentricKnowledgeBase kb) {
    // Weights by feature position (Feature1 .. Feature9).
    final float[] featureWeights = {
        0.0524974f,
        0.01771f,
        0.0615202f,
        0.0933433f,
        0.0915161f,
        -0.0468604f,
        -0.0947746f,
        0.0423863f,
        0.465053f
    };

    final LearnToRankQuery ltrQuery = new LearnToRankQuery();
    final List<LearnToRankClause> clauses = new LinkedList<LearnToRankClause>();
    final FuzzyLabelSimilarity labelSimilarity = new FuzzyLabelSimilarity();
    final DefaultSimilarity tfIdf = new DefaultSimilarity();
    final BM25Similarity bm25 = new BM25Similarity();

    // Feature 1
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryStringFuzzy(dpo.getSelectedText(), "Label", labelSimilarity,
                    Occur.MUST, DisambiguationMainService.MAXCLAUSECOUNT),
            "Feature1", true));
    // Feature 2
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryStringTerm(dpo.getSelectedText(), "Description", tfIdf,
                    Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT),
            "Feature2", false));
    // Feature 3
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryStringTerm(dpo.getContext(), "Label", tfIdf,
                    Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT),
            "Feature3", false));
    // Feature 4
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryStringTerm(dpo.getContext(), "Description", tfIdf,
                    Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT),
            "Feature4", false));
    // Feature 5
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryLabelFuzzy(dpo.getSelectedText(), "Label", bm25),
            "Feature5", false));
    // Feature 6
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryStringTerm(dpo.getContext(), "Label", bm25,
                    Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT),
            "Feature6", false));
    // Feature 7
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryStringTerm(dpo.getContext(), "Description", bm25,
                    Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT),
            "Feature7", false));
    // Feature 8
    clauses.add(ltrQuery.add(
            LuceneFeatures.queryPrior(kb.getFeatureDefinition()),
            "Feature8", false));
    // Feature 9
    clauses.add(ltrQuery.add(
            LuceneFeatures.querySensePrior(dpo.getSelectedText(), kb.getFeatureDefinition()),
            "Feature9", false));

    for (int position = 0; position < featureWeights.length; position++) {
        clauses.get(position).setWeight(featureWeights[position]);
    }
    return ltrQuery;
}
/**
 * Creates a new BM25 similarity from this object's {@code k1}, {@code b} and
 * {@code discountOverlaps} values.
 *
 * @return a freshly configured {@link BM25Similarity}
 */
@Override
public Similarity getSimilarity() {
    final BM25Similarity configured = new BM25Similarity(k1, b);
    configured.setDiscountOverlaps(discountOverlaps);
    return configured;
}
/** BM25 with default parameters: the "text" field must resolve to BM25Similarity. */
public void test() throws Exception {
    final Class<?> resolvedClass = getSimilarity("text").getClass();
    assertEquals(BM25Similarity.class, resolvedClass);
}