private void createIndexWriter(List<URLContentObject> urlContentList, Directory ramDirectory) throws IOException {
    // Load English stop words and build an analyzer that uses them.
    Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File(baseDir + "data/stopwords/stopwords_en.txt")));
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36, stopWords);
    IndexWriter indexWriter = new IndexWriter(ramDirectory, new IndexWriterConfig(Version.LUCENE_36, analyzer));

    // Index each URL's content with term vectors so term frequencies can be read back later.
    for (URLContentObject d : urlContentList) {
        Document document = new Document();
        StringReader reader = new StringReader(d.getContent());
        document.add(new Field("id", d.getId(), Field.Store.YES, Field.Index.ANALYZED, TermVector.YES));
        document.add(new Field("contents", reader, TermVector.YES));
        //document.add(new Field("id", Integer.toString(d.getAutoIncrementId()), Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(document);
        reader.close();
    }
    indexWriter.commit();
    indexWriter.close();
}
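// Hedged sketch (not part of the original source): one way to read the term vectors back
// from the directory populated by createIndexWriter above, assuming the same Lucene 3.6
// API level. Only the "contents" field was indexed with TermVector.YES, so only raw term
// frequencies (no positions or offsets) are available for it.
private void dumpTermVectors(Directory ramDirectory) throws IOException {
    IndexReader reader = IndexReader.open(ramDirectory);
    try {
        for (int docId = 0; docId < reader.maxDoc(); docId++) {
            TermFreqVector tfv = reader.getTermFreqVector(docId, "contents");
            if (tfv == null) continue;
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                System.out.println(docId + "\t" + termTexts[j] + "\t" + termFreqs[j]);
            }
        }
    } finally {
        reader.close();
    }
}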
private Document createDocument(Term term, String value, boolean synonym) {
    Document doc = new Document();

    // Ontology name: stored only, never searched, so norms and term freq/positions are omitted.
    Field ontologyField = new Field(FIELD_ONTOLOGY, term.getOntology().getName(), Field.Store.YES, Field.Index.NO, TermVector.NO);
    ontologyField.setOmitNorms(true);
    ontologyField.setOmitTermFreqAndPositions(true);
    doc.add(ontologyField);

    // Reference id: indexed as a single token for exact lookups.
    Field idField = new Field(FIELD_ID, term.getReferenceId(), Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.NO);
    idField.setOmitNorms(true);
    idField.setOmitTermFreqAndPositions(true);
    doc.add(idField);

    // Term label: analyzed, with positions and offsets kept for phrase matching and highlighting.
    Field nameField = new Field(FIELD_TERM, value, Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
    //nameField.setOmitNorms(true);
    doc.add(nameField);

    // Synonym flag stored as a single binary byte.
    doc.add(new Field(FIELD_SYNONYM, synonym ? new byte[] {1} : new byte[] {0}, Field.Store.YES));
    return doc;
}
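// Hedged sketch (not from the original source): the stored binary synonym flag written by
// createDocument above could be read back from a retrieved document roughly like this,
// assuming the same FIELD_SYNONYM constant.
private static boolean isSynonym(Document doc) {
    byte[] flag = doc.getBinaryValue(FIELD_SYNONYM);
    return flag != null && flag.length > 0 && flag[0] == 1;
}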
/**
 * Creates a field definition that is not backed by the data dictionary.
 */
public NonDictionaryField(String name, Store store, Index index, TermVector termVector, boolean multiValued) {
    this.name = name;
    this.store = store;
    this.index = index;
    this.termVector = termVector;
    this.multiValued = multiValued;
}
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    // Analyzer that drops stop words read from the project's English stop list.
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

    // Index every sentence as its own document, keeping term vectors for later retrieval.
    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
        Document doc1 = new Document();
        StringReader d1reader = new StringReader(s);
        doc1.add(new Field("contents", d1reader, TermVector.YES));
        writer.addDocument(doc1);
    }
    writer.close();

    DocVector[] docs = new DocVector[fileSentences.size()];

    // Build a global term -> position map over the "contents" field.
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));
    int pos = 0;
    while (termEnum.next()) {
        Term term = termEnum.term();
        if (!"contents".equals(term.field())) break;
        terms.put(term.text(), pos++);
    }

    // Build a TF-IDF vector for each document and normalize it to unit length.
    for (int i = 0; i < fileSentences.size(); i++) {
        TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        if (tfvs == null) continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                double idfValue = getIDF(RAMreader, termTexts[j]);
                double tfIdfValue = termFreqs[j] * idfValue;
                docs[i].setEntry(termTexts[j], tfIdfValue);
            }
        }
        docs[i].normalize();
    }
    RAMreader.close();
    ramDir.close();
    return docs;
}
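// Hedged sketch (assumption, not part of the original listing): the DocVector helper and the
// getIDF(...) method that getCosineSimilarityMatrix relies on are not shown above. A minimal
// version consistent with how they are called could look like this.
static class DocVector {
    private final Map<String, Integer> termIndex; // term -> position in the vector
    final double[] values;

    DocVector(Map<String, Integer> termIndex) {
        this.termIndex = termIndex;
        this.values = new double[termIndex.size()];
    }

    void setEntry(String term, double value) {
        Integer idx = termIndex.get(term);
        if (idx != null) values[idx] = value;
    }

    void normalize() {
        double norm = 0.0;
        for (double v : values) norm += v * v;
        norm = Math.sqrt(norm);
        if (norm == 0.0) return;
        for (int i = 0; i < values.length; i++) values[i] /= norm;
    }
}

// Smoothed inverse document frequency for a term of the "contents" field (illustrative formula).
private static double getIDF(IndexReader reader, String termText) throws IOException {
    int docFreq = reader.docFreq(new Term("contents", termText));
    return 1.0 + Math.log((double) reader.numDocs() / (docFreq + 1.0));
}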
private static void addNonDictionaryField(String name, Store store, Index index, TermVector termVector, boolean multiValued) {
    nonDictionaryFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
private static void addAdditionalContentField(String name, Store store, Index index, TermVector termVector, boolean multiValued) {
    additionalContentFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
private static void addAdditionalTextField(String name, Store store, Index index, TermVector termVector, boolean multiValued) {
    additionalTextFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
private static void addAdditionalMlTextField(String name, Store store, Index index, TermVector termVector, boolean multiValued) {
    additionalMlTextFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
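// Hedged usage sketch (assumption): the registration helpers above would typically be called
// from a static initializer; the field names and settings below are illustrative only.
static {
    addNonDictionaryField("ID", Store.YES, Index.NOT_ANALYZED, TermVector.NO, false);
    addAdditionalContentField("TEXT", Store.NO, Index.ANALYZED, TermVector.YES, true);
}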
/**
 * @param field the schema field being indexed
 * @return the term vector setting to use for this field
 */
public TermVector getFieldTermVec(SchemaField field) {
    return TermVector.NO;
}
@Override
protected TermVector getFieldTermVec(SchemaField field, String internalVal) {
    return AlfrescoSolrDataModel.getInstance(id).getFieldTermVec(field);
}
public double run(String doc1, String doc2) throws IOException {
    // Index the two input strings, each as its own document with full term vectors.
    s[0] = doc1;
    s[1] = doc2;
    Directory index = new RAMDirectory();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    IndexWriter writer = new IndexWriter(index, config);
    for (String si : s) {
        Document doc = new Document();
        doc.add(new Field("content", si, Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
    writer.close();

    // Read the index back and build a global term -> position map for the "content" field.
    IndexReader reader = IndexReader.open(index);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = reader.terms(new Term("content"));
    int pos = 0;
    while (termEnum.next()) {
        Term term = termEnum.term();
        if (!"content".equals(term.field())) break;
        terms.put(term.text(), pos++);
    }

    // Turn each document's term frequency vector into a DocVector and normalize it.
    DocVector[] docs = new DocVector[s.length];
    for (int i = 0; i < s.length; i++) {
        TermFreqVector[] tfvs = reader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                docs[i].setEntry(termTexts[j], termFreqs[j]);
            }
        }
        docs[i].normalize();
    }

    // Cosine similarity between the two documents.
    double cosim01 = getCosineSimilarity(docs[0], docs[1]);
    reader.close();
    return cosim01;
}
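// Hedged sketch (assumption): the getCosineSimilarity helper called in run() above is not
// shown in the original. With unit-length vectors like the DocVector sketch further up, the
// cosine similarity reduces to a plain dot product.
private static double getCosineSimilarity(DocVector d1, DocVector d2) {
    double dot = 0.0;
    int len = Math.min(d1.values.length, d2.values.length);
    for (int i = 0; i < len; i++) {
        dot += d1.values[i] * d2.values[i];
    }
    return dot;
}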