private static Map<String, FacetCounter> computeMultivaluedTFV(ReaderAbstract reader, String fieldName,
        DocIdInterface docIdInterface) throws IOException, SearchLibException {
    final Map<String, FacetCounter> termMap = new LinkedHashMap<>();
    if (docIdInterface.getSize() == 0)
        return termMap;
    for (int docId : docIdInterface.getIds()) {
        // Documents indexed without a term vector for this field yield null.
        final TermFreqVector tfv = reader.getTermFreqVector(docId, fieldName);
        if (tfv == null)
            continue;
        final String[] terms = tfv.getTerms();
        final int[] freqs = tfv.getTermFrequencies();
        if (terms == null || freqs == null)
            continue;
        int i = 0;
        for (String term : terms) {
            if (freqs[i++] > 0) {
                // Create a counter on first sight of a term, increment afterwards.
                final FacetCounter facetItem = termMap.get(term);
                if (facetItem == null)
                    termMap.put(term, new FacetCounter(1));
                else
                    facetItem.increment();
            }
        }
    }
    return termMap;
}
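A minimal call-site sketch for the method above, assuming a `ReaderAbstract` and a `DocIdInterface` are already in hand; `FacetCounter`'s `toString` is used so no accessor name has to be assumed:

// Hypothetical usage sketch: facet the "category" field over a set of matching docs.
Map<String, FacetCounter> facets = computeMultivaluedTFV(reader, "category", docIdInterface);
for (Map.Entry<String, FacetCounter> entry : facets.entrySet())
    System.out.println(entry.getKey() + " -> " + entry.getValue());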
public Set<FieldValue> getTermsVectorFields(int docId, Set<String> fieldNameSet) throws IOException {
    Set<FieldValue> fieldValueList = new HashSet<FieldValue>();
    for (String fieldName : fieldNameSet) {
        TermFreqVector termFreqVector = indexReader.getTermFreqVector(docId, fieldName);
        if (termFreqVector == null)
            continue;
        String[] terms = termFreqVector.getTerms();
        if (terms == null)
            continue;
        FieldValueItem[] fieldValueItem = new FieldValueItem[terms.length];
        int i = 0;
        for (String term : terms)
            fieldValueItem[i++] = new FieldValueItem(FieldValueOriginEnum.TERM_VECTOR, term);
        fieldValueList.add(new FieldValue(fieldName, fieldValueItem));
    }
    return fieldValueList;
}
@Override
public void populate(List<IndexDocumentResult> indexDocuments) throws IOException, SearchLibException {
    SchemaFieldList schemaFieldList = request.getConfig().getSchema().getFieldList();
    for (int docId : docArray) {
        IndexDocumentResult indexDocument = new IndexDocumentResult(schemaFieldList.size());
        Map<String, FieldValue> storedFieldMap = reader.getDocumentStoredField(docId);
        for (SchemaField schemaField : schemaFieldList) {
            String fieldName = schemaField.getName();
            List<IndexTerm> indexTermList = null;
            if (schemaField.checkIndexed(Indexed.YES)) {
                if (schemaField.getTermVector() == TermVector.NO) {
                    // No stored term vector: re-enumerate the field's terms from the inverted index.
                    indexTermList = IndexTerm.toList(reader, fieldName, docId);
                } else {
                    // Term vector available: read terms (with positions/offsets if stored) directly.
                    TermFreqVector termFreqVector = reader.getTermFreqVector(docId, fieldName);
                    indexTermList = IndexTerm.toList(termFreqVector);
                }
            }
            IndexField indexField = new IndexField(fieldName, storedFieldMap.get(fieldName), indexTermList);
            indexDocument.add(indexField);
        }
        indexDocuments.add(indexDocument);
    }
}
private void buildCategoryVectors() throws IOException {
    // Use the Lucene 3.x reader API: the isDeleted() and getTermFreqVector()
    // calls below do not exist on the 4.x DirectoryReader.
    IndexReader reader = IndexReader.open(TestUtil.getBookIndexDirectory());
    int maxDoc = reader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
        if (!reader.isDeleted(i)) {
            Document doc = reader.document(i);
            String category = doc.get("category");
            Map vectorMap = (Map) categoryMap.get(category);
            if (vectorMap == null) {
                vectorMap = new TreeMap();
                categoryMap.put(category, vectorMap);
            }
            TermFreqVector termFreqVector = reader.getTermFreqVector(i, "subject");
            addTermFreqToMap(vectorMap, termFreqVector);
        }
    }
    reader.close();
}
private void addTermFreqToMap(Map vectorMap, TermFreqVector termFreqVector) {
    // Documents indexed without a "subject" term vector come back as null.
    if (termFreqVector == null)
        return;
    String[] terms = termFreqVector.getTerms();
    int[] freqs = termFreqVector.getTermFrequencies();
    for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        if (vectorMap.containsKey(term)) {
            Integer value = (Integer) vectorMap.get(term);
            vectorMap.put(term, Integer.valueOf(value.intValue() + freqs[i]));
        } else {
            vectorMap.put(term, Integer.valueOf(freqs[i]));
        }
    }
}
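Since addTermFreqToMap predates generics-friendly style, here is a hedged sketch of the same accumulation with a typed map, assuming the surrounding code is free to move off raw types; Map.merge collapses the contains/get/put dance into one call:

// Equivalent tally with generics; merge() inserts freqs[i] on first sight of a
// term and otherwise adds it to the running total.
private void addTermFreqToMap(Map<String, Integer> vectorMap, TermFreqVector termFreqVector) {
    if (termFreqVector == null)
        return;
    String[] terms = termFreqVector.getTerms();
    int[] freqs = termFreqVector.getTermFrequencies();
    for (int i = 0; i < terms.length; i++)
        vectorMap.merge(terms[i], freqs[i], Integer::sum);
}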
public void putTermFreqVectors(final int[] docIds, final String field,
        final Collection<TermFreqVector> termFreqVectors) throws IOException {
    if (termFreqVectors == null || docIds == null || docIds.length == 0)
        return;
    for (int docId : docIds)
        termFreqVectors.add(indexReader.getTermFreqVector(docId, field));
}
@Override
public void putTermVectors(int[] docIds, String field, Collection<String[]> termVectors) throws IOException {
    if (docIds == null || docIds.length == 0 || field == null || termVectors == null)
        return;
    List<TermFreqVector> termFreqVectors = new ArrayList<TermFreqVector>(docIds.length);
    putTermFreqVectors(docIds, field, termFreqVectors);
    for (TermFreqVector termFreqVector : termFreqVectors)
        if (termFreqVector != null) // documents without a vector come back as null
            termVectors.add(termFreqVector.getTerms());
}
@Override
final public TermFreqVector getTermFreqVector(final int docId, final String field)
        throws IOException, SearchLibException {
    checkOnline(true);
    ReaderLocal reader = acquire();
    try {
        return reader.getTermFreqVector(docId, field);
    } finally {
        release(reader);
    }
}
private static final TermPositionVector getTermPositionVector(final String[] terms,
        final ReaderInterface readerInterface, final int docId, final String field,
        List<FieldValueItem> values, CompiledAnalyzer analyzer, Timer timer)
        throws IOException, SearchLibException, ParseException, SyntaxError {
    // Prefer a position-aware vector stored in the index (instanceof is null-safe).
    TermFreqVector termFreqVector = readerInterface.getTermFreqVector(docId, field);
    if (termFreqVector instanceof TermPositionVector)
        return (TermPositionVector) termFreqVector;
    if (analyzer == null)
        return null;
    // Otherwise rebuild positions and offsets by re-analyzing the stored field values.
    SnippetTermPositionVector stpv = new SnippetTermPositionVector(field, terms);
    int positionOffset = 0;
    int characterOffset = 0;
    List<TokenTerm> tokenTerms = new ArrayList<TokenTerm>();
    for (FieldValueItem fieldValueItem : values) {
        if (fieldValueItem.value == null)
            continue;
        analyzer.populate(fieldValueItem.value, tokenTerms);
        positionOffset = stpv.addCollection(tokenTerms, characterOffset, positionOffset);
        characterOffset += fieldValueItem.value.length() + 1;
        tokenTerms.clear();
    }
    stpv.compile();
    return stpv;
}
public final static List<IndexTerm> toList(TermFreqVector termVector) {
    if (termVector == null)
        return null;
    String[] terms = termVector.getTerms();
    if (terms == null)
        return null;
    int[] frequencies = termVector.getTermFrequencies();
    List<IndexTerm> indexTerms = new ArrayList<IndexTerm>(terms.length);
    if (termVector instanceof TermPositionVector)
        toListPosition((TermPositionVector) termVector, terms, frequencies, indexTerms);
    else
        toListFreq(termVector, terms, frequencies, indexTerms);
    return indexTerms;
}
private final static void toListFreq(TermFreqVector termVector, String[] terms, int[] frequencies,
        List<IndexTerm> indexTerms) {
    for (int i = 0; i < terms.length; i++)
        indexTerms.add(new IndexTerm(terms[i], frequencies[i], null, null));
}
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

    // Index each sentence as its own document, storing a term vector for "contents".
    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
        Document doc = new Document();
        doc.add(new Field("contents", new StringReader(s), TermVector.YES));
        writer.addDocument(doc);
    }
    writer.close();

    // Assign every distinct "contents" term a fixed dimension in the vector space.
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));
    int pos = 0;
    while (termEnum.next()) {
        Term term = termEnum.term();
        if (!"contents".equals(term.field()))
            break;
        terms.put(term.text(), pos++);
    }

    // Build a normalized tf-idf vector per document.
    DocVector[] docs = new DocVector[fileSentences.size()];
    for (int i = 0; i < fileSentences.size(); i++) {
        docs[i] = new DocVector(terms);
        TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
        if (tfvs == null)
            continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                double idfValue = getIDF(RAMreader, termTexts[j]);
                docs[i].setEntry(termTexts[j], termFreqs[j] * idfValue);
            }
        }
        docs[i].normalize();
    }
    RAMreader.close();
    ramDir.close();
    return docs;
}
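A hedged usage sketch for getCosineSimilarityMatrix: vectorize a few sentences and compare two of them, assuming a cosine helper with the shape of the getCosineSimilarity(DocVector, DocVector) used in the run() snippet further down:

// Hypothetical driver: build vectors for three sentences and compare the first two.
List<String> sentences = Arrays.asList(
        "the quick brown fox", "a quick brown dog", "lorem ipsum dolor sit amet");
DocVector[] vectors = getCosineSimilarityMatrix(sentences);
System.out.println("cosine(0,1) = " + getCosineSimilarity(vectors[0], vectors[1]));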
public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
    return wrapped.getTermFreqVector(docNumber, field);
}

public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
    return wrapped.getTermFreqVectors(docNumber);
}

@Override
public TermFreqVector getTermFreqVector(final int docId, final String field) throws IOException {
    return indexReader.getTermFreqVector(docId, field);
}
public double run(String doc1, String doc2) throws IOException {
    // Index the two strings (s is a two-element String[] field on the enclosing class).
    s[0] = doc1;
    s[1] = doc2;
    Directory index = new RAMDirectory();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    IndexWriter writer = new IndexWriter(index, config);
    for (String si : s) {
        Document doc = new Document();
        doc.add(new Field("content", si, Field.Store.YES, Field.Index.ANALYZED,
                TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
    writer.close();

    // Read the index back and map every "content" term to a vector dimension.
    IndexReader reader = IndexReader.open(index);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = reader.terms(new Term("content"));
    int pos = 0;
    while (termEnum.next()) {
        Term term = termEnum.term();
        if (!"content".equals(term.field()))
            break;
        terms.put(term.text(), pos++);
    }

    // Build a normalized term-frequency vector per document.
    DocVector[] docs = new DocVector[s.length];
    for (int i = 0; i < s.length; i++) {
        TermFreqVector[] tfvs = reader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++)
                docs[i].setEntry(termTexts[j], termFreqs[j]);
        }
        docs[i].normalize();
    }

    // Cosine similarity between the two documents.
    double cosim01 = getCosineSimilarity(docs[0], docs[1]);
    reader.close();
    return cosim01;
}
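A minimal call-site sketch for run(), assuming the method lives on a class that declares the String[] s field it populates; the class name here is hypothetical:

// Hypothetical driver: score two short documents against each other.
CosineSimilarityRunner runner = new CosineSimilarityRunner(); // class name is an assumption
double score = runner.run("the quick brown fox", "a quick brown dog");
System.out.println("similarity = " + score);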
TermFreqVector getTermFreqVector(final int docId, final String field) throws IOException, SearchLibException;