Example source code for the Java class org.apache.lucene.document.Field.TermVector

Project: WikiKreator    File: SummarizeWebData.java
private void createIndexWriter(List<URLContentObject> urlContentList,
        Directory ramDirectory) throws IOException
{
    Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File(baseDir +
            "data/stopwords/stopwords_en.txt")));
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36, stopWords);

    IndexWriter indexWriter = new IndexWriter(ramDirectory,
            new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (URLContentObject d : urlContentList)
    {
        Document document = new Document();
        StringReader reader = new StringReader(d.getContent());
        document.add(new Field("id", d.getId(), Field.Store.YES, Field.Index.ANALYZED, TermVector.YES));
        // Reader-valued fields are tokenized but never stored; the term vector is kept
        document.add(new Field("contents", reader, TermVector.YES));
        indexWriter.addDocument(document);
        reader.close();
    }

    indexWriter.commit();
    indexWriter.close();
}
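
Term vectors written with TermVector.YES can be read back per document. A minimal sketch, assuming the same Lucene 3.6 API and the "contents" field from the method above:

static void dumpTermVectors(Directory ramDirectory) throws IOException {
    IndexReader reader = IndexReader.open(ramDirectory);
    for (int docId = 0; docId < reader.maxDoc(); docId++) {
        // getTermFreqVector returns null when the document has no vector for this field
        TermFreqVector tfv = reader.getTermFreqVector(docId, "contents");
        if (tfv == null)
            continue;
        String[] terms = tfv.getTerms();        // term texts, sorted
        int[] freqs = tfv.getTermFrequencies(); // parallel frequency array
        for (int i = 0; i < terms.length; i++)
            System.out.println(terms[i] + " -> " + freqs[i]);
    }
    reader.close();
}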
Project: ontobrowser    File: OntologySearchServiceImpl.java
private Document createDocument(Term term, String value, boolean synonym) {
    Document doc = new Document();

    // Ontology name: stored for display only, never searched, so norms and
    // term frequencies/positions are dropped entirely
    Field ontologyField = new Field(FIELD_ONTOLOGY,
            term.getOntology().getName(),
            Field.Store.YES,
            Field.Index.NO,
            TermVector.NO);
    ontologyField.setOmitNorms(true);
    ontologyField.setOmitTermFreqAndPositions(true);
    doc.add(ontologyField);

    // Reference id: indexed as a single token for exact-match lookup
    Field idField = new Field(FIELD_ID,
            term.getReferenceId(),
            Field.Store.YES,
            Field.Index.NOT_ANALYZED,
            TermVector.NO);
    idField.setOmitNorms(true);
    idField.setOmitTermFreqAndPositions(true);
    doc.add(idField);

    // Term text: fully analyzed; positions and offsets are kept in the
    // term vector so matches can be highlighted later
    Field nameField = new Field(FIELD_TERM,
            value,
            Field.Store.YES,
            Field.Index.ANALYZED,
            TermVector.WITH_POSITIONS_OFFSETS);
    //nameField.setOmitNorms(true);
    doc.add(nameField);

    // Synonym flag stored as a one-byte binary field
    doc.add(new Field(FIELD_SYNONYM,
            synonym ? new byte[] {1} : new byte[] {0},
            Field.Store.YES));

    return doc;
}
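
Because FIELD_TERM is indexed with TermVector.WITH_POSITIONS_OFFSETS, its stored vector can be cast to TermPositionVector to recover character offsets, e.g. for highlighting. A hedged sketch against the Lucene 3.x API; the reader, docId, and the query term "melanoma" are illustrative assumptions, not part of the original class:

static void printOffsets(IndexReader reader, int docId) throws IOException {
    TermPositionVector tpv =
            (TermPositionVector) reader.getTermFreqVector(docId, FIELD_TERM);
    int idx = tpv.indexOf("melanoma"); // hypothetical query term
    if (idx >= 0) {
        // one offset entry per occurrence of the term in the document
        for (TermVectorOffsetInfo o : tpv.getOffsets(idx)) {
            System.out.println(o.getStartOffset() + "-" + o.getEndOffset());
        }
    }
}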
Project: community-edition-old    File: AlfrescoSolrDataModel.java
/**
 * A field definition that does not come from the data dictionary: a name
 * plus its Lucene store, index, and term vector settings.
 */
public NonDictionaryField(String name, Store store, Index index, TermVector termVector, boolean multiValued)
{
    this.name = name;
    this.store = store;
    this.index = index;
    this.termVector = termVector;
    this.multiValued = multiValued;
}
Project: WikiKreator    File: CosineDocumentSimilarity.java
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {

    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
    fr.close(); // the stop word list is read eagerly by the constructor

    // Index each sentence as its own document, keeping term vectors
    IndexWriter writer = new IndexWriter(ramDir,
            new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
        Document doc = new Document();
        StringReader reader = new StringReader(s);
        doc.add(new Field("contents", reader, TermVector.YES));
        writer.addDocument(doc);
    }
    writer.close();

    DocVector[] docs = new DocVector[fileSentences.size()];

    // Build a global term -> dimension map for the "contents" field.
    // terms(Term) positions the enumeration on the first matching term,
    // so read term() before calling next() or the first term is skipped.
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));
    int pos = 0;
    do {
        Term term = termEnum.term();
        if (term == null || !"contents".equals(term.field()))
            break;
        terms.put(term.text(), pos++);
    } while (termEnum.next());
    termEnum.close();

    // Turn each document's term vector into a normalized tf-idf vector
    for (int i = 0; i < fileSentences.size(); i++) {
        TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        if (tfvs == null)
            continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                double idfValue = getIDF(RAMreader, termTexts[j]);
                double tfIdfValue = termFreqs[j] * idfValue;
                docs[i].setEntry(termTexts[j], tfIdfValue);
            }
        }
        docs[i].normalize();
    }

    RAMreader.close();
    ramDir.close();
    return docs;
}
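
DocVector and getIDF are used above but not defined in this snippet. A hypothetical, self-contained stand-in consistent with that usage; the dense double[] backing and the 1 + log(N / (df + 1)) idf formula are assumptions, so the original implementation may differ:

// Hypothetical stand-ins for the DocVector and getIDF helpers used above
static class DocVector {
    final Map<String, Integer> terms; // term -> dimension index
    final double[] vector;

    DocVector(Map<String, Integer> terms) {
        this.terms = terms;
        this.vector = new double[terms.size()];
    }

    void setEntry(String term, double weight) {
        Integer idx = terms.get(term);
        if (idx != null)
            vector[idx] = weight;
    }

    // Scale to unit length so cosine similarity is a plain dot product
    void normalize() {
        double norm = 0;
        for (double v : vector)
            norm += v * v;
        norm = Math.sqrt(norm);
        if (norm == 0)
            return;
        for (int i = 0; i < vector.length; i++)
            vector[i] /= norm;
    }
}

// Assumed idf variant; docFreq counts the documents containing the term
static double getIDF(IndexReader reader, String termText) throws IOException {
    int df = reader.docFreq(new Term("contents", termText));
    return 1.0 + Math.log((double) reader.numDocs() / (df + 1));
}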
Project: community-edition-old    File: AlfrescoSolrDataModel.java
private static void addNonDictionaryField(String name, Store store, Index index, TermVector termVector, boolean multiValued)
{
    nonDictionaryFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
Project: community-edition-old    File: AlfrescoSolrDataModel.java
private static void addAdditionalContentField(String name, Store store, Index index, TermVector termVector, boolean multiValued)
{
    additionalContentFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
Project: community-edition-old    File: AlfrescoSolrDataModel.java
private static void addAdditionalTextField(String name, Store store, Index index, TermVector termVector, boolean multiValued)
{
    additionalTextFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
Project: community-edition-old    File: AlfrescoSolrDataModel.java
private static void addAdditionalMlTextField(String name, Store store, Index index, TermVector termVector, boolean multiValued)
{
    additionalMlTextFields.put(name, new NonDictionaryField(name, store, index, termVector, multiValued));
}
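
All four registration helpers above wrap the same NonDictionaryField constructor; they differ only in the map they populate. Illustrative calls (the field names and settings here are hypothetical, not taken from the Alfresco source):

// a stored, untokenized, single-valued field with no term vectors
addNonDictionaryField("OWNER", Store.YES, Index.NOT_ANALYZED, TermVector.NO, false);
// a tokenized, multi-valued text field keeping positions for phrase queries
addAdditionalTextField("TAG", Store.NO, Index.ANALYZED, TermVector.WITH_POSITIONS, true);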
Project: community-edition-old    File: AlfrescoSolrDataModel.java
/**
 * @param field the Solr schema field
 * @return the term vector setting to use for the field; this model always
 *         returns TermVector.NO
 */
public TermVector getFieldTermVec(SchemaField field)
{
    return TermVector.NO;
}
Project: community-edition-old    File: AlfrescoDataType.java
@Override
protected TermVector getFieldTermVec(SchemaField field, String internalVal)
{
    return AlfrescoSolrDataModel.getInstance(id).getFieldTermVec(field);
}
Project: eventspotter    File: CosineSimilarity.java
public double run(String doc1, String doc2) throws IOException
{
    // index both strings (s is a String[] field of the enclosing class),
    // keeping term vectors with positions and offsets
    s[0] = doc1;
    s[1] = doc2;
    Directory index = new RAMDirectory();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    IndexWriter writer = new IndexWriter(index, config);
    for (String si : s) {
        Document doc = new Document();
        doc.add(new Field("content", si, Field.Store.YES, Field.Index.ANALYZED,
                TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
    writer.close();

    // read the index back
    IndexReader reader = IndexReader.open(index);

    // map every term of the "content" field to a vector dimension;
    // terms(Term) is already positioned on the first matching term, so
    // consume term() before next() to avoid skipping it
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = reader.terms(new Term("content"));
    int pos = 0;
    do {
        Term term = termEnum.term();
        if (term == null || !"content".equals(term.field()))
            break;
        terms.put(term.text(), pos++);
    } while (termEnum.next());
    termEnum.close();

    // build a raw term-frequency vector per document and normalize it
    DocVector[] docs = new DocVector[s.length];
    for (int i = 0; i < s.length; i++) {
        TermFreqVector[] tfvs = reader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        if (tfvs == null)
            continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                docs[i].setEntry(termTexts[j], termFreqs[j]);
            }
        }
        docs[i].normalize();
    }

    // cosine similarity between doc[0] and doc[1]
    double cosim01 = getCosineSimilarity(docs[0], docs[1]);
    reader.close();
    return cosim01;
}
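
getCosineSimilarity is not shown in this snippet; because both DocVectors are normalized to unit length first, it reduces to a dot product. A hypothetical helper matching the DocVector sketch shown earlier:

// Hypothetical helper: for unit-length vectors cosine reduces to a dot product
static double getCosineSimilarity(DocVector a, DocVector b) {
    double dot = 0;
    for (int i = 0; i < a.vector.length; i++)
        dot += a.vector[i] * b.vector[i];
    return dot;
}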