/**
 * Creates an array of terms matching the prefix, capped at {@link
 * BooleanQuery#getMaxClauseCount()} entries (1024 by default) so the
 * result can safely back a boolean query.
 */
public Term[] expand(IndexReader ir, String field, String prefix) throws IOException {
    if( prefix.isEmpty() ) {
        return new Term[0];
    }
    ArrayList<Term> terms = Lists.newArrayList();
    // terms(Term) positions the enumeration at the first term >= the given term
    try( TermEnum t = ir.terms(new Term(field, prefix)) ) {
        do {
            Term term = t.term();
            if( term == null || !term.text().startsWith(prefix)
                || terms.size() >= BooleanQuery.getMaxClauseCount() ) {
                break;
            }
            terms.add(term);
        }
        while( t.next() );
    }
    return terms.toArray(new Term[terms.size()]);
}
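/*
 * Usage sketch (not from the original sources): OR the expanded terms into
 * a BooleanQuery. The reader, the field name "title" and the prefix "luc"
 * are hypothetical; the clause cap enforced in expand() above guarantees
 * the loop cannot exceed BooleanQuery.getMaxClauseCount().
 */
BooleanQuery query = new BooleanQuery();
for (Term t : expand(reader, "title", "luc")) {
    query.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
}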
public Map<String, Object> getTotalTermsAndTermsByDocFreq() throws IOException {
    Map<String, Object> r = new HashMap<String, Object>();
    TermEnum terms = reader.terms();
    int max = 0;
    int num = 0;
    Map<Integer, List<String>> counts = new HashMap<Integer, List<String>>();
    try {
        while (terms.next()) {
            int docFreq = terms.docFreq();
            if (docFreq > max) {
                max = docFreq;
            }
            if (!counts.containsKey(docFreq)) {
                counts.put(docFreq, new ArrayList<String>());
            }
            counts.get(docFreq).add(terms.term().text());
            num++;
        }
    } finally {
        terms.close();
    }
    r.put(TOTAL_NUM_TERMS, num);
    r.put(TERMS_BY_DOC_FREQ, counts);
    r.put(MAX_TERM_FREQ, max);
    return r;
}
/**
 * Returns the list of terms extracted from the indexed documents.
 *
 * @return list of extracted terms
 */
@Override
public List<String> getTerms() {
    ArrayList<String> termList = new ArrayList<>();
    try {
        TermEnum terms = ireader.terms();
        try {
            while (terms.next()) {
                termList.add(terms.term().text());
            }
        } finally {
            terms.close();
        }
    } catch (IOException ex) {
        Logger.getLogger(LuceneIndexing.class.getName()).log(Level.SEVERE, null, ex);
    }
    return termList;
}
@Override
public boolean isInIndex(String fieldType, long id) throws IOException {
    String target = NumericEncoder.encode(id);
    RefCounted<SolrIndexSearcher> refCounted = null;
    Term term = null;
    try {
        refCounted = core.getSearcher(false, true, null);
        TermEnum termEnum = refCounted.get().getReader().terms(new Term(fieldType, target));
        try {
            term = termEnum.term();
        } finally {
            termEnum.close();
        }
    } finally {
        if (refCounted != null) {
            refCounted.decref();
        }
    }
    // terms(Term) may land on a term in a later field, so check the field too
    return term != null && fieldType.equals(term.field()) && target.equals(term.text());
}
public Set<FieldValue> getTerms(int docId, Set<String> fieldNameSet) throws IOException {
    TermPositions termPosition = indexReader.termPositions();
    Set<FieldValue> fieldValueSet = new HashSet<FieldValue>();
    try {
        for (String fieldName : fieldNameSet) {
            List<FieldValueItem> fieldValueItemList = new ArrayList<FieldValueItem>();
            TermEnum termEnum = indexReader.terms(new Term(fieldName, ""));
            try {
                Term term;
                // term() may be null when the enumeration is exhausted
                while ((term = termEnum.term()) != null) {
                    if (!term.field().equals(fieldName))
                        break;
                    termPosition.seek(term);
                    if (termPosition.skipTo(docId) && termPosition.doc() == docId)
                        fieldValueItemList.add(new FieldValueItem(FieldValueOriginEnum.TERM_ENUM, term.text()));
                    if (!termEnum.next())
                        break;
                }
            } finally {
                termEnum.close();
            }
            if (!fieldValueItemList.isEmpty())
                fieldValueSet.add(new FieldValue(fieldName, fieldValueItemList));
        }
    } finally {
        termPosition.close();
    }
    return fieldValueSet;
}
@Override
public <T> void queryTerms(
        @NonNull Collection<? super T> result,
        @NullAllowed Term start,
        @NonNull StoppableConvertor<Term, T> filter,
        @NullAllowed AtomicBoolean cancel) throws IOException, InterruptedException {
    Parameters.notNull("result", result);   //NOI18N
    Parameters.notNull("filter", filter);   //NOI18N
    lock.readLock().lock();
    try {
        final IndexReader in = getReader();
        if (in == null) {
            return;
        }
        final TermEnum terms = start == null ? in.terms() : in.terms(start);
        try {
            do {
                if (cancel != null && cancel.get()) {
                    throw new InterruptedException();
                }
                final Term currentTerm = terms.term();
                if (currentTerm != null) {
                    final T vote = filter.convert(currentTerm);
                    if (vote != null) {
                        result.add(vote);
                    }
                }
            } while (terms.next());
        } catch (StoppableConvertor.Stop stop) {
            //Stop iteration of TermEnum
        } finally {
            terms.close();
        }
    } finally {
        lock.readLock().unlock();
    }
}
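/*
 * Illustrative convertor for queryTerms() (an assumption, not part of the
 * original sources): StoppableConvertor is taken to declare a single
 * convert method throwing StoppableConvertor.Stop, as the adapters below
 * suggest. Returning null skips a term; throwing Stop ends the iteration.
 * The field name "name" is hypothetical.
 */
final StoppableConvertor<Term, String> collectFieldTexts = new StoppableConvertor<Term, String>() {
    @Override
    public String convert(Term term) throws StoppableConvertor.Stop {
        return "name".equals(term.field()) ? term.text() : null;
    }
};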
@Override
public T convert(@NonNull final TermEnum terms) throws StoppableConvertor.Stop {
    final Term currentTerm = terms.term();
    if (currentTerm == null) {
        return null;
    }
    return delegate.convert(currentTerm);
}
@Override
public T convert(TermEnum terms) throws StoppableConvertor.Stop {
    final Term currentTerm = terms.term();
    if (currentTerm == null) {
        return null;
    }
    final int freq = terms.docFreq();
    return delegate.convert(accessor.setTermFreq(tf, currentTerm, freq));
}
private void getAllTerms() throws IOException {
    final TermEnum terms = reader.getIndexReader().terms();
    try {
        while (terms.next()) {
            Term term = terms.term();
            if (reader.docFreq(term) <= MIN_DOC_FREQ) {
                continue;
            }
            String original = term.text().trim();
            if (original.equals("S")) {
                textLabels.add(original);
                continue;
            }
            if (original.length() < 2) {
                continue;
            }
            if (original.length() == 2) {
                // Keep two-letter terms only when fully uppercase,
                // and skip the majority test below so they are added once
                if (original.equals(original.toUpperCase())) {
                    textLabels.add(original);
                }
                continue;
            }
            // Keep longer terms when more than half the characters are uppercase
            int uppercases = 0;
            for (int i = 0; i < original.length() && uppercases <= original.length() / 2; i++) {
                if (Character.isUpperCase(original.charAt(i))) {
                    uppercases++;
                }
            }
            if (uppercases > original.length() / 2) {
                textLabels.add(original);
            }
        }
    } finally {
        terms.close();
    }
}
public int findDocFreq(String term) throws IOException {
    TermEnum terms = reader.terms();
    try {
        while (terms.next()) {
            Term t = terms.term();
            if (t.text().equals(term)) {
                return reader.docFreq(t);
            }
        }
    } finally {
        terms.close();
    }
    return 0;
}
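/*
 * Alternative sketch (not from the original sources): when the field is
 * known, IndexReader.docFreq(Term) returns the same count directly and
 * avoids scanning every term in the index. The field name "contents" is
 * hypothetical.
 */
public int findDocFreqDirect(String text) throws IOException {
    return reader.docFreq(new Term("contents", text));
}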
private long getLastTxCommitTimeBeforeHoles(SolrIndexReader reader, Long cutOffTime) throws IOException {
    long lastTxCommitTimeBeforeHoles = 0;
    TermEnum termEnum = reader.terms(new Term(QueryConstants.FIELD_TXCOMMITTIME, ""));
    try {
        do {
            Term term = termEnum.term();
            if (term == null || !term.field().equals(QueryConstants.FIELD_TXCOMMITTIME)) {
                break;
            }
            long txCommitTime = NumericEncoder.decodeLong(term.text());
            if (txCommitTime < cutOffTime) {
                lastTxCommitTimeBeforeHoles = txCommitTime;
            } else {
                break;
            }
        } while (termEnum.next());
    } finally {
        termEnum.close();
    }
    return lastTxCommitTimeBeforeHoles;
}
private long getLastChangeSetCommitTimeBeforeHoles(SolrIndexReader reader, Long cutOffTime) throws IOException {
    long lastChangeSetCommitTimeBeforeHoles = 0;
    TermEnum termEnum = reader.terms(new Term(QueryConstants.FIELD_ACLTXCOMMITTIME, ""));
    try {
        do {
            Term term = termEnum.term();
            if (term == null || !term.field().equals(QueryConstants.FIELD_ACLTXCOMMITTIME)) {
                break;
            }
            long txCommitTime = NumericEncoder.decodeLong(term.text());
            if (txCommitTime < cutOffTime) {
                lastChangeSetCommitTimeBeforeHoles = txCommitTime;
            } else {
                break;
            }
        } while (termEnum.next());
    } finally {
        termEnum.close();
    }
    return lastChangeSetCommitTimeBeforeHoles;
}
private long getLastTransactionCommitTime(SolrIndexReader reader) {
    long lastTxCommitTime = 0;
    try {
        TermEnum termEnum = reader.terms(new Term(QueryConstants.FIELD_TXCOMMITTIME, ""));
        try {
            do {
                Term term = termEnum.term();
                if (term == null || !term.field().equals(QueryConstants.FIELD_TXCOMMITTIME)) {
                    break;
                }
                lastTxCommitTime = NumericEncoder.decodeLong(term.text());
            } while (termEnum.next());
        } finally {
            termEnum.close();
        }
    } catch (IOException e1) {
        // ignore and return the best value seen so far
    }
    return lastTxCommitTime;
}
private long getLastTransactionId(SolrIndexReader reader) throws IOException {
    long lastTxId = 0;
    try {
        TermEnum termEnum = reader.terms(new Term(QueryConstants.FIELD_TXID, ""));
        try {
            do {
                Term term = termEnum.term();
                if (term == null || !term.field().equals(QueryConstants.FIELD_TXID)) {
                    break;
                }
                lastTxId = NumericEncoder.decodeLong(term.text());
            } while (termEnum.next());
        } finally {
            termEnum.close();
        }
    } catch (IOException e1) {
        // ignore and return the best value seen so far
    }
    return lastTxId;
}
private long getLastChangeSetId(SolrIndexReader reader) throws IOException {
    long lastAclTxId = 0;
    try {
        TermEnum termEnum = reader.terms(new Term(QueryConstants.FIELD_ACLTXID, ""));
        try {
            do {
                Term term = termEnum.term();
                if (term == null || !term.field().equals(QueryConstants.FIELD_ACLTXID)) {
                    break;
                }
                lastAclTxId = NumericEncoder.decodeLong(term.text());
            } while (termEnum.next());
        } finally {
            termEnum.close();
        }
    } catch (IOException e1) {
        // ignore and return the best value seen so far
    }
    return lastAclTxId;
}
private long getLastChangeSetCommitTime(SolrIndexReader reader) throws IOException {
    long lastChangeSetCommitTime = 0;
    try {
        TermEnum termEnum = reader.terms(new Term(QueryConstants.FIELD_ACLTXCOMMITTIME, ""));
        try {
            do {
                Term term = termEnum.term();
                if (term == null || !term.field().equals(QueryConstants.FIELD_ACLTXCOMMITTIME)) {
                    break;
                }
                lastChangeSetCommitTime = NumericEncoder.decodeLong(term.text());
            } while (termEnum.next());
        } finally {
            termEnum.close();
        }
    } catch (IOException e1) {
        // ignore and return the best value seen so far
    }
    return lastChangeSetCommitTime;
}
public static boolean fieldHasTerm(IndexReader indexReader, String field) {
    try {
        TermEnum termEnum = indexReader.terms(new Term(field, ""));
        try {
            // terms(Term) is already positioned on the first term >= (field, ""),
            // so read term() directly; calling next() first would skip it
            Term first = termEnum.term();
            return first != null && first.field().equals(field);
        } finally {
            termEnum.close();
        }
    } catch (IOException e) {
        throw new AlfrescoRuntimeException("Could not find terms for sort field", e);
    }
}
public Map<String, Integer> getAllTermFreqFromItems() throws IOException {
    Map<String, Integer> map = new HashMap<String, Integer>();
    String indexDir = this.aCase.getCaseLocation() + File.separator + ApplicationConstants.CASE_INDEX_FOLDER;
    Directory dir = FSDirectory.open(new File(indexDir));
    IndexReader indexReader = IndexReader.open(dir);
    try {
        TermEnum terms = indexReader.terms();
        try {
            // keep only terms occurring in at least 1% of the documents
            int factor = indexReader.maxDoc() / 100;
            while (terms.next()) {
                if (isCancelledTask())
                    break;
                Term term = terms.term();
                if (this.isAllowedFeild(term.field().trim())) {
                    String termText = term.text();
                    int frequency = indexReader.docFreq(term);
                    if (frequency >= factor)
                        map.put(termText, frequency);
                }
            }
        } finally {
            terms.close();
        }
        System.out.println("map size: " + map.size());
    } finally {
        indexReader.close();
    }
    return map;
}
public static Map<String, Double> getAllFilesFrequency(final CaseFacade caseFacade) throws IOException {
    Map<String, Double> map = new HashMap<String, Double>();
    String indexDir = caseFacade.getCaseIndexFolderLocation();
    Directory dir = FSDirectory.open(new File(indexDir));
    IndexReader indexReader = IndexReader.open(dir);
    try {
        TermEnum te = indexReader.terms(new Term(IndexingConstant.FILE_PATH, ""));
        try {
            // terms(Term) is already positioned on the first matching term, so
            // read term() before advancing; terms are sorted by field, so stop
            // once we leave FILE_PATH
            do {
                Term currentTerm = te.term();
                if (currentTerm == null || !currentTerm.field().equals(IndexingConstant.FILE_PATH))
                    break;
                String file = currentTerm.text();
                String fullPath = caseFacade.getFullPath(file);
                String ext = FileUtil.getExtension(fullPath);
                if (ext == null || ext.length() > 6) // skip implausibly long extensions
                    continue;
                ext = ext.toLowerCase();
                if (map.get(ext) == null) {
                    map.put(ext, 1.0);
                } else {
                    map.put(ext, map.get(ext) + 1);
                }
            } while (te.next());
        } finally {
            te.close();
        }
    } finally {
        indexReader.close();
    }
    return map;
}
/**
 * Reads the document UUIDs within the index.
 * @param startIndex the index at which to begin reading
 * @param maxUuids the maximum number of UUIDs to read
 * @return the set of UUIDs
 * @throws CatalogIndexException if an exception occurs
 */
private StringSet readUuids(int startIndex, int maxUuids) throws CatalogIndexException {
    StringSet ssUuids = new StringSet();
    IndexSearcher searcher = null;
    TermEnum terms = null;
    try {
        String sField = Storeables.FIELD_UUID;
        searcher = newSearcher();
        terms = searcher.getIndexReader().terms(new Term(sField, ""));
        int nCount = 0;
        while ((terms.term() != null) && sField.equals(terms.term().field())) {
            if (nCount >= startIndex) {
                ssUuids.add(terms.term().text());
            }
            nCount++;
            if (nCount >= (startIndex + maxUuids)) break;
            if (!terms.next()) break;
        }
    } catch (Exception e) {
        String sMsg = "Error accessing index:\n " + Val.chkStr(e.getMessage());
        throw new CatalogIndexException(sMsg, e);
    } finally {
        try { if (terms != null) terms.close(); } catch (Exception ef) {}
        closeSearcher(searcher);
    }
    return ssUuids;
}
static LuceneUnsortedIntTermDocIterator create(final IndexReader r, final String field) throws IOException {
    final TermEnum terms = r.terms(new Term(field, ""));
    final TermDocs termDocs;
    try {
        termDocs = r.termDocs();
    } catch (IOException e) {
        try {
            terms.close();
        } catch (IOException e1) {
            log.error("error closing TermEnum", e1);
        }
        throw e;
    }
    return new LuceneUnsortedIntTermDocIterator(field, terms, termDocs);
}
@Override
public void accept(TermEnum termEnum) throws IOException, SearchLibException {
    Term term;
    while ((term = termEnum.term()) != null) {
        if (isAborted())
            break;
        if (!fieldName.equals(term.field()))
            break;
        docCount = indexTerm(term.text(), termEnum.docFreq(), buffer, docCount);
        termEnum.next();
    }
}
@Override
public void accept(TermEnum termEnum) throws IOException, SearchLibException {
    Term term;
    while ((term = termEnum.term()) != null) {
        if (isAborted())
            break;
        if (!fieldName.equals(term.field()))
            break;
        final TermDocsConsumer termDocsConsumer = new TermDocsConsumer();
        sourceClient.getIndex().termDocs(term, termDocsConsumer);
        if (termDocsConsumer.add)
            docCount = indexTerm(term.text(), termEnum.docFreq(), buffer, docCount);
        termEnum.next();
    }
}
@Override
public void termEnum(final FunctionUtils.ConsumerEx<TermEnum, IOException> termEnumConsumer)
        throws IOException, SearchLibException {
    try (final TermEnum termEnum = indexReader.terms()) {
        termEnumConsumer.accept(termEnum);
    }
}
@Override
public void termEnum(final Term term,
        final FunctionUtils.ConsumerEx2<TermEnum, IOException, SearchLibException> termEnumConsumer)
        throws IOException, SearchLibException {
    try (final TermEnum termEnum = indexReader.terms(term)) {
        termEnumConsumer.accept(termEnum);
    }
}
@Override
public final void termEnum(final FunctionUtils.ConsumerEx<TermEnum, IOException> termEnumConsumer)
        throws IOException, SearchLibException {
    checkOnline(true);
    final ReaderLocal reader = acquire();
    try {
        reader.termEnum(termEnumConsumer);
    } finally {
        release(reader);
    }
}
@Override
public final void termEnum(final Term term,
        final FunctionUtils.ConsumerEx2<TermEnum, IOException, SearchLibException> termEnumConsumer)
        throws IOException, SearchLibException {
    checkOnline(true);
    final ReaderLocal reader = acquire();
    try {
        reader.termEnum(term, termEnumConsumer);
    } finally {
        release(reader);
    }
}
private TermEnum getTermEnum(@NonNull final IndexReader reader) {
    return new TermEnum() {
        private Iterator<String> pkgsIt = pkgs.iterator();
        private String current;

        {
            // position on the first package, mimicking IndexReader.terms(Term)
            next();
        }

        @Override
        public boolean next() {
            if (pkgsIt == null) {
                throw new IllegalStateException("Already closed.");    //NOI18N
            }
            if (pkgsIt.hasNext()) {
                current = pkgsIt.next();
                return true;
            } else {
                current = null;
                return false;
            }
        }

        @Override
        public Term term() {
            return current == null ? null : new Term(DocumentUtil.FIELD_PACKAGE_NAME, current);
        }

        @Override
        public int docFreq() {
            return current == null ? -1 : 0;
        }

        @Override
        public void close() throws IOException {
            pkgsIt = null;
        }
    };
}
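/*
 * Consumption sketch (not from the original sources): because the instance
 * initializer above already positions the enum on the first package, a
 * caller must read term() before calling next(), exactly as with a
 * positioned IndexReader.terms(Term) enumeration.
 */
List<String> packageNames = new ArrayList<String>();
TermEnum e = getTermEnum(reader);
try {
    do {
        Term t = e.term();  // null once the package list is exhausted
        if (t == null) {
            break;
        }
        packageNames.add(t.text());
    } while (e.next());
} finally {
    e.close();
}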
static <T> StoppableConvertor<TermEnum, T> newTermEnumToTermConvertor(
        @NonNull StoppableConvertor<Term, T> delegate) {
    return new TermEnumToTerm<T>(delegate);
}
static <T> StoppableConvertor<TermEnum, T> newTermEnumToFreqConvertor(
        @NonNull StoppableConvertor<Index.WithTermFrequencies.TermFreq, T> delegate) {
    return new TermEnumToFreq<T>(delegate);
}
public void seek(TermEnum termEnum) throws IOException {
    // Delegate seek to the wrapped instance
    in.seek(termEnum);
}
public void seek(TermEnum termEnum) throws IOException {
    throw new UnsupportedOperationException();
}
public String suggestTerm(final Search request, final String prefix, final boolean isSearchAttachment) {
    return search(new Searcher<String>() {
        @Override
        public String search(IndexSearcher searcher) throws IOException {
            // Get the reader
            IndexReader reader = searcher.getIndexReader();

            // Get all the docs from the filters
            Collection<Filter> filters = getFilters(request);
            Query query = getQuery(request, reader, isSearchAttachment);
            filters.add(new QueryWrapperFilter(query));
            ChainedFilter chain = new ChainedFilter(filters.toArray(new Filter[filters.size()]), ChainedFilter.AND);
            DocIdSet filteredSet = chain.getDocIdSet(reader);

            // Get docs that contain terms that begin with the prefix
            List<Term> termList = Lists.newArrayList();
            termList.add(new Term(FreeTextQuery.FIELD_BODY_NOSTEM, prefix));
            termList.add(new Term(FreeTextQuery.FIELD_ATTACHMENT_VECTORED_NOSTEM, prefix));
            for( Term term : termList ) {
                TermDocs termDocs = reader.termDocs(term);
                TermEnum terms = reader.terms(term);
                // advance() must never move backwards, so take a fresh
                // iterator over the filter for each seek term
                DocIdSetIterator iterator = filteredSet.iterator();
                try {
                    // Check if doc is in the filter and return the term;
                    // stay within the seek field while matching the prefix
                    for( Term t = terms.term();
                         t != null && t.field().equals(term.field()) && t.text().startsWith(prefix);
                         t = terms.term() ) {
                        termDocs.seek(t);
                        while( termDocs.next() ) {
                            int docId = termDocs.doc();
                            if( docId == iterator.advance(docId) ) {
                                return t.text();
                            }
                        }
                        if( !terms.next() ) {
                            break;
                        }
                    }
                } finally {
                    termDocs.close();
                    terms.close();
                }
            }
            return "";
        }
    });
}
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    Term startTerm = new Term(field, start);
    Term endTerm = new Term(field, end);
    TermEnum enumerator = reader.terms(startTerm);
    try {
        if( enumerator.term() == null ) {
            return bits;
        }
        TermDocs termDocs = reader.termDocs();
        try {
            Term current = enumerator.term();
            while( current.compareTo(endTerm) <= 0 ) {
                termDocs.seek(current);
                while( termDocs.next() ) {
                    bits.set(termDocs.doc());
                }
                if( !enumerator.next() ) {
                    break;
                }
                current = enumerator.term();
            }
        } finally {
            termDocs.close();
        }
    } finally {
        enumerator.close();
    }
    return bits;
}
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

    // Index the full text of every sentence, storing term vectors
    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
        Document doc = new Document();
        StringReader reader = new StringReader(s);
        doc.add(new Field("contents", reader, TermVector.YES));
        writer.addDocument(doc);
    }
    writer.close();

    DocVector[] docs = new DocVector[fileSentences.size()];

    // Build the global term -> position dictionary
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));
    int pos = 0;
    // terms(Term) is already positioned on the first matching term,
    // so read term() before calling next() or the first term is lost
    do {
        Term term = termEnum.term();
        if (term == null || !"contents".equals(term.field()))
            break;
        terms.put(term.text(), pos++);
    } while (termEnum.next());
    termEnum.close();

    // Build a normalized tf-idf vector for each document
    for (int i = 0; i < fileSentences.size(); i++) {
        TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        if (tfvs == null)
            continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                double idfValue = getIDF(RAMreader, termTexts[j]);
                double tfIdfValue = termFreqs[j] * idfValue;
                docs[i].setEntry(termTexts[j], tfIdfValue);
            }
        }
        docs[i].normalize();
    }
    RAMreader.close();
    ramDir.close();
    return docs;
}
public static HashSet<Long> buildReaderAclIds(SolrIndexSearcher searcher, String authority, long[] aclIdByDocId)
        throws IOException {
    HashSet<Long> aclsAsSet = new HashSet<Long>();
    IndexReader reader = searcher.getReader();
    TermEnum termEnum = reader.terms(new Term("READER", authority));
    try {
        Term term = termEnum.term();
        if (term == null) {
            return aclsAsSet;
        }
        if (term.field().equals("READER") && term.text().equals(authority)) {
            TermDocs termDocs = reader.termDocs(term);
            try {
                while (termDocs.next()) {
                    int currentDoc = termDocs.doc();
                    long acl = aclIdByDocId[currentDoc];
                    aclsAsSet.add(acl);
                }
            } finally {
                termDocs.close();
            }
        }
        return aclsAsSet;
    } finally {
        termEnum.close();
    }
}
public HashSet<Long> buildReaderAclIds(SolrIndexSearcher searcher, String authority, long[] aclIdByDocId)
        throws IOException {
    HashSet<Long> aclsAsSet = new HashSet<Long>();
    IndexReader reader = searcher.getReader();
    TermEnum termEnum = reader.terms(new Term("READER", authority));
    try {
        Term term = termEnum.term();
        if (term == null) {
            return aclsAsSet;
        }
        if (term.field().equals("READER") && term.text().equals(authority)) {
            TermDocs termDocs = reader.termDocs(term);
            try {
                while (termDocs.next()) {
                    int currentDoc = termDocs.doc();
                    long acl = aclIdByDocId[currentDoc];
                    aclsAsSet.add(acl);
                }
            } finally {
                termDocs.close();
            }
        }
        return aclsAsSet;
    } finally {
        termEnum.close();
    }
}