Java class org.apache.lucene.analysis.tokenattributes.TermAttribute example source code
Project: hadoop-distributed-crawler
File: URLAnalyzer.java
/**
 * Tokenizes the given text and records each term and its position in urlInfo.
 * @param text the text to tokenize
 */
private void segment(String text) {
    // IK Analyzer in smart-segmentation mode
    IKAnalyzer analyzer = new IKAnalyzer(true);
    StringReader reader = new StringReader(text);
    TokenStream tokenStream = analyzer.tokenStream("*", reader);
    // addAttribute returns the stream's existing TermAttribute, or registers one
    TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
    try {
        while (tokenStream.incrementToken()) {
            location++;
            urlInfo.putURLLocation(termAtt.term(), location);
        }
        tokenStream.close();
    } catch (IOException exp) {
        exp.printStackTrace();
    }
}
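Note: TermAttribute was deprecated in Lucene 3.1 and removed in 4.0 in favor of CharTermAttribute. On current Lucene versions the same loop would look roughly like this sketch (the analyzer, urlInfo, and location members are assumed to exist as in the original):

private void segment(String text) throws IOException {
    // try-with-resources closes the stream; reset()/end() are mandatory on modern Lucene
    try (TokenStream ts = analyzer.tokenStream("*", new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            location++;
            urlInfo.putURLLocation(termAtt.toString(), location);
        }
        ts.end();
    }
}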
Project: olat
File: SearchInputController.java
protected Set<String> getHighlightWords(final String searchString) {
    try {
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
        final TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        final Set<String> words = new HashSet<String>();
        for (boolean next = stream.incrementToken(); next; next = stream.incrementToken()) {
            final String term = termAtt.term();
            if (log.isDebugEnabled()) {
                log.debug(term);
            }
            // collect the analyzed terms instead of discarding them
            words.add(term);
        }
        return words;
    } catch (final IOException e) {
        log.error("", e);
    }
    return null;
}
Project: MFIBlocking
File: WordProcessor.java
public List<String> removeStopwordsAndSpecialChars(String value) {
    List<String> retVal = new ArrayList<String>();
    value = value.replaceAll(replaceExpr, "");
    StringReader sr = new StringReader(value);
    TokenStream ts = analyzer.tokenStream(value, sr);
    // the attribute instance is cached by the stream, so fetch it once outside the loop
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            retVal.add(termAtt.term());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return retVal;
}
Project: align-api-project
File: CommonWords.java
private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while ( ts.incrementToken() ) {
            s.add( termAtt.term() );
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    map.put(e, s);
}
Project: t4f-data
File: SynonymAnalyzerTest.java
public void testJumps() throws Exception {
    TokenStream stream =
        synonymAnalyzer.tokenStream("contents",                 // #A
                                    new StringReader("jumps")); // #A
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    int i = 0;
    String[] expected = new String[]{"jumps",   // #B
                                     "hops",    // #B
                                     "leaps"};  // #B
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.term());
        int expectedPos;      // #C
        if (i == 0) {         // #C
            expectedPos = 1;  // #C
        } else {              // #C
            expectedPos = 0;  // #C
        }                     // #C
        assertEquals(expectedPos,                     // #C
                     posIncr.getPositionIncrement()); // #C
        i++;
    }
    assertEquals(3, i);
}
Project: airsonic
File: SearchService.java
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
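For example, an input of café déjà vu should come back as cafe* deja* vu* (with a trailing space): StandardTokenizer splits the query into terms and ASCIIFoldingFilter strips the diacritics, so each term is turned into a prefix query.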
Project: subsonic
File: SearchService.java
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
Project: lesk-wsd-dsm
File: RevisedLesk.java
/**
 * Builds a bag of words, mapping each (optionally stemmed) term to its frequency in the text.
 * @param text the text to analyze
 * @return a map from each term to its frequency
 * @throws IOException if tokenization fails
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING, "No stemmer for language {0}", language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    // the attribute instance is cached by the stream, so fetch it once (no cast needed)
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        String term = termAtt.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    return bag;
}
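For instance, with English stemming enabled, buildBag("the cats sat on the mat") should yield roughly {cat=1.0, sat=1.0, mat=1.0}: StandardAnalyzer drops the stopwords and the Snowball stemmer reduces "cats" to "cat". (Illustrative output; the exact terms depend on the analyzer version and its stopword list.)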
Project: align-api-project
File: JWNLDistances.java
/**
 * Takes a gloss-like string and returns it tokenized, applying:
 * - stopword removal
 * - lower-casing
 * - the Porter stemmer
 */
protected Set<String> tokenizeGloss( String s ) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
                         new StopFilter( true,
                             new LowerCaseTokenizer(
                                 new StringReader( s ) ), stopWords, true ));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while ( ts.incrementToken() ) {
        result.add( termAtt.term() );
    }
    return result;
}
Project: align-api-project
File: VectorSpaceMeasure.java
/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 * @param toAnalyse the string to be analysed
 * @param words the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        while ( tokenS.incrementToken() ) {
            words.add( termAtt.term() );
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Project: t4f-data
File: SynonymFilter.java
public SynonymFilter(TokenStream in, SynonymEngine engine) {
    super(in);
    synonymStack = new Stack<String>(); //#1
    this.engine = engine;
    this.termAtt = addAttribute(TermAttribute.class);
    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
}
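The constructor only wires up the attributes and the synonym stack. For context, here is a sketch of a matching incrementToken(), following the Lucene in Action design this class comes from; the engine.getSynonyms() call and the current field (an AttributeSource.State) are assumptions based on that book, not verified against this project:

public boolean incrementToken() throws IOException {
    if (synonymStack.size() > 0) {
        // emit a buffered synonym at the same position as the original token
        String syn = synonymStack.pop();
        restoreState(current);
        termAtt.setTermBuffer(syn);
        posIncrAtt.setPositionIncrement(0);
        return true;
    }
    if (!input.incrementToken()) {
        return false; // underlying stream is exhausted
    }
    String[] synonyms = engine.getSynonyms(termAtt.term());
    if (synonyms != null) {
        for (String synonym : synonyms) {
            synonymStack.push(synonym);
        }
        current = captureState(); // remember this token's state for its synonyms
    }
    return true;
}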
Project: t4f-data
File: ChineseDemo.java
private static void analyze(String string, Analyzer analyzer)
        throws IOException {
    StringBuffer buffer = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("contents",
                                              new StringReader(string));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) { //C
        buffer.append("[");
        buffer.append(term.term());
        buffer.append("] ");
    }
    String output = buffer.toString();
    Frame f = new Frame();
    f.setTitle(analyzer.getClass().getSimpleName() + " : " + string);
    f.setResizable(true);
    Font font = new Font(null, Font.PLAIN, 36);
    int width = getWidth(f.getFontMetrics(font), output);
    f.setSize((width < 250) ? 250 : width + 50, 75);
    // NOTE: if Label doesn't render the Chinese characters
    // properly, try using javax.swing.JLabel instead
    Label label = new Label(output); //D
    label.setSize(width, 75);
    label.setAlignment(Label.CENTER);
    label.setFont(font);
    f.add(label);
    f.setVisible(true);
}
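A plausible driver for this demo, showing how different analyzers segment the same Chinese string (the sample text and analyzer choices here are illustrative, not taken from the project; CJKAnalyzer ships in the separate lucene-analyzers module):

public static void main(String[] args) throws IOException {
    String text = "道德經"; // hypothetical sample text
    analyze(text, new StandardAnalyzer(Version.LUCENE_30)); // one token per CJK character
    analyze(text, new CJKAnalyzer(Version.LUCENE_30));      // overlapping bigrams
}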
Project: SynonymAnalyzer
File: MySynonymFilter.java
protected MySynonymFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(TermAttribute.class);
    synonymMap.put("lucene", "information retrieval");
    synonymMap.put("c#", "csharp");
}
Project: SynonymAnalyzer
File: LuceneTest.java
public static void testStandardAnalyzer() throws Exception {
    System.out.println("Standard Analyzer");
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream ts = analyzer.tokenStream("Field", new StringReader("The quick brown fox jumps over lazy dog"));
    ts.reset();
    while (ts.incrementToken()) {
        //System.out.println("token: " + ts.toString());
        System.out.println("token: " + ts.getAttribute(TermAttribute.class).term());
    }
    ts.close();
}
Project: SynonymAnalyzer
File: LuceneTest.java
public static void testSynonymAnalyzer() throws Exception {
    Analyzer analyzer = new SynonymAnalyzer();
    TokenStream ts = analyzer.tokenStream("Address", new StringReader("Expertise in C# and Lucene"));
    ts.reset();
    while (ts.incrementToken()) {
        //System.out.println("token: " + ts.toString());
        System.out.println("token: " + ts.getAttribute(TermAttribute.class).term());
    }
    ts.close();
}
Project: t4f-data
File: BulletinPayloadsFilter.java
BulletinPayloadsFilter(TokenStream in, float warningBoost) {
    super(in);
    payloadAttr = addAttribute(PayloadAttribute.class);
    termAtt = addAttribute(TermAttribute.class);
    boostPayload = new Payload(PayloadHelper.encodeFloat(warningBoost));
}
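For context, a sketch of how such a filter typically attaches the payload; the isBulletin flag and the "warning" trigger term follow the Lucene in Action example this class appears to come from, so treat them as assumptions:

boolean isBulletin; // hypothetical flag toggled by the indexing code

public final boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    if (isBulletin && termAtt.term().equals("warning")) {
        payloadAttr.setPayload(boostPayload); // boost "warning" terms in bulletins
    } else {
        payloadAttr.setPayload(null);         // ordinary terms carry no payload
    }
    return true;
}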
Project: t4f-data
File: PositionalStopFilter.java
public PositionalStopFilter(TokenStream in, CharArraySet stopWords) {
    super(in);
    this.stopWords = stopWords;
    // addAttribute is generic, so the casts in the original are unnecessary
    posIncrAttr = addAttribute(PositionIncrementAttribute.class);
    termAttr = addAttribute(TermAttribute.class);
}
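The point of this filter, as opposed to a plain StopFilter, is to keep position increments accurate across removed stopwords. A sketch of a matching incrementToken(), assuming the fields declared above:

public final boolean incrementToken() throws IOException {
    int skipped = 0; // positions consumed by removed stopwords
    while (input.incrementToken()) {
        if (!stopWords.contains(termAttr.termBuffer(), 0, termAttr.termLength())) {
            // carry the skipped positions forward so phrase queries still align
            posIncrAttr.setPositionIncrement(posIncrAttr.getPositionIncrement() + skipped);
            return true;
        }
        skipped += posIncrAttr.getPositionIncrement();
    }
    return false;
}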