Java 类weka.core.tokenizers.Tokenizer 实例源码

项目:AffectiveTweets    文件:TweetToFeatureVector.java   
/**
 * Returns the tokenizer currently stored in this object.
 *
 * @return the value of the {@code m_tokenizer} field
 */
@OptionMetadata(displayName = "tokenizer",
        description = "The tokenizing algorithm to use on the tweets. Uses the CMU TweetNLP tokenizer as default",
        commandLineParamName = "tokenizer",
        commandLineParamSynopsis = "-tokenizer <string>", displayOrder = 3)
public Tokenizer getTokenizer() {
    return this.m_tokenizer;
}
项目:AffectiveTweets    文件:LexiconDistantSupervision.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of the {@code m_tokenizer} field
 */
@OptionMetadata(displayName = "tokenizer",
        description = "The tokenizing algorithm to use on the tweets. Uses the CMU TweetNLP tokenizer as default",
        commandLineParamName = "tokenizer",
        commandLineParamSynopsis = "-tokenizer <string>", displayOrder = 1)
public Tokenizer getTokenizer() {
    return this.m_tokenizer;
}
项目:AffectiveTweets    文件:Utils.java   
/** Matches a run of two or more copies of the same lowercase ASCII letter. */
private static final java.util.regex.Pattern REPEATED_LETTERS =
        java.util.regex.Pattern.compile("([a-z])\\1+");

/** Whole-token match for tokens beginning with "http", "ww." or "www.", which are treated as URLs. */
private static final java.util.regex.Pattern URL_TOKEN =
        java.util.regex.Pattern.compile("http.*|ww\\..*|www\\..*");

/** Whole-token match for tokens beginning with "@", which are treated as user mentions. */
private static final java.util.regex.Pattern USER_MENTION_TOKEN =
        java.util.regex.Pattern.compile("@.*");

/**
 * Tokenizes a String, optionally normalizing it first.
 * <p>
 * Processing order: optional lowercasing, optional reduction of repeated
 * letters, tokenization, stopword filtering, optional replacement of URL and
 * user-mention tokens by generic placeholders, and finally stemming. Note
 * that the stopword check is applied to the token as produced by the
 * tokenizer, i.e. before URL/user standardization and before stemming.
 *
 * @param content the content to tokenize (must not be null)
 * @param toLowerCase true for lowercasing the content; lowercasing is
 *        locale-independent so results do not vary with the JVM default locale
 * @param standarizeUrlsUsers true for standardizing URLs and user mentions
 * @param reduceRepeatedLetters true for reducing runs of a repeated lowercase
 *        ASCII letter to exactly two occurrences
 * @param tokenizer the tokenizer
 * @param stemmer the stemmer applied to every kept token
 * @param stop the stopwords handler; tokens it reports as stopwords are dropped
 * @return a list of tokens
 */
public static List<String> tokenize(String content, boolean toLowerCase, boolean standarizeUrlsUsers, boolean reduceRepeatedLetters, Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) {

    if (toLowerCase) {
        // Locale.ROOT avoids locale-specific case mappings (e.g. Turkish
        // dotless i) that would defeat the ASCII [a-z] handling below.
        content = content.toLowerCase(java.util.Locale.ROOT);
    }

    // A letter appearing two or more times in a row is replaced by exactly
    // two occurrences of it.
    if (reduceRepeatedLetters) {
        content = REPEATED_LETTERS.matcher(content).replaceAll("$1$1");
    }

    List<String> tokens = new ArrayList<String>();

    tokenizer.tokenize(content);
    while (tokenizer.hasMoreElements()) {
        String token = tokenizer.nextElement();
        if (stop.isStopword(token)) {
            continue;
        }

        if (standarizeUrlsUsers) {
            if (URL_TOKEN.matcher(token).matches()) {
                // Replace URLs by a generic URL
                token = "http://www.url.com";
            } else if (USER_MENTION_TOKEN.matcher(token).matches()) {
                // Replace user mentions by a generic user
                token = "@user";
            }
        }

        tokens.add(stemmer.stem(token));
    }

    return tokens;

}
项目:AffectiveTweets    文件:TweetToFeatureVector.java   
/**
 * Stores the given tokenizer in this object.
 *
 * @param m_tokenizer the tokenizer to store; it shadows the field of the
 *        same name, hence the explicit {@code this.} on the assignment
 */
public void setTokenizer(Tokenizer m_tokenizer) {
    this.m_tokenizer = m_tokenizer;
}
项目:AffectiveTweets    文件:LexiconDistantSupervision.java   
/**
 * Replaces the tokenizer held by this object with the given one.
 *
 * @param m_tokenizer the tokenizer to store; the parameter shadows the
 *        field of the same name, so the field is addressed via {@code this.}
 */
public void setTokenizer(Tokenizer m_tokenizer) {
    this.m_tokenizer = m_tokenizer;
}
项目:repo.kmeanspp.silhouette_score    文件:NaiveBayesMultinomialText.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_tokenizer = value;
}
项目:repo.kmeanspp.silhouette_score    文件:NaiveBayesMultinomialText.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
项目:repo.kmeanspp.silhouette_score    文件:SGDText.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_tokenizer = value;
}
项目:repo.kmeanspp.silhouette_score    文件:SGDText.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
项目:repo.kmeanspp.silhouette_score    文件:StringToWordVector.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_Tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_Tokenizer = value;
}
项目:repo.kmeanspp.silhouette_score    文件:StringToWordVector.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_Tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_Tokenizer;
}
项目:autoweka    文件:NaiveBayesMultinomialText.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_tokenizer = value;
}
项目:autoweka    文件:NaiveBayesMultinomialText.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
项目:autoweka    文件:SGDText.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_tokenizer = value;
}
项目:autoweka    文件:SGDText.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
项目:autoweka    文件:StringToWordVector.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_Tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_Tokenizer = value;
}
项目:autoweka    文件:StringToWordVector.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_Tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_Tokenizer;
}
项目:umple    文件:NaiveBayesMultinomialText.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_tokenizer = value;
}
项目:umple    文件:NaiveBayesMultinomialText.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
项目:umple    文件:SGDText.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_tokenizer = value;
}
项目:umple    文件:SGDText.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
项目:umple    文件:StringToWordVector.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_Tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_Tokenizer = value;
}
项目:umple    文件:StringToWordVector.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_Tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_Tokenizer;
}
项目:jbossBA    文件:StringToWordVector.java   
/**
 * Sets the tokenizer held by this object.
 *
 * @param value the tokenizer to store in {@code m_Tokenizer}
 */
public void setTokenizer(Tokenizer value) {
  this.m_Tokenizer = value;
}
项目:jbossBA    文件:StringToWordVector.java   
/**
 * Gets the tokenizer held by this object.
 *
 * @return the value of {@code m_Tokenizer}
 */
public Tokenizer getTokenizer() {
  return this.m_Tokenizer;
}