private StringToWordVector parseTokenizer(StringToWordVector filter) {
    switch (Constants.CONFIG.getTokenizer()) {
        case ALPHABETIC:
            // Avoid. Does not support diacritics (ã, á, é, etc.)
            filter.setTokenizer(new AlphabeticTokenizer());
            break;
        case WORD:
            NGramTokenizer tokenizer = new NGramTokenizer();
            tokenizer.setNGramMaxSize(Constants.CONFIG.getNGrams());
            filter.setTokenizer(tokenizer);
            break;
        case OPENNLP:
            // TODO use WordTokenizer with a specific delimiter set via options
            // and printed by OpenNLP.
            break;
        case COGROO:
            // TODO use WordTokenizer with a specific delimiter set via options
            // and printed by CoGrOO.
            break;
        default:
            throw new IllegalArgumentException(
                    Constants.CONFIG.getTokenizer() + " is not implemented.");
    }
    return filter;
}
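// Usage sketch (not from the original codebase; "rawData" and the helper name
// are hypothetical): once parseTokenizer has configured the tokenizer, the
// filter is applied the standard Weka way via weka.filters.Filter.
private Instances vectorize(Instances rawData) throws Exception {
    StringToWordVector filter = parseTokenizer(new StringToWordVector());
    filter.setInputFormat(rawData);            // learn the dictionary from the input
    return Filter.useFilter(rawData, filter);  // produce the word-vector dataset
}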
private StringToWordVector getStringToWordVectorFilter(Instances instances) throws Exception {
    StringToWordVector stringToWordVector = new StringToWordVector();
    // Restrict the filter to the five string attributes carrying surface-text / POS-tag features.
    stringToWordVector.setAttributeIndices(indicesToRangeList(new int[]{
            instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_TWO_PRECEDING_AND_FOLLOWING_TOKENS_AROUND_THE_DESC_CANDIDATE).index(),
            instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_THREE_PRECEDING_AND_FOLLOWING_TOKENS_AROUND_THE_PAIRED_MATH_EXPR).index(),
            instances.attribute(SURFACE_TEXT_OF_THE_FIRST_VERB_THAT_APPEARS_BETWEEN_THE_DESC_CANDIDATE_AND_THE_TARGET_MATH_EXPR).index(),
            instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_DEPENDENCY_WITH_LENGTH_3_FROM_IDENTIFIER).index(),
            instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_DEPENDENCY_WITH_LENGTH_3_FROM_DEFINIEN).index()}));
    stringToWordVector.setWordsToKeep(1000);
    // Build 1- to 3-grams; drop ':' from the default delimiters so tokens
    // containing a colon (e.g. word:POS pairs) are kept whole.
    NGramTokenizer nGramTokenizer = new NGramTokenizer();
    nGramTokenizer.setNGramMaxSize(3);
    nGramTokenizer.setNGramMinSize(1);
    nGramTokenizer.setDelimiters(nGramTokenizer.getDelimiters().replaceAll(":", ""));
    stringToWordVector.setTokenizer(nGramTokenizer);
    stringToWordVector.setInputFormat(instances);
    return stringToWordVector;
}
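// Why the replaceAll(":", "") above matters, as a standalone sketch (the sample
// string is made up): with ':' removed from the delimiter set, combined
// "surface:POS" tokens survive tokenization as single units.
NGramTokenizer demo = new NGramTokenizer();
demo.setDelimiters(demo.getDelimiters().replaceAll(":", ""));
demo.tokenize("the:DT dog:NN barks:VBZ");
while (demo.hasMoreElements()) {
    System.out.println(demo.nextElement());  // tokens include "the:DT", "dog:NN", "the:DT dog:NN", ...
}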
/**
 * Creates a {@link StringToWordVector} filter with a 3-gram {@link Tokenizer}
 * and stop word handling.
 *
 * @param instances the dataset which is to be filtered
 * @return the filter
 * @throws Exception if filter creation fails
 */
private StringToWordVector createFilter(Instances instances) throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMaxSize(3);

    WordsFromFile stopwordsHandler = new WordsFromFile();
    stopwordsHandler.setStopwords(FileUtils.loadFile(resourceLoader,
            dataConfig.getBaseDataDirectory() + dataConfig.getStopWordsDirectory()));

    StringToWordVector stwv = new StringToWordVector();
    stwv.setTokenizer(tokenizer);
    stwv.setTFTransform(true);
    stwv.setIDFTransform(true);
    stwv.setStopwordsHandler(stopwordsHandler);
    stwv.setLowerCaseTokens(true);
    stwv.setInputFormat(instances);
    return stwv;
}
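// One way a filter like the one above is commonly consumed (a sketch; the
// classifier choice and the "trainingInstances" variable are assumptions, not
// part of the original code). FilteredClassifier applies the StringToWordVector
// transparently at both training and prediction time.
FilteredClassifier classifier = new FilteredClassifier();   // weka.classifiers.meta
classifier.setFilter(createFilter(trainingInstances));
classifier.setClassifier(new NaiveBayesMultinomial());      // weka.classifiers.bayes
classifier.buildClassifier(trainingInstances);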
public static StringToWordVector WordNgrams(Properties prop) throws Exception {
    final StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    // Boolean presence vectors: no counts, no TF/IDF weighting.
    filter.setOutputWordCounts(false);
    filter.setTFTransform(false);
    filter.setIDFTransform(false);
    // Optional stop-word removal, currently disabled ("MotsVides" is French for stop words):
    //if (prop.getProperty("Preprocessings.removeStopWords").equalsIgnoreCase("yes"))
    //    filter.setStopwords(new File("ressources//MotsVides.txt"));
    filter.setWordsToKeep(10000);
    filter.setMinTermFreq(1);
    // N-gram sizes are read from the configuration ("Ngrams.min" / "Ngrams.max").
    NGramTokenizer tok = new NGramTokenizer();
    tok.setDelimiters(" \n .,;'\"()?!-/<>‘’“”…«»•&{[|`^]}$*%");
    tok.setNGramMinSize(Integer.parseInt(prop.getProperty("Ngrams.min")));
    tok.setNGramMaxSize(Integer.parseInt(prop.getProperty("Ngrams.max")));
    filter.setTokenizer(tok);
    return filter;
}
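// Minimal driver for WordNgrams (property values are examples; "data" is an
// assumed Instances variable holding string attributes):
Properties prop = new Properties();
prop.setProperty("Ngrams.min", "1");
prop.setProperty("Ngrams.max", "2");
StringToWordVector ngramFilter = WordNgrams(prop);
ngramFilter.setInputFormat(data);
Instances vectors = Filter.useFilter(data, ngramFilter);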
@Override
public Tokenizer create(String toTokenize) {
    // Configure a fresh Weka NGramTokenizer from the factory's settings,
    // then wrap it together with the input text and the token pre-processor.
    this.wekaTokenizer = new NGramTokenizer();
    this.wekaTokenizer.setNGramMinSize(this.nMin);
    this.wekaTokenizer.setNGramMaxSize(this.nMax);
    this.wekaTokenizer.setDelimiters(this.delimiters);
    WekaTokenizer t = new WekaTokenizer(toTokenize, wekaTokenizer);
    t.setTokenPreProcessor(tokenPreProcess);
    return t;
}
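// Usage sketch for the factory method above. Assumption: the surrounding class
// follows a DeepLearning4J-style TokenizerFactory contract (only create() is
// shown in the original), so hasMoreTokens()/nextToken() mirror DL4J's
// Tokenizer interface; adjust if the local Tokenizer interface differs.
Tokenizer tokens = factory.create("weka n gram tokenizer demo");  // "factory" is hypothetical
while (tokens.hasMoreTokens()) {
    System.out.println(tokens.nextToken());
}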
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {
    final String html = documentContentData.getContent();

    // Wrap the raw HTML string in a single-instance Weka dataset.
    final Attribute input = new Attribute(HTML, (ArrayList<String>) null);
    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);
    final Instances htmlInst = new Instances(HTML, inputVec, 1);
    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    // Treat every word shorter than five characters as a stop word.
    final StopwordsHandler stopwordsHandler = new StopwordsHandler() {
        @Override
        public boolean isStopword(final String word) {
            return word.length() < 5;
        }
    };

    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(TOKEN_DELIMITERS);

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(stopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();
    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);
        final Instance last = dataFiltered.lastInstance();

        // Each attribute of the filtered instance is a word; its value is the count.
        final int numAttributes = last.numAttributes();
        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}",
                documentContentData.getId(), e);
    }
    return result;
}
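// Follow-up sketch (helper not present in the original class): the map returned
// by calculateWordCount is unordered, so a caller wanting the most frequent
// words first can sort the entries by count. Requires java.util.stream.Collectors.
private static List<Map.Entry<String, Integer>> sortByCountDescending(final Map<String, Integer> wordCounts) {
    return wordCounts.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
            .collect(Collectors.toList());
}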