The following 49 code examples, extracted from open-source Python projects, illustrate how to use nltk.FreqDist().
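Before the project examples, here is a minimal standalone sketch of the core nltk.FreqDist API that recurs throughout: construction from an iterable of tokens, indexed counts, most_common(), N(), and freq(). The sample sentence is made up purely for illustration.

import nltk

tokens = "the quick brown fox jumps over the lazy dog the fox".split()
fd = nltk.FreqDist(tokens)   # count how often each token appears
print(fd['the'])             # absolute count of a single token -> 3
print(fd.most_common(2))     # most frequent first -> [('the', 3), ('fox', 2)]
print(fd.N())                # total number of tokens counted -> 11
print(fd.freq('fox'))        # relative frequency, fd['fox'] / fd.N()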
def get_user_to_word_proportion(user_to_text, word):
    """
    Maps each user to the proportion of his words that consist of a
    specified word.
    """
    user_to_word_proportion = {}
    for user in user_to_text:
        lm = LanuageModel(user_to_text[user])
        n_tokens = len(lm.lowercase_tokens)
        if n_tokens > 0:
            fd = nltk.FreqDist(lm.lowercase_tokens)
            user_to_word_proportion[user] = fd[word] / float(n_tokens)
        else:
            user_to_word_proportion[user] = 0.0
        print 'Finished user {}'.format(user.encode('utf-8'))
    return user_to_word_proportion
def most_frequent_Brown_Corpus_words():
    import nltk
    import nltk.corpus
    words = []
    for word in nltk.corpus.brown.words():
        if word not in [",", ".", "``", "''", ";", "?", "--", ")", "(", ":", "!"]:
            words.append(word.lower())
    frequencies_words = nltk.FreqDist(words).most_common()
    words_most_frequent = [word[0] for word in frequencies_words]
    return words_most_frequent
def _calculate_word_scores(self, phrase_list):
    """Scores words according to frequency and tendency to appear in
    multi-word key phrases"""
    word_freq = nltk.FreqDist()
    word_multiplier = nltk.FreqDist()
    for phrase in phrase_list:
        # Give a higher score if word appears in multi-word candidates
        multi_word = min(2, len(filter(lambda x: not is_numeric(x), phrase)))
        for word in phrase:
            # Normalize by taking the stem
            word_freq[stem(word)] += 1
            word_multiplier[stem(word)] += multi_word
    for word in word_freq.keys():
        word_multiplier[word] = word_multiplier[word] / float(word_freq[word])  # Take average
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_freq[word] * word_multiplier[word]
    return word_scores
def build_dictionary(words, max_df=5):
    word_freq = [[unkown_token, -1], [pad_token, 0]]
    word_freq.extend(nltk.FreqDist(itertools.chain(words)).most_common())
    word_freq = OrderedDict(word_freq)
    word2idx = {unkown_token: 0, pad_token: 1}
    idx2word = {0: unkown_token, 1: pad_token}
    idx = 2
    for w in word_freq:
        f = word_freq[w]
        if f >= max_df:
            word2idx[w] = idx
            idx2word[idx] = w
            idx += 1
        else:
            word2idx[w] = 0  # map the rare word to the unknown token
            word_freq[unkown_token] += 1  # increment the number of unknown tokens
    return word2idx, idx2word, word_freq
def get_corpus_of_most_active_users(n_users=5):
    tweets = []
    texts = []
    with open(DATASET_PATH) as f:
        for line in f:
            tweets.append(json.loads(line)['user']['screen_name'])
            texts.append((json.loads(line)['user']['screen_name'], json.loads(line)['text']))
    users = nltk.FreqDist(tweets).most_common(n_users)

    dict = {}
    for user, tweet in texts:
        if user in dict:
            dict[user] = " ".join([dict[user], tweet])
        else:
            dict[user] = tweet

    corpus = [dict[name] for name, _ in users]
    user_names = [name for name, _ in users]
    return corpus, user_names
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist
    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)

    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1
    return fingerprint
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [FreqDist(post.text) for post in posts]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))
    labels = list(set([post.get('class') for post in posts]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))
    return data
def load_data():
    global N, words

    freqs = [FreqDist(corpus.words(fileid)) for fileid in corpus.fileids()]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))
    return data
def test():
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print 'Query:', documents[-1]

    tokenizer = RegexpTokenizer('\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [doc_code(v) for v in vols[:-1]]
    query_vec = doc_code(vols[-1])

    sims = [cos(v, query_vec) for v in vectors]
    m = max(sims)
    print m, documents[sims.index(m)]
def load_data():
    global N, words

    freqs = [FreqDist(corpus.words(fileid)) for fileid in corpus.fileids()]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))
    return data
def tokenize_sentences(self):
    # tokenize the sentences into words and count the word frequencies
    # get most common words, build index_to_word and word_to_index vectors
    self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in self.sentences]
    word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
    print("Found %d unique word tokens." % len(word_freq.items()))

    vocab = word_freq.most_common(self.vocabulary_size - 1)
    self.index_to_word = [x[0] for x in vocab]
    self.index_to_word.append(self.unknown_token)
    self.word_to_index = dict([(w, i) for i, w in enumerate(self.index_to_word)])

    print("Using vocabulary size %d." % self.vocabulary_size)
    print("The least frequent word is '%s' appearing %d times." % (vocab[-1][0], vocab[-1][1]))

    # replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(self.tokenized_sentences):
        self.tokenized_sentences[i] = [
            w if w in self.word_to_index else self.unknown_token for w in sent]
def load_words(num_words):
    words = get_words_from_nltk()
    fdist = nltk.FreqDist(words)
    fdistmc = fdist.most_common()
    nd = OrderedDict()
    nda = []
    occurences = set([wt[1] for wt in fdistmc])
    occurences = sorted(occurences, key=int, reverse=True)
    for idx in occurences:
        nd[idx] = sorted([wt[0] for wt in fdistmc if wt[1] == idx])
    for key, val in nd.items():
        nda += val
    words = nda[:num_words]
    return words
def statistics_by_aspect():
    filename = "aspects_train.csv"
    words_dist = nltk.ConditionalFreqDist()
    sample_sizes = nltk.FreqDist()

    samples_stream = get_samples_stream(filename)
    for aspect, words in samples_stream:
        sample_sizes[aspect] += 1
        for word in words:
            words_dist[aspect][word] += 1

    for category, dist in words_dist.iteritems():
        print "\n------- Category: {}".format(category)
        print dist.most_common(20)

    total_samples = sample_sizes.N()
    print "\ntotally {} samples".format(total_samples)
    for aspect, count in sample_sizes.iteritems():
        print "aspect[{}] has {} samples, {:.2f}%".format(aspect, count, count * 100.0 / total_samples)
def save_topics(model, filename):
    with open(filename, "wt") as outf:
        # ---------- write each topic and words' contribution
        topics = model.show_topics(num_topics=-1, log=False, formatted=True)
        for topic in topics:
            # topic[0]: topic number
            # topic[1]: topic description
            outf.write("\n############# TOPIC {} #############\n".format(topic[0]))
            outf.write(topic[1] + "\n")

        # ---------- words statistics in all topics
        outf.write("\n\n\n****************** KEY WORDS ******************\n")
        topics = model.show_topics(num_topics=-1, log=False, formatted=False)
        keywords = (word for (_, words) in topics for (word, score) in words)
        fdist = nltk.FreqDist(keywords)
        for index, (w, c) in enumerate(fdist.most_common(100)):
            outf.write("{}-th keyword: <{},{}>\n".format(index + 1, w, c))
def createPopularWords(combined, lowerBound, upperBound):
    allWords = []
    for message in combined:
        for word in message[0]:
            allWords.append(word)

    allWords = nltk.FreqDist(allWords)

    # Grab the top several thousand words, ignoring the lowerBound most popular.
    # Grabbing more words leads to more accurate predictions, at the cost of both memory and compute time.
    # Ignoring the x most popular words is an easy method for handling stop words that are specific to
    # this dataset, rather than just the English language overall.
    popularWords = []
    wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
    for pair in wordsToUse:
        popularWords.append(pair[0])
    return popularWords

# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
def get_word_counts(input_str, limit=100):
    input_str = PreprocessManager.remove_non_ascii(input_str)
    wordnet_lemmatizer = WordNetLemmatizer()
    snowball_stemmer = EnglishStemmer()
    tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
    tokenized_text = [word for word in tokenized_text if len(word) > 1]  # Filter out very short words
    # tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
    filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
    stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]

    # Calculate frequency distribution
    frequency_dist = nltk.FreqDist(stemmed_list)

    # Output the top `limit` words
    result = dict()
    for word, frequency in frequency_dist.most_common(limit):
        # print(u'{};{}'.format(word, frequency))
        result[word] = frequency
    return result

# This function just splits the words and gives the words, that's all!
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()

    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n", FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n", FreqDist(tokens).most_common(15))

    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))

    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if (pos == 'NN')]
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if (pos == 'JJ')]
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))

    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
def plot_common_tokens(self, n_tokens):
    # Remove common stopwords
    fd = nltk.FreqDist(w for w in self.alpha_tokens if w not in s)
    fd.plot(n_tokens)
def get_user_to_word_count(user_to_text, word):
    user_to_word_count = {}
    for user in user_to_text:
        lm = LanuageModel(user_to_text[user])
        fd = nltk.FreqDist(lm.lowercase_tokens)
        user_to_word_count[user] = fd[word]
    return user_to_word_count
def sentence2vec(self, sentence):
    if len(self.features) == 0:
        self.load_feature_model()
    seg_list = jieba.cut(sentence, False)
    freq_dist = nltk.FreqDist(seg_list)
    local_list = []
    for each in self.features:
        local_list.append(freq_dist[each])
    return local_list
def get_freq_dist(self, seg_list):
    freq_dist = []
    for each in seg_list:
        freq_dist.append(nltk.FreqDist(each))
    return freq_dist
def _remove_uncommon_words(cls, tokenized_corpus, vocabulary_size):
    word_count = nltk.FreqDist(itertools.chain(*tokenized_corpus))
    word_count = [cls.WORD_COUNT_ITEM(word=word, count=count)
                  for word, count in word_count.items()]
    word_count = sorted(word_count, key=lambda item: (item.count, item.word), reverse=True)

    most_common_words = [word_count_item.word
                         for word_count_item in
                         word_count[:vocabulary_size - cls.NUMBER_OF_WORDS_TO_ADD_IN_MANUALLY + 1]]

    tokenized_corpus = [
        [word if word in most_common_words else cls.UNKNOWN_TOKEN for word in sentence]
        for sentence in tokenized_corpus
    ]
    return tokenized_corpus
def parse_text(filename, vocabulary_size=9000, type="word"):
    with open(filename, 'rb') as f:
        txt = f.read()

    if type == "word":
        sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
        # sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print("Found %d unique words tokens." % len(word_freq.items()))

        vocab = word_freq.most_common(vocabulary_size - 1)
        index = [sentence_start_token, sentence_end_token, unknown_token] + [x[0] for x in vocab]
        word_to_index = dict([(w, i) for i, w in enumerate(index)])
        print("The least frequent word in our vocabulary is '%s' and appeared %d times."
              % (vocab[-1][0], vocab[-1][1]))

        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

        X_train = np.asarray([[0] + [word_to_index[w] for w in sent] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent] + [1] for sent in tokenized_sentences])
        # X_train, y_train = [], []
        # for sent in tokenized_sentences:
        #     l = len(sent) - 1
        #     X_train.append(coo_matrix((np.ones((l)), (range(l), [word_to_index[w] for w in sent[:-1]])),
        #                               shape=(l, vocabulary_size)).toarray())
        #     y_train.append([word_to_index[w] for w in sent[1:]])
    else:
        sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
        index = ['^', '$'] + list(set(txt))
        char_to_index = dict([(w, i) for i, w in enumerate(index)])
        X_train = np.asarray([[0] + [char_to_index[w] for w in sent] for sent in sentences])
        y_train = np.asarray([[char_to_index[w] for w in sent] + [1] for sent in sentences])

    return X_train, y_train, index
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1
        for sent in self._sent_tokenizer.tokenize(para):
            counts['sents'] += 1
            for word in self._word_tokenizer.tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))

    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1
        for sent in para:
            counts['sents'] += 1
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))

    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
def scoreFunction(wholetext):
    """Get text, find most common words and compare with known
    stopwords. Return dictionary of values"""
    dictiolist = {}
    scorelist = {}
    # These are the available languages with stopwords from NLTK
    NLTKlanguages = ["dutch", "finnish", "german", "italian", "portuguese",
                     "spanish", "turkish", "danish", "english", "french",
                     "hungarian", "norwegian", "russian", "swedish"]
    FREElanguages = [""]
    languages = NLTKlanguages + FREElanguages

    # Fill the dictionary of languages, to avoid unnecessary function calls
    for lang in NLTKlanguages:
        dictiolist[lang] = stopwords.words(lang)

    # Split all the text in tokens and convert to lowercase. In a
    # decent version of this, I'd also clean the unicode
    tokens = word_tokenize(wholetext)
    tokens = [t.lower() for t in tokens]

    # Determine the frequency distribution of words, looking for the
    # most common words
    freq_dist = FreqDist(tokens)

    # This is the only interesting piece, and not by much. Pick a
    # language, and check if each of the 20 most common words is in
    # the language stopwords. If it's there, add 1 to this language
    # for each word matched. So the maximal score is 20. Why 20? No
    # specific reason, looks like a good number of words.
    for lang in languages:
        scorelist[lang] = 0
        for word in freq_dist.keys()[0:20]:
            if word in dictiolist[lang]:
                scorelist[lang] += 1
    return scorelist
def calc_frequencies(words, words_n=50, lang='german'):
    words = [word for word in words if len(word) > 1]
    words = [word for word in words if not word.isnumeric()]
    words = [word.lower() for word in words]
    # words = [word for word in words if word not in all_stopwords]
    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer(lang)
    # words = [stemmer.stem(word) for word in words]
    fdist = nltk.FreqDist(words)
    return fdist.most_common(words_n)
def occurencecount():
    # Ask the user to input a word
    word = raw_input("Enter a word : ")
    # Create a list of files which we will be looking into for matches
    fileList = ['Text1.txt', 'Text2.txt', 'Text3.txt', 'Text4.txt']
    # Open the files one by one, read them and find the occurrence count inside each file
    for filename in fileList:
        # Open the file
        fp_text = codecs.open(filename, 'r', 'utf-8')
        # Read all the words inside the file
        words_text = word_tokenize(fp_text.read())
        # Find the number of occurrences of each word using the built-in method from NLTK
        fd_text = FreqDist(words_text)
        # Print out the number of occurrences for that specific word
        print("Number of occurences in " + filename + " : " + str(fd_text[word]))
def get_words(tweets):
    """Given a set of tweets, return the most frequently-used words."""
    tweets = filter(lambda x: not(x.is_rt), tweets)
    tokenized = [nltk.word_tokenize(handle_strip(t.tweet_text)) for t in tweets]
    words = [item for sublist in tokenized for item in sublist]
    longwords = filter(lambda x: len(x) > 6, words)
    lcwords = map(lambda x: x.lower(), longwords)
    fdist = nltk.FreqDist(lcwords)
    common = fdist.most_common(100)
    common = filter(lambda x: x[1] > 4, common)
    common = map(lambda x: [x[0], 6 + int(x[1] / 3)], common)
    return common
def make_data(file_name):
    '''Returns Tuple of dataframes used in analysis:
    core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df'''
    # realDonaldTrump_master_tweet_list.json
    # TODO: fix so strings aren't written to file and we can just load it as json.
    with open(file_name) as tfile:
        lines = tfile.readlines()
    raw_tweets_data = [eval(t) for t in lines]
    analyzer = TextAnalyzer(raw_tweets_data)
    english_stopwords = stopwords.words("english")
    core_tweet_df = analyzer.make_tweet_df(
        with_pos_tags=False,
        columns_to_filter=['id', 'created_at', 'text', 'retweet_count', 'favorite_count'])

    # get list of tweets as text
    tweets_list = core_tweet_df.text.tolist()
    pos_df = analyzer.make_pos_df(tweets_list, make_csv=False)
    adj_df = pos_df[pos_df.pos_tag == 'JJ']
    adj_df = analyzer.make_word_frequency_df(adj_df, 'word', make_csv=False)

    # calculate word frequencies among other words in data set. can't merge with pos
    # because certain words have many parts of speech.
    word_frequency_df = analyzer.make_word_frequency_df(pos_df, 'word', make_csv=False)

    # Most common hashtags and total unique hashtags.
    all_hashtags = []
    for i in raw_tweets_data:
        all_hashtags.extend([d['text'] for d in i['entities']['hashtags']])
    fd = FreqDist(all_hashtags)
    hash_df = pd.DataFrame([{'hashtag': x,
                             'abs_frequency': y,
                             'rel_frequency_pct': float(y) / len(all_hashtags) * 100}
                            for x, y in fd.most_common()])

    return core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df
async def zipf(self, message, users):
    source_user = message.author.name
    source_user = source_user.strip('@').split('#')[0]
    target_users = [user.strip('@').split('#')[0] for user in users.split()]
    if len(users) == 0:
        target_users = [source_user]
    if users == '*':
        if message.server is not None:
            target_users = [member.name for member in message.server.members]
            target_users = [user for user in target_users
                            if self.check_nickname_valid(user.lower()) is None]

    image_file_name = self.quotes_file_name(source_user.lower())[:-4] + '.png'
    pylab.title('Word frequencies')
    for user in target_users:
        quotes_file = codecs.open(self.quotes_file_name(user.lower()), 'r', encoding='utf-8')
        lines = quotes_file.readlines()
        quotes_file.close()
        if len(lines) < 20:
            continue

        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        tokens = self.filter_to_english_words(tokenizer.tokenize(str(lines)))
        if len(tokens) < 200:
            continue

        freq = nltk.FreqDist(tokens)
        self.plot_word_frequencies(freq, user)

    pylab.legend()
    pylab.savefig(image_file_name)
    pylab.gcf().clear()
    await self.client.send_file(message.channel, image_file_name)
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [x[0] for x in vocab]
    # word2index
    word2index = dict([(w, i) for i, w in enumerate(index2word)])
    return index2word, word2index, freq_dist
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    vocab = [item for item in vocab if item[1] > 1]
    # index2word
    index2word = ['_'] + ['UNK'] + list(POS_TAGS.keys()) + [x[0] for x in vocab]
    # word2index
    word2index = dict([(w, i) for i, w in enumerate(index2word)])
    return index2word, word2index, freq_dist
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent,
    a new nation, conceived in Liberty, and dedicated to the proposition that all men are created
    equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so
    conceived and so dedicated, can long endure. We are met on a great battle-field of that war.
    We have come to dedicate a portion of that field, as a final resting place for those who here
    gave their lives that that nation might live. It is altogether fitting and proper that we
    should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can
    not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated
    it, far above our poor power to add or detract. The world will little note, nor long remember
    what we say here, but it can never forget what they did here. It is for us the living, rather,
    to be dedicated here to the unfinished work which they who fought here have thus far so nobly
    advanced. It is rather for us to be here dedicated to the great task remaining before us --
    that from these honored dead we take increased devotion to that cause for which they gave the
    last full measure of devotion -- that we here highly resolve that these dead shall not have
    died in vain -- that this nation, under God, shall have a new birth of freedom -- and that
    government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer('\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w
    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]
        if topic in gettysburg_tokens:
            topics.append(topic)
        del pred[topic_idx]

    print 'Topics of the Gettysburg Address:'
    print topics
def getFeat(self, line):
    listItem = [0] * self.noFeat
    fileFreqDist = nltk.FreqDist(SVM.tokenize(line))

    i = 0
    for key in self.trainKeys:
        if fileFreqDist.has_key(key):
            listItem[i] = fileFreqDist.get(key)
        i = i + 1
    return listItem
def main():
    freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
    vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
    for w in vocab:
        print w
def take_some_analysis(file_dir):
    context_length = []
    utterance_length = []
    dist = nltk.FreqDist()
    for c, u in utterance_generator(file_dir):
        c_tokens = nltk.word_tokenize(c)
        u_tokens = nltk.word_tokenize(u)
        # record lengths and update the token frequency distribution
        context_length.append(len(c_tokens))
        utterance_length.append(len(u_tokens))
        dist.update(c_tokens + u_tokens)

    cl_array = np.array(context_length)
    ul_array = np.array(utterance_length)

    print("most length of context is %d" % cl_array.max())
    print("most length of utterance is %d" % ul_array.max())
    print("mean length of context is %f" % cl_array.mean())
    print("mean length of utterance is %f" % ul_array.mean())

    sub_abs = np.abs(cl_array - ul_array)
    print("max,min,mean of abs(context_length - utterance_length) is %f,%f,%f" % (
        np.max(sub_abs), np.min(sub_abs), np.mean(sub_abs)))
    print("most common words :")
    print(dist.most_common(10))
def preprocess_data(self):
    # Read the data and append SENTENCE_START and SENTENCE_END tokens
    print "Reading CSV file..."
    with open('data/reddit-comments-2015-08.csv', 'rb') as f:
        reader = csv.reader(f, skipinitialspace=True)
        reader.next()
        # Split full comments into sentences
        sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
    print "Parsed %d sentences." % (len(sentences))

    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print "Found %d unique words tokens." % len(word_freq.items())

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(self.vocabulary_size - 1)
    self.index_to_word = [x[0] for x in vocab]
    self.index_to_word.append(unknown_token)
    self.word_to_index = dict([(w, i) for i, w in enumerate(self.index_to_word)])

    print "Using vocabulary size %d." % self.vocabulary_size
    print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

    # Replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [w if w in self.word_to_index else unknown_token for w in sent]

    print "\nExample sentence: '%s'" % sentences[0]
    print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

    # Create the training data
    # tokenized_words = [item for sublist in tokenized_sentences for item in sublist]
    # self.X_train = np.asarray([self.word_to_index[w] for w in tokenized_words[:-1]])
    # self.Y_train = np.asarray([self.word_to_index[w] for w in tokenized_words[1:]])
    self.X_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
    self.Y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
def checkSentenceSanity(sentence):
    """Checks the sanity of the sentence. If the sentence is, for example,
    all uppercase, it is rejected."""
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':
        return False
    return True
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
def get_top_ngrams(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)

    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams
def sample_split(dbname, num_train, num_test):
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db.sentiment_sentences

    # ---------- load and count
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()
    all_samples = []
    cursor = sentisent_collection.aggregate([{'$sample': {'size': num_train + num_test}}])
    for index, d in enumerate(cursor):
        sent = Sentence.from_dict(d)
        all_samples.append((sent.words, sent.sentiment))
        aspect_dist[sent.aspect] += 1
        sentiment_dist[int(sent.sentiment)] += 1
    client.close()

    # ---------- show statistics
    for k in aspect_dist:
        print '[{}]: {}'.format(k, aspect_dist.freq(k))
    for k in sentiment_dist:
        print '[{}]: {}'.format(k, sentiment_dist.freq(k))

    # ---------- shuffle
    random.shuffle(all_samples)

    # ---------- split
    def __dump(filename, data):
        with open(filename, "wb") as outf:
            cPickle.dump(data, outf)

    __dump("sentidata_train_raw.pkl", all_samples[:num_train])
    __dump("sentidata_test_raw.pkl", all_samples[num_train:])