The following 5 code examples, extracted from open-source Python projects, illustrate how to use nltk.ConditionalFreqDist().
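Before the examples, a minimal sketch of the API itself (the toy data is purely illustrative): nltk.ConditionalFreqDist is built from (condition, sample) pairs, and indexing it by a condition yields an ordinary nltk.FreqDist.

import nltk

pairs = [('fruit', 'apple'), ('fruit', 'pear'), ('fruit', 'apple'),
         ('tool', 'hammer')]
cfd = nltk.ConditionalFreqDist(pairs)

print(cfd.conditions())            # e.g. ['fruit', 'tool']
print(cfd['fruit'].most_common())  # [('apple', 2), ('pear', 1)]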
Example 1:

import nltk

def statistics_by_aspect():
    filename = "aspects_train.csv"
    words_dist = nltk.ConditionalFreqDist()
    sample_sizes = nltk.FreqDist()

    samples_stream = get_samples_stream(filename)
    for aspect, words in samples_stream:
        sample_sizes[aspect] += 1
        for word in words:
            words_dist[aspect][word] += 1

    for category, dist in words_dist.items():
        print("\n------- Category: {}".format(category))
        print(dist.most_common(20))

    total_samples = sample_sizes.N()
    print("\n{} samples in total".format(total_samples))
    for aspect, count in sample_sizes.items():
        print("aspect[{}] has {} samples, {:.2f}%".format(
            aspect, count, count * 100.0 / total_samples))
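The helper get_samples_stream is not shown in the source project. A minimal sketch of what it might look like, assuming the CSV stores one sample per row with the aspect label in the first column and the sample text in the second (both the file layout and the whitespace tokenization are assumptions):

import csv

def get_samples_stream(filename):
    # Hypothetical reader: yields (aspect, words) pairs from a CSV whose
    # rows are assumed to look like "<aspect>,<text>"; split() stands in
    # for whatever tokenizer the original project used.
    with open(filename, newline='') as f:
        for aspect, text in csv.reader(f):
            yield aspect, text.split()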
Example 2:

import nltk

def findtags(self, tag_prefix, tagged_text):
    '''
    Find all words that match a 'tag' (word type) prefix

    :param tag_prefix: The tag prefix
    :type tag_prefix: ``str``
    :param tagged_text: The text to search, as (word, tag) pairs
    :type tagged_text: ``list`` of ``tuple``
    '''
    cfd = nltk.ConditionalFreqDist((tag, word)
                                   for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].most_common(50)) for tag in cfd.conditions())
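As a usage sketch outside whatever class the method belongs to, the same pattern works directly on NLTK's tagged corpora; brown.tagged_words() supplies the (word, tag) pairs. The 'NN' prefix and the news category are arbitrary choices:

import nltk
from nltk.corpus import brown  # requires nltk.download('brown')

tagged = brown.tagged_words(categories='news')
cfd = nltk.ConditionalFreqDist((tag, word)
                               for (word, tag) in tagged
                               if tag.startswith('NN'))
for tag in sorted(cfd.conditions())[:5]:
    print(tag, cfd[tag].most_common(5))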
Example 3:

import nltk
from nltk.util import ngrams

def ngram_baseline(text):
    # text is expected to be a sequence of (token, flag) pairs; keep
    # only the bigrams whose second member carries flag 1, and count
    # the resulting token transitions.
    refine = []
    for first, second in ngrams(text, 2):
        if second[1] == 1:
            refine.append((first[0], second[0]))
    return nltk.ConditionalFreqDist(refine)
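A hypothetical query against the returned distribution; the key 'the' is only an illustrative token:

cfd = ngram_baseline(text)          # text as described above
print(cfd['the'].most_common(10))   # most frequent successors of 'the'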
Example 4:

import nltk
from konlpy.tag import Mecab  # assumed: KoNLPy's Mecab wrapper

def calc_cfd(doc):
    # Conditional frequency distribution over bigrams of the morphemes
    # in doc; the part-of-speech tags from Mecab are discarded.
    words = [w for w, t in Mecab().pos(doc)]
    bigrams = nltk.bigrams(words)
    return nltk.ConditionalFreqDist(bigrams)
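The returned object maps each morpheme to a FreqDist of the morphemes that follow it, so a query might look like this (doc stands in for any Korean text):

cfd = calc_cfd(doc)
for condition in list(cfd.conditions())[:3]:
    print(condition, cfd[condition].most_common(5))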
Example 5:

import nltk
from collections import Counter
from nltk.util import ngrams
from numpy.random import choice  # choice(..., p=...) matches NumPy's API

def generate_from_trigrams(lm, start_words, n_words):
    """
    Backoff model.

    lm: language model object; lm.lowercase_tokens must be nonempty
    start_words: list of two strings
    n_words: integer >= 0, number of words to generate, not counting
        start_words
    """
    # Create probability maps
    trigram_counter = Counter(ngrams(lm.lowercase_tokens, 3))
    trigram_prob = trigram_prob_map(trigram_counter)
    bigram_cfd = nltk.ConditionalFreqDist(ngrams(lm.lowercase_tokens, 2))
    bigram_prob = bigram_prob_map(bigram_cfd)
    unigram_counter = Counter(lm.lowercase_tokens)
    unigram_prob = unigram_prob_map(unigram_counter)

    # Build sentence, backing off from trigram to bigram to unigram context
    w1, w2 = start_words[0], start_words[1]
    words = [w1, w2]
    for _ in range(n_words):
        if (w1, w2) in trigram_prob:    # use trigram
            prob_map = trigram_prob[(w1, w2)]
        elif w2 in bigram_prob:         # use bigram
            prob_map = bigram_prob[w2]
        else:                           # use unigram
            prob_map = unigram_prob
        next_words = list(prob_map.keys())
        next_word = choice(next_words, p=[prob_map[w] for w in next_words])
        # Shift the context window and record the sampled word
        w1, w2 = w2, next_word
        words.append(w2)
    return ' '.join(words)
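The trigram_prob_map, bigram_prob_map and unigram_prob_map helpers belong to the surrounding project and are not shown. Since bigram_prob_map is the one fed by a ConditionalFreqDist, a plausible reconstruction (purely an assumption, not the original code) is:

def bigram_prob_map(cfd):
    # Hypothetical helper: for each context word, turn its FreqDist of
    # successors into a dict of relative frequencies that sum to 1.
    return {context: {w: count / cfd[context].N()
                      for w, count in cfd[context].items()}
            for context in cfd.conditions()}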