The following three code examples, extracted from open-source Python projects, illustrate how to use the nltk.probability module.
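Before the project examples, here is a minimal sketch of the two classes from nltk.probability that all three rely on; the sample tokens are made up purely for illustration:

from nltk.probability import FreqDist, ConditionalFreqDist

# hypothetical tokens, just to exercise the API
tokens = ['good', 'bad', 'good', 'fine']
fd = FreqDist(tokens)
print(fd['good'], fd.N())   # 2 4 -> count of one word, total number of samples
print(fd.most_common(2))    # the two most frequent (word, count) pairs

cfd = ConditionalFreqDist()
for tag, word in [('pos', 'good'), ('pos', 'fine'), ('neg', 'bad')]:
    cfd[tag][word] += 1     # one FreqDist per condition
print(cfd['pos'].N())       # 2 -> total tokens observed under the 'pos' condition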
def create_word_scores(posWords, negWords, posTag, negTag):
    import itertools
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures

    posWords = list(itertools.chain(*posWords))  # flatten the list of positive token lists
    negWords = list(itertools.chain(*negWords))  # flatten the list of negative token lists

    word_fd = FreqDist()                  # overall word frequencies
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on sentiment tag

    for word in posWords:
        word_fd[word] += 1                # replaces the old word_fd.inc(word) API
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # total number of positive tokens
    neg_word_count = cond_word_fd[negTag].N()  # total number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():    # items(); iteritems() was Python 2 only
        # chi-squared association between the word and each sentiment class
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # higher sum = more informative word
    return word_scores
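A quick usage sketch, assuming the inputs are lists of tokenized documents; the reviews below are invented for illustration:

# hypothetical pre-tokenized reviews; a real project would load a labeled corpus
pos_reviews = [['great', 'movie'], ['really', 'great', 'acting']]
neg_reviews = [['boring', 'movie'], ['really', 'boring']]
scores = create_word_scores(pos_reviews, neg_reviews, 'pos', 'neg')
# the highest-scoring words are the most informative sentiment features
print(sorted(scores.items(), key=lambda ws: ws[1], reverse=True)[:3])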
def generate_squad_vocab(path, vocabulary_size=30000):
    import json
    import itertools
    import nltk
    from nltk.probability import FreqDist

    with open(path) as f:
        d = json.load(f)

    tokenized_sentences = []
    for reading in d['data']:
        for paragraph in reading['paragraphs']:
            sentence = paragraph['context'].lower()
            tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))
            for question in paragraph['qas']:
                sentence = question['question'].lower()
                # TODO: later check whether to add the answer text as well or not
                tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))

    word_freq = FreqDist(itertools.chain(*tokenized_sentences))
    print('total uniq words:', len(word_freq))

    # write the full vocabulary, one "word<TAB>count" pair per line
    full_vocab = word_freq.most_common(len(word_freq))
    with open('vocab_full.txt', 'w') as vocab:
        for w in full_vocab:
            vocab.write(w[0] + '\t' + str(w[1]) + '\n')

    # write the truncated vocabulary (one slot reserved, e.g. for an unknown-word token)
    shorted_vocab = word_freq.most_common(vocabulary_size - 1)
    with open('vocab.txt', 'w') as vocab:
        for w in shorted_vocab:
            vocab.write(w[0] + '\t' + str(w[1]) + '\n')
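To run it, point the function at a SQuAD-format JSON file; note that nltk.tokenize.word_tokenize needs the punkt tokenizer models (nltk.download('punkt')). The file name below is an assumption, not part of the original example:

# hypothetical path to a SQuAD training file; adjust to your local download
generate_squad_vocab('train-v1.1.json', vocabulary_size=30000)
# writes vocab_full.txt (every word) and vocab.txt (top vocabulary_size-1 words)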
def create_word_scores(posWords, negWords):  # import all corpora
    import itertools
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures

    def count_fd(valueWords, tag):
        # flatten the nested token lists and build the frequency distributions
        words = list(itertools.chain(*valueWords))
        word_fd = FreqDist()                  # overall word frequencies
        cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the tag
        for word in words:
            word_fd[word] += 1
            cond_word_fd[tag][word] += 1
        word_count = cond_word_fd[tag].N()    # total token count for this tag
        return word_fd, cond_word_fd, tag, word_count

    total_word_count = count_fd(posWords, 'pos')[3] + count_fd(negWords, 'neg')[3]

    # get word scores
    def all_word_scores(total_word_count, fd_info):
        word_fd, cond_word_fd, tag, word_count = fd_info
        word_score = []
        for word, freq in word_fd.items():    # items(); iteritems() was Python 2 only
            # chi-squared association between the word and this tag
            score = BigramAssocMeasures.chi_sq(cond_word_fd[tag][word],
                                               (freq, word_count), total_word_count)
            word_score.append((word, score))
        return word_score

    word_scores = {}
    for word, score in all_word_scores(total_word_count, count_fd(posWords, 'pos')):
        word_scores.setdefault(word, score)
    # note: the original passed posWords here by mistake; negWords is intended
    for word, score in all_word_scores(total_word_count, count_fd(negWords, 'neg')):
        # setdefault keeps the positive score when a word appears in both classes
        word_scores.setdefault(word, score)
    return word_scores
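Unlike the first example, this refactored version stores a single chi-squared score per word (setdefault keeps the score from whichever class is processed first) rather than summing the positive and negative scores. Calling it is otherwise the same; the token lists below are again invented:

pos_reviews = [['great', 'movie'], ['really', 'great', 'acting']]
neg_reviews = [['boring', 'movie'], ['really', 'boring']]
scores = create_word_scores(pos_reviews, neg_reviews)
print(sorted(scores.items(), key=lambda ws: ws[1], reverse=True)[:3])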