The following are 30 code examples, extracted from open-source Python projects, that illustrate how to use sklearn.decomposition.NMF.
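Before the individual examples, here is a minimal sketch of the fit_transform / components_ pattern that nearly all of them share; the toy matrix X and the parameter choices are illustrative assumptions, not taken from any of the projects below.

import numpy as np
from sklearn.decomposition import NMF

# Toy non-negative matrix (think of it as a tiny document-term count matrix).
X = np.array([[1, 0, 2],
              [0, 3, 1],
              [4, 1, 0],
              [2, 2, 2]], dtype=float)

# Factorise X ~ W @ H with two latent components.
model = NMF(n_components=2, init='nndsvd', random_state=0)
W = model.fit_transform(X)        # sample-to-component weights, shape (4, 2)
H = model.components_             # component-to-feature loadings, shape (2, 3)

print(model.reconstruction_err_)  # Frobenius-norm reconstruction error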
def print_topic_summary(self, df, topic_num, num_words=20):
    '''
    Function to print summary of a topic from NMF clustering

    INPUT:
        df: pandas DataFrame that NMF clustering was run on
        topic_num: index of topic from clustering
        num_words: top n words to print in summary
    '''
    num_reviews = self.labels[:, topic_num].sum()
    print 'Summary of Topic {}:'.format(topic_num)
    print 'Number of reviews in topic: {}'.format(num_reviews)
    print 'Top {} words in topic:'.format(num_words)
    print self.top_words_by_topic(num_words, topic_num)
    if not num_reviews:
        return None
def new(n_feature=128):
    vectorizer = CountVectorizer(
        encoding='utf-8',
        ngram_range=(1, 1),  # Unigram only
        max_features=n_feature,
        binary=True
    )

    # Fill the gap (missing expected tags)
    # ---
    # Hypothesis: Some tags are somehow related so
    # we smoothen the missing values with matrix factorisation.
    smoother = NMF(n_components=n_feature)

    # Binarise the vector's individual values
    binariser = Binarizer(copy=True)

    # Count vectoriser => NMF as smoother => Binariser
    print(colored('Taghasher model created', 'yellow'))
    return [vectorizer, smoother, binariser]
def test_nmf(eng):
    t = linspace(0, 10, 100)
    s1 = 1 + absolute(sin(t))
    s2 = 1 + square(cos(2*t))
    h = c_[s1, s2].T
    w = array([[1, 0], [0, 1], [1, 1]])
    x = dot(w, h)
    x = fromarray(x, engine=eng)

    from sklearn.decomposition import NMF as skNMF
    nmf = skNMF(n_components=2, random_state=0)
    w1 = nmf.fit_transform(x.toarray())
    h1 = nmf.components_
    xhat1 = dot(w1, h1)

    w2, h2 = NMF(k=2, seed=0).fit(x)
    xhat2 = dot(w2, h2)

    tol = 1e-1
    assert allclose(xhat1, xhat2, atol=tol)
def run_nmf(alignment, num_clusters, alpha, mixing):
    model = NMF(n_components=num_clusters, init='nndsvd', alpha=alpha,
                l1_ratio=mixing, verbose=0)
    return model.fit_transform(alignment)
def fit_nmf(self, df):
    '''
    Function to run NMF clustering on dataframe

    INPUT:
        df: pandas DataFrame containing 'lemmatized_text' column for TF-IDF
    '''
    self.optimize_nmf(df)
    self.nmf = NMF(n_components=self.optimum_topics, alpha=self.nmf_alpha,
                   l1_ratio=self.nmf_l1_ratio,
                   random_state=self.random_state).fit(self.tfidf_matrix)
    self.W_matrix = self.nmf.transform(self.tfidf_matrix)
    sums = self.W_matrix.sum(axis=1)
    self.W_pct = self.W_matrix / sums[:, None]
    self.labels = self.W_pct >= 0.20
    print "Reconstruction Error: {}".format(self.nmf.reconstruction_err_)
def plot_topic(self, topic_idx):
    '''
    Function to plot a wordcloud based on a topic

    INPUT:
        topic_idx: index of topic from NMF clustering
    '''
    title = raw_input('Enter a title for this plot: ')
    num_reviews = self.labels[:, topic_idx].sum()
    word_freq = self.topic_word_frequency(topic_idx)
    wc = WordCloud(width=2000, height=1000, max_words=150,
                   background_color='white')
    wc.fit_words(word_freq)
    fig = plt.figure(figsize=(16, 8))
    ax = fig.add_subplot(111)
    ax.set_title('Topic {}: {}\nNumber of Reviews in Topic: {}'.format(
        topic_idx, title, num_reviews), fontsize=24)
    ax.axis('off')
    ax.imshow(wc)
    name = 'topic_' + str(topic_idx) + '.png'
    if self.pro_or_con == 'pro':
        img_path = os.path.join('images', 'positive')
    else:
        img_path = os.path.join('images', 'negative')
    plt.savefig(os.path.join(img_path, name))
    plt.show()
def visualize_topics(self, df):
    '''
    Function to cycle through all topics and print summary and plot cloud

    INPUT:
        df: pandas DataFrame (source for NMF text)
    '''
    for i in range(self.optimum_topics):
        self.print_topic_summary(df, i)
        self.plot_topic(i)
        print ''
def build_model(self, baskets, use_probabilities=False):
    # print 'build V'
    self.__buildV(baskets, use_probabilities)
    # print 'density', 1.0 * len(self.V.nonzero()[0]) / (self.V.shape[0] * self.V.shape[1])
    sknmf = SKNMF(n_components=self.n_factor, init='random', solver='cd',
                  tol=self.tol, max_iter=self.max_iter, alpha=self.alpha,
                  l1_ratio=self.l1_ratio, beta=self.beta)

    self.W = sknmf.fit_transform(self.V)
    self.H = sknmf.components_
    self.R = np.dot(self.W, self.H)

    self.__state = 'built'
    return self
def apply(self, X, k=2, init_W=None, init_H=None):
    """
    Apply NMF to the specified document-term matrix X.
    """
    self.W = None
    self.H = None
    random_seed = np.random.randint(1, 100000)
    if not (init_W is None or init_H is None):
        model = decomposition.NMF(init="custom", n_components=k,
                                  max_iter=self.max_iters,
                                  random_state=random_seed)
        self.W = model.fit_transform(X, W=init_W, H=init_H)
    else:
        model = decomposition.NMF(init=self.init_strategy, n_components=k,
                                  max_iter=self.max_iters,
                                  random_state=random_seed)
        self.W = model.fit_transform(X)
    self.H = model.components_
def rank_terms(self, topic_index, top=-1):
    """
    Return the top ranked terms for the specified topic, generated
    during the last NMF run.
    """
    if self.H is None:
        raise ValueError("No results for previous run available")
    # NB: reverse
    top_indices = np.argsort(self.H[topic_index, :])[::-1]
    # truncate if necessary
    if top < 1 or top > len(top_indices):
        return top_indices
    return top_indices[0:top]
def plot_nmf(data, analyse=True, n_components=2):
    """Perform NMF and plot overview of the results"""
    if analyse:
        nmf = sd.NMF(n_components=n_components, init='nndsvdar',
                     random_state=0, solver='cd')
        Y = nmf.fit_transform(data)
    else:
        Y = data
        nmf = None

    if n_components is None:
        n_components = 3

    if n_components == 1:
        plt.subplot(1, 3, 1)
        plt.plot(Y)
    elif n_components == 2:
        plt.subplot(1, 3, 1)
        plt.scatter(Y[:, 0], Y[:, 1], c=range(len(Y[:, 0])),
                    cmap=plt.cm.Spectral)
    else:
        ax = plt.gcf().add_subplot(1, 3, 1, projection='3d')
        ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], c=range(len(Y[:, 0])),
                   cmap=plt.cm.Spectral)
    plt.title("nmf")

    if nmf is not None:
        feat = nmf.components_

        plt.subplot(1, 3, 2)
        plt.imshow(feat, interpolation='none', aspect='auto', cmap='viridis')
        plt.colorbar(pad=0.01, fraction=0.01)
        plt.title('features')

        plt.subplot(1, 3, 3)
        plt.imshow(Y, interpolation='none', aspect='auto', cmap='viridis')
        plt.colorbar(pad=0.01, fraction=0.01)
        plt.title('amplitudes')

    plt.tight_layout()
def finetune(Y, cin, nIter=5):
    """Fine tuning of components within greedyROI using rank-1 NMF
    """
    for iter in range(nIter):
        a = np.maximum(np.dot(Y, cin), 0)
        a = a / np.sqrt(np.sum(a**2))
        c = np.sum(Y * a[..., np.newaxis], tuple(np.arange(Y.ndim - 1)))

    return a, c

#%%
def NMF_feature_extraction(text_lst, n_samples, n_features, n_topics, n_top_words):
    print "Extracting tf-idf features for NMF..."
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(text_lst)

    print "Fitting the NMF model with tf-idf features, " \
          "n_samples=%d and n_features=%d..." % (n_samples, n_features)
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print "\nTopics in NMF model:"
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)
    print "*************end NMF****************"
def NMF_results(data, n_comps=None):
    nmf = NMF(n_components=n_comps)
    model = nmf.fit(data)
    out_data = {'model': model,
                'reconstruction error': nmf.reconstruction_err_}
    return 'NMF', out_data
def nmf(X, n_components=None):
    '''
    Non Negative Matrix Factorization. Outputs the weights (W) matrix
    and the components.
    '''
    model = NMF(n_components)
    W = model.fit_transform(X)
    components = model.components_
    return W, components
def nmf(X, n_components=None):
    model = NMF(n_components)
    W = model.fit_transform(X)
    components = model.components_
    return W, components
def build_analyzer(self):
    analyzer = super(TfidfVectorizer, self).build_analyzer()
    return lambda doc: (no_plural_stemmer(w) for w in analyzer(doc))


# We use a few heuristics to filter out useless terms early on: the posts
# are stripped of headers, footers and quoted replies, and common English
# words, words occurring in only one document or in at least 95% of the
# documents are removed.

# Use tf-idf features for NMF.
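The extract above ends at the "# Use tf-idf features for NMF." comment, so the vectorization step it announces is not included. A hedged sketch of what such a step typically looks like; the variable name data_samples and the parameter values are illustrative assumptions, not code from the original project.

# Illustrative assumption: data_samples is a list of raw text documents.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.95,           # drop terms in >95% of documents
                                   min_df=2,              # drop terms in only one document
                                   max_features=1000,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(data_samples)      # sparse document-term matrix fed to NMF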
def nmf_accuracy():
    tdm = pickle.load(open(DATASET_PATH + "BOW.p", "rb"))
    true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]

    print("I'm NNMF-ing!")
    NNMF = NMF(max_iter=50, n_components=100)
    tdm_reshaped = NNMF.fit_transform(tdm)

    print("I'm clustering!")
    cluster_kmeans(tdm_reshaped, true_labels)
def build_nmf(X, k=5):
    mod = NMF(n_components=k)
    W = mod.fit_transform(X)
    H = mod.components_
    return W, H
def build_nmf_all(X, k=5):
    scaler = MinMaxScaler()
    X_sca = scaler.fit_transform(X)
    nmfModel = NMF(n_components=k)
    W = nmfModel.fit_transform(X_sca)
    H = nmfModel.components_
    print 'NMF done!'
    # plot_heatmap(H.T, k=k)
    labelsNMF = W.argmax(axis=1)
    return W, H, labelsNMF, nmfModel
def nmf_test(df):
    X = df.drop(['Year', 'zipcode'], axis=1).values
    scaler = MinMaxScaler()
    X_sca = scaler.fit_transform(X)
    scores = []
    for k in xrange(2, 11):
        model = NMF(n_components=k)
        W = model.fit_transform(X_sca)
        labels = W.argmax(axis=1)
        score = silhouette_score(X_sca, labels)
        scores.append(score)
    plt.plot(xrange(2, 11), scores, 'b*-')
    plt.show()
def svd(train, test, dims=6, it=15, file_name='tf_idf', path='data/'):
    svd = NMF(random_state=1123, n_components=dims)
    svd.fit(train)
    # print svd.transform(train).shape
    pd.to_pickle(svd.transform(train),
                 path + 'train_NMF_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test),
                 path + 'test_NMF_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'

# In[16]:
def check_transformer_data_not_an_array(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    this_X = NotAnArray(X)
    this_y = NotAnArray(np.asarray(y))
    _check_transformer(name, Transformer, this_X, this_y)
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_testing_parameters(classifier)
        set_random_state(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
def _fit_local(self, data):
    from sklearn.decomposition import NMF
    nmf = NMF(n_components=self.k, tol=self.tol,
              max_iter=self.max_iter, random_state=self.seed)
    w = nmf.fit_transform(data)
    return w, nmf.components_
def optimize_nmf(self, df):
    '''
    Function to optimize the number of topics used in NMF clustering.

    INPUT:
        df: pandas DataFrame containing 'lemmatized_text' column for TF-IDF
    '''
    self.fit_tfidf(df)
    if not self.optimum_topics:
        avg_cosine_sim = []
        pbar = ProgressBar()
        for i in pbar(self.num_topics):
            cosine_sim = []
            self.nmf = NMF(n_components=i, alpha=self.nmf_alpha,
                           l1_ratio=self.nmf_l1_ratio,
                           random_state=self.random_state).fit(self.tfidf_matrix)
            err = self.nmf.reconstruction_err_
            self.H_matrix = self.nmf.components_
            if i == 1:
                avg_cosine_sim.append(1)
            else:
                idx_arr = np.arange(i)
                for combo in combinations(idx_arr, 2):
                    vect_1 = self.H_matrix[:, int(combo[0])].reshape(-1, 1)
                    vect_2 = self.H_matrix[:, int(combo[1])].reshape(-1, 1)
                    sim = cosine_similarity(vect_1, vect_2)
                    cosine_sim.append(sim)
                avg_cosine_sim.append(np.mean(cosine_sim))
            self.reconstruction_err_array.append(err)

        fig = plt.figure(figsize=(16, 8))
        ax_1 = fig.add_subplot(211)
        ax_1.plot(self.num_topics, self.reconstruction_err_array)
        ax_1.set_title("Reconstruction Error vs Number of Topics")
        ax_1.set_xlabel("Number of Topics")
        ax_1.set_ylabel("Reconstruction Error")
        ax_2 = fig.add_subplot(212)
        ax_2.plot(self.num_topics, avg_cosine_sim)
        ax_2.set_title("Avg Cosine Similarity Between Topics")
        ax_2.set_xlabel("Number of Topics")
        ax_2.set_ylabel("Avg Cosine Similarity")
        plt.tight_layout()
        if self.pro_or_con == 'pro':
            img_path = os.path.join('images', 'positive')
        else:
            img_path = os.path.join('images', 'negative')
        plt.savefig(os.path.join(img_path, "nmf_metrics.png"))
        plt.show()
        self.optimum_topics = int(raw_input("Desired topics from graph: "))
def main():
    r_pre = "[your file path]/all_purpose"
    f_path = "[your file path]/all_purpose_export.txt"
    p1 = r_pre + "\.csv\t\d+\t(.*?)(\t\d+){6}"
    p2 = "(.*?)O\s*\t(.*?)"
    extracted_combo_dct = {}
    stemmed_extracted_combo_dct = {}
    extracted_combo_lst = []
    stemmed_extracted_combo_lst = []
    n_top_words = 3
    n_topics = 20
    n_features = 50

    f = open(f_path)
    for l in f:
        r1 = re.search(p1, l)
        m1 = ' '.join(r1.group(1).split('\t'))
        r2 = re.search(p2, l)
        if r2 == None:
            print l
            break
        # used to add missing " O"
        m2 = ' '.join([e for e in l.split(r2.group(1))[1].split('O')[1].split('\t')
                       if e != ' ']).split('\n')[0]
        extracted_combo_dct.setdefault(m1, 0)
        stemmed_extracted_combo_dct.setdefault(m2, 0)
        extracted_combo_dct[m1] += 1
        stemmed_extracted_combo_dct[m2] += 1
        extracted_combo_lst.append(m1)
        stemmed_extracted_combo_lst.append(m2)

    sort_dct_by_value(extracted_combo_dct)
    sort_dct_by_value(stemmed_extracted_combo_dct)

    n_samples = len(extracted_combo_lst)
    n_stemmed_samples = len(stemmed_extracted_combo_lst)

    # using NMF feature extraction
    NMF_feature_extraction(extracted_combo_lst, n_samples, n_features,
                           n_topics, n_top_words)
    NMF_feature_extraction(stemmed_extracted_combo_lst, n_stemmed_samples,
                           n_features, n_topics, n_top_words)

    # using LDA feature extraction
    LDA_feature_extraction(extracted_combo_lst, n_samples, n_features,
                           n_topics, n_top_words)
    LDA_feature_extraction(stemmed_extracted_combo_lst, n_stemmed_samples,
                           n_features, n_topics, n_top_words)
def evaluate(self, matrix):
    """
    Args:
        matrix (2d array): this is the matrix of documents and tokens
            where the number of topics needs to be determined, this has
            worked with compressed sparse row matrices before
    Returns:
        topic_count (int): this is the number of topics that IPNMF was
            able to pick up heuristically
    """
    if self.noise_pct == 'auto':
        self._pareto_corpus_content(matrix, .8)
    if self.step == 'auto':
        self._determine_auto_step_size(matrix)
    if self.pnmf_verbose:
        print('initializing evaluation...')
    self.corpus_count = matrix.shape[0]
    self.rich_content = int(self.corpus_count * (1 - self.noise_pct))
    self.noise_content = self.corpus_count - self.rich_content
    topic_array = np.arange(self.start,
                            self.max_steps * self.step + self.start,
                            self.step)
    for topic_count in topic_array:
        if self.pnmf_verbose:
            print('extracting {} topics...'.format(topic_count))
        self.topic_count = topic_count
        nmf = NMF(n_components=self.topic_count, init=self.init,
                  solver=self.solver, tol=self.tol, max_iter=self.max_iter,
                  random_state=self.random_state, alpha=self.alpha,
                  l1_ratio=self.l1_ratio, verbose=self.verbose,
                  shuffle=self.shuffle, nls_max_iter=self.nls_max_iter,
                  sparseness=self.sparseness, beta=self.beta, eta=self.eta)
        W = nmf.fit_transform(matrix)
        self.nmf = nmf
        self.topic_labels = np.apply_along_axis(func1d=np.argmax, axis=1, arr=W)
        self.topic_summary = Counter(self.topic_labels)
        if self._stopping_condition():
            if self.pnmf_verbose:
                print('heuristic topic count is {}'
                      .format(self.topic_count - self.step))
            self.topic_count = self.topic_count - self.step
            nmf = NMF(n_components=self.topic_count, init=self.init,
                      solver=self.solver, tol=self.tol, max_iter=self.max_iter,
                      random_state=self.random_state, alpha=self.alpha,
                      l1_ratio=self.l1_ratio, verbose=self.verbose,
                      shuffle=self.shuffle, nls_max_iter=self.nls_max_iter,
                      sparseness=self.sparseness, beta=self.beta, eta=self.eta)
            nmf.fit(matrix)
            self.nmf = self.previous_nmf
            return self.topic_count
        else:
            self.previous_nmf = nmf
def set_testing_parameters(estimator):
    # set parameters to speed up some estimators and
    # avoid deprecated behaviour
    params = estimator.get_params()
    if ("n_iter" in params
            and estimator.__class__.__name__ != "TSNE"):
        estimator.set_params(n_iter=5)

    if "max_iter" in params:
        warnings.simplefilter("ignore", ConvergenceWarning)
        if estimator.max_iter is not None:
            estimator.set_params(max_iter=min(5, estimator.max_iter))
        # LinearSVR
        if estimator.__class__.__name__ == 'LinearSVR':
            estimator.set_params(max_iter=20)
        # NMF
        if estimator.__class__.__name__ == 'NMF':
            estimator.set_params(max_iter=100)
        # MLP
        if estimator.__class__.__name__ in ['MLPClassifier', 'MLPRegressor']:
            estimator.set_params(max_iter=100)
    if "n_resampling" in params:
        # randomized lasso
        estimator.set_params(n_resampling=5)
    if "n_estimators" in params:
        # especially gradient boosting with default 100
        estimator.set_params(n_estimators=min(5, estimator.n_estimators))
    if "max_trials" in params:
        # RANSAC
        estimator.set_params(max_trials=10)
    if "n_init" in params:
        # K-Means
        estimator.set_params(n_init=2)
    if "decision_function_shape" in params:
        # SVC
        estimator.set_params(decision_function_shape='ovo')

    if estimator.__class__.__name__ == "SelectFdr":
        # be tolerant of noisy datasets (not actually speed)
        estimator.set_params(alpha=.5)

    if estimator.__class__.__name__ == "TheilSenRegressor":
        estimator.max_subpopulation = 100

    if isinstance(estimator, BaseRandomProjection):
        # Due to the jl lemma and often very few samples, the number
        # of components of the random matrix projection will be probably
        # greater than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        estimator.set_params(n_components=1)

    if isinstance(estimator, SelectKBest):
        # SelectKBest has a default of k=10
        # which is more feature than we have in most case.
        estimator.set_params(k=1)

    if isinstance(estimator, NMF):
        if not isinstance(estimator, ProjectedGradientNMF):
            estimator.set_params(solver='cd')