The following 9 code examples, extracted from open-source Python projects, illustrate how to use sklearn.decomposition.RandomizedPCA().
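Before the project examples, here is a minimal, self-contained sketch of the basic fit/transform pattern they all build on; the toy data and variable names are illustrative only. Note that RandomizedPCA was deprecated and later removed from scikit-learn, so this sketch assumes an older version; in current releases the equivalent is PCA(svd_solver='randomized').

import numpy as np
from sklearn.decomposition import RandomizedPCA  # removed in newer scikit-learn; see note above

X = np.random.rand(100, 50)            # toy data: 100 samples, 50 features
pca = RandomizedPCA(n_components=2)    # keep the 2 leading principal components
X_2d = pca.fit_transform(X)            # project the samples onto those components

print(X_2d.shape)                      # (100, 2)
print(pca.explained_variance_ratio_)   # fraction of variance captured per component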
# The snippet assumes the usual imports from its project context:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import RandomizedPCA

def visualize_data(data, labels):
    pca = RandomizedPCA(n_components=2)
    reshaped = pca.fit_transform(data)
    df = pd.DataFrame({'x': reshaped[:, 0],
                       'y': reshaped[:, 1],
                       'label': np.where(labels == 1, 'Positive',
                                         np.where(labels == 0, 'Neutral', 'Negative'))})
    colors = ['yellow', 'red', 'blue']
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        plt.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    plt.legend()
    plt.title('PCA Decomposition of Image Data')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.show()
    # plt.savefig('PCA_plot.png')
def plot_clusters_pca(responsibilities, color_groups):
    from sklearn.decomposition import RandomizedPCA
    from itertools import count  # count(0) below comes from itertools
    import pylab as pl
    from random import shuffle

    # colors_dict is defined elsewhere in the original project
    colors = list(colors_dict.values())
    shuffle(colors)

    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(responsibilities)
    # print >>stderr, pca.explained_variance_ratio_

    pl.figure()
    pl.scatter(X[:, 0], X[:, 1], c="grey", label="unknown")
    for c, sub, i in zip(colors, color_groups, count(0)):
        pl.scatter(X[sub, 0], X[sub, 1], c=c, label=str(i))

    pl.legend()
    pl.title("PCA responsibility matrix")
    pl.show()
def fixed_batch_size_comparison(data):
    all_features = [i.astype(int) for i in np.linspace(data.shape[1] // 10,
                                                       data.shape[1], num=5)]
    batch_size = 1000
    # Compare runtimes and error for fixed batch size
    all_times = defaultdict(list)
    all_errors = defaultdict(list)
    for n_components in all_features:
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('ipca', ipca),
                                                               ('rpca', rpca)]}

        for k in sorted(results_dict.keys()):
            all_times[k].append(results_dict[k]['time'])
            all_errors[k].append(results_dict[k]['error'])

    plot_feature_times(all_times, batch_size, all_features, data)
    plot_feature_errors(all_errors, batch_size, all_features, data)
def pca_analysis(self):
    if not self._use_pca:
        return

    print("done.\n + Using PCA to analyze the data...", end="")
    stdout.flush()

    cols = self._get_columns()
    (X_train, _) = self._train_data

    if not self._pca:
        self._pca = RandomizedPCA(n_components=self._pca_max_n,
                                  whiten=True, random_state=42)
        self._pca.fit(X_train)

    # NOTE: plot code stolen from sklearn example: http://bit.ly/1X8ZsUw
    fig = plt.figure(self._fig_count, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(self._pca.explained_variance_ratio_)
    fig.suptitle('RandomizedPCA Analysis')
    plt.axis('tight')
    plt.xlabel('Component')
    plt.ylabel('Explained Variance Ratio')
    plt.show()
    self._fig_count += 1

    # Reset the PCA object, since we will need to set the exact number
    # of components we want to use if and when we use it again
    self._pca = None

# Train a classifier pipeline that may or may not use PCA or other
# feature selection methods
def plot_feature_times(all_times, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_times['pca'], label="PCA")
    plot_results(all_components, all_times['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
    plt.legend(loc="upper left")
    plt.suptitle("Algorithm runtime vs. n_components\n \
                 LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Time (seconds)")
def plot_feature_errors(all_errors, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_errors['pca'], label="PCA")
    plot_results(all_components, all_errors['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm error vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Mean absolute error")
def plot_batch_times(all_times, n_features, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_times['pca'], label="PCA")
    plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
    plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n \
                 LFW, size %i x %i" % (n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Time (seconds)")
def variable_batch_size_comparison(data):
    batch_sizes = [i.astype(int) for i in np.linspace(data.shape[0] // 10,
                                                      data.shape[0], num=10)]

    for n_components in [i.astype(int) for i in
                         np.linspace(data.shape[1] // 10, data.shape[1], num=4)]:
        all_times = defaultdict(list)
        all_errors = defaultdict(list)
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('rpca', rpca)]}

        # Create flat baselines to compare the variation over batch size
        all_times['pca'].extend([results_dict['pca']['time']] * len(batch_sizes))
        all_errors['pca'].extend([results_dict['pca']['error']] * len(batch_sizes))
        all_times['rpca'].extend([results_dict['rpca']['time']] * len(batch_sizes))
        all_errors['rpca'].extend([results_dict['rpca']['error']] * len(batch_sizes))

        for batch_size in batch_sizes:
            ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
            results_dict = {k: benchmark(est, data) for k, est in [('ipca', ipca)]}
            all_times['ipca'].append(results_dict['ipca']['time'])
            all_errors['ipca'].append(results_dict['ipca']['error'])

        plot_batch_times(all_times, n_components, batch_sizes, data)
        # RandomizedPCA error is always worse (approx 100x) than other PCA tests
        plot_batch_errors(all_errors, n_components, batch_sizes, data)