The following 19 code examples, collected from open-source Python projects, illustrate how to use sklearn.decomposition.IncrementalPCA().
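Before the project-specific examples, here is a minimal, self-contained sketch (illustrative only, not taken from any of the projects below) of the typical IncrementalPCA workflow: construct the estimator with n_components and batch_size, feed the data to partial_fit one chunk at a time, then transform.

import numpy as np
from sklearn.decomposition import IncrementalPCA

# Toy data: 1000 samples, 50 features.
X = np.random.RandomState(0).randn(1000, 50)
ipca = IncrementalPCA(n_components=10, batch_size=200)

# Feed the data chunk by chunk; only one batch needs to fit in memory.
for start in range(0, X.shape[0], 200):
    ipca.partial_fit(X[start:start + 200])

X_reduced = ipca.transform(X)               # shape (1000, 10)
print(ipca.explained_variance_ratio_.sum())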
def trainPCA(dstFolder, BATCH_SIZE=50):
    ipca = IncrementalPCA(n_components=10, batch_size=BATCH_SIZE)

    for batch_isomaps_path in glob.glob(os.path.join(dstFolder, "*_isomaps_centered.p")):
        batch_num = int(os.path.basename(batch_isomaps_path).split("_")[0])
        isomaps = pickle.load(open(batch_isomaps_path, "rb"))

        # remove the illumination and count channel
        isomaps_just_color = isomaps[:, :, :, 1:3]

        # flatten to data
        X = isomaps_just_color.reshape(isomaps_just_color.shape[0], -1)  # ,isomaps_just_color.shape[3])
        ipca.partial_fit(X)

    with open(os.path.join(dstFolder, "pca.p"), "wb") as file:
        pickle.dump(ipca, file)
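trainPCA only persists the fitted model to pca.p; a later stage would presumably reload it and project new isomaps after applying the same channel slice and flattening. The helper below is a hypothetical sketch of that step (the function name and the new_isomaps argument are assumptions, not part of the original project).

import os
import pickle

def projectIsomaps(dstFolder, new_isomaps):
    """Hypothetical follow-up to trainPCA above: reload the pickled
    IncrementalPCA and project a new batch of centered isomaps."""
    with open(os.path.join(dstFolder, "pca.p"), "rb") as file:
        ipca = pickle.load(file)
    # Same preprocessing as in trainPCA: drop the illumination/count channel, then flatten.
    just_color = new_isomaps[:, :, :, 1:3]
    X_new = just_color.reshape(just_color.shape[0], -1)
    return ipca.transform(X_new)   # one 10-dimensional code per isomap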
def fixed_batch_size_comparison(data):
    all_features = [i.astype(int) for i in np.linspace(data.shape[1] // 10,
                                                       data.shape[1], num=5)]
    batch_size = 1000
    # Compare runtimes and error for fixed batch size
    all_times = defaultdict(list)
    all_errors = defaultdict(list)
    for n_components in all_features:
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('ipca', ipca),
                                                               ('rpca', rpca)]}

        for k in sorted(results_dict.keys()):
            all_times[k].append(results_dict[k]['time'])
            all_errors[k].append(results_dict[k]['error'])

    plot_feature_times(all_times, batch_size, all_features, data)
    plot_feature_errors(all_errors, batch_size, all_features, data)
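This benchmark relies on a benchmark(est, data) helper that is not reproduced here. A plausible sketch, assuming it times the fit and reports a mean absolute reconstruction error (the original in scikit-learn's incremental-PCA benchmark script may differ in detail), is:

from time import time
import numpy as np

def benchmark(est, data):
    """Rough stand-in for the benchmark helper assumed above: time the fit
    and measure the round-trip reconstruction error of the estimator."""
    t0 = time()
    est.fit(data)
    fit_time = time() - t0
    # Mean absolute error after projecting to the components and back.
    reconstruction = est.inverse_transform(est.transform(data))
    error = np.mean(np.abs(data - reconstruction))
    return {'time': fit_time, 'error': error}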
def test_incremental_pca():
    # Incremental PCA on dense arrays.
    X = iris.data
    batch_size = X.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    pca = PCA(n_components=2)
    pca.fit_transform(X)

    X_transformed = ipca.fit_transform(X)

    np.testing.assert_equal(X_transformed.shape, (X.shape[0], 2))
    assert_almost_equal(ipca.explained_variance_ratio_.sum(),
                        pca.explained_variance_ratio_.sum(), 1)

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]))
def test_incremental_pca_check_projection():
    # Test that the projection of data is correct.
    rng = np.random.RandomState(1999)
    n, p = 100, 3
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    # Get the reconstruction of the generated data X
    # Note that Xt has the same "components" as X, just separated
    # This is what we want to ensure is recreated correctly
    Yt = IncrementalPCA(n_components=2).fit(X).transform(Xt)

    # Normalize
    Yt /= np.sqrt((Yt ** 2).sum())

    # Make sure that the first element of Yt is ~1, this means
    # the reconstruction worked as expected
    assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
def test_incremental_pca_set_params():
    # Test that partial_fit validates n_components changed via set_params.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20
    X = rng.randn(n_samples, n_features)
    X2 = rng.randn(n_samples, n_features)
    X3 = rng.randn(n_samples, n_features)
    ipca = IncrementalPCA(n_components=20)
    ipca.fit(X)
    # Decreasing number of components
    ipca.set_params(n_components=10)
    assert_raises(ValueError, ipca.partial_fit, X2)
    # Increasing number of components
    ipca.set_params(n_components=15)
    assert_raises(ValueError, ipca.partial_fit, X3)
    # Returning to original setting
    ipca.set_params(n_components=20)
    ipca.partial_fit(X)
def test_incremental_pca_partial_fit():
    # Test that fit and partial_fit get equivalent results.
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    batch_size = 10
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X)
    pipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    # Add one to make sure endpoint is included
    batch_itr = np.arange(0, n + 1, batch_size)
    for i, j in zip(batch_itr[:-1], batch_itr[1:]):
        pipca.partial_fit(X[i:j, :])
    assert_almost_equal(ipca.components_, pipca.components_, decimal=3)
def test_whitening():
    # Test that PCA and IncrementalPCA transforms match to sign flip.
    X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
                                      effective_rank=2, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc,
                              batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
def compute_reduced_representation(self):

    if not self.compute_reduced:
        return None

    config_score = simple_config.load()["score"]
    f_db = os.path.join(
        config_score["output_data_directory"],
        config_score["document_scores"]["f_db"]
    )

    h5 = touch_h5(f_db)
    g = h5[self.method]

    keys = g.keys()
    V = np.vstack([g[x]["V"][:] for x in keys])
    sizes = [g[x]["_ref"].shape[0] for x in keys]

    nc = self.reduced_n_components
    clf = IncrementalPCA(n_components=nc)

    msg = "Performing PCA on {}, ({})->({})"
    print(msg.format(self.method, V.shape[1], nc))

    VX = clf.fit_transform(V)
    EVR = clf.explained_variance_ratio_
    COMPONENTS = clf.components_

    for key, size in zip(keys, sizes):
        # Take slices equal to the size
        vx, VX = VX[:size, :], VX[size:, :]
        evr, EVR = EVR[:size], EVR[size:]
        com, COMPONENTS = COMPONENTS[:size, :], COMPONENTS[size:, :]

        g[key].create_dataset("VX", data=vx, **self.h5py_args)
        g[key].create_dataset("VX_explained_variance_ratio_", data=evr)
        g[key].create_dataset("VX_components_", data=com)

    h5.close()
def plot_feature_times(all_times, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_times['pca'], label="PCA")
    plot_results(all_components, all_times['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
    plt.legend(loc="upper left")
    plt.suptitle("Algorithm runtime vs. n_components\n \
                 LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Time (seconds)")
def plot_feature_errors(all_errors, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_errors['pca'], label="PCA")
    plot_results(all_components, all_errors['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm error vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Mean absolute error")
def plot_batch_times(all_times, n_features, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_times['pca'], label="PCA")
    plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
    plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n \
                 LFW, size %i x %i" % (
                 n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Time (seconds)")
def plot_batch_errors(all_errors, n_features, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_errors['pca'], label="PCA")
    plot_results(all_batch_sizes, all_errors['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm error vs. batch_size for n_components %i\n \
                 LFW, size %i x %i" % (
                 n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Mean absolute error")
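The four plotting helpers above all delegate to a plot_results function that is not shown in these examples. A minimal stand-in, assuming it simply draws one labelled curve per estimator, could look like this:

import matplotlib.pyplot as plt

def plot_results(x, y, label):
    """Sketch of the plot_results helper assumed by the functions above:
    draw one labelled, marked line per estimator (assumption, not the original)."""
    plt.plot(x, y, label=label, marker='o')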
def test_incremental_pca_inverse():
    # Test that the projection of data can be inverted.
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
    Y = ipca.transform(X)
    Y_inverse = ipca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)
def test_incremental_pca_validation():
    # Test that n_components is >=1 and <= n_features.
    X = [[0, 1], [1, 0]]
    for n_components in [-1, 0, .99, 3]:
        assert_raises(ValueError, IncrementalPCA(n_components,
                                                 batch_size=10).fit, X)
def test_incremental_pca_batch_signs():
    # Test that components_ sign is stable over batch sizes.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(10, 20)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for i, j in zip(all_components[:-1], all_components[1:]):
        assert_almost_equal(np.sign(i), np.sign(j), decimal=6)
def test_incremental_pca_batch_values():
    # Test that components_ values are stable over batch sizes.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(20, 40, 3)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for i, j in zip(all_components[:-1], all_components[1:]):
        assert_almost_equal(i, j, decimal=1)
def test_incremental_pca_against_pca_iris():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    X = iris.data

    Y_pca = PCA(n_components=2).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def test_incremental_pca_against_pca_random_data():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features)

    Y_pca = PCA(n_components=3).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def new(stop_words=[], decomposition='SVD', n_components=5):
    # Prepare vectoriser engines
    idf = TfidfVectorizer(
        ngram_range=(1, 3),  # unigram, bigram, & trigram
        stop_words=stop_words
    )
    # Prepare normaliser
    norm = Normalizer(norm='max')
    print(colored('Texthasher model created', 'yellow'))

    # Prepare dimensionality reduction
    if decomposition and n_components:
        if decomposition == 'LDA':  # Results in a non-negative matrix
            reducer = LatentDirichletAllocation(  # TFIDF --> topic term
                n_topics=n_components,
                max_doc_update_iter=20,
                max_iter=8
            )
            return [idf, norm, reducer]
        elif decomposition == 'SVD':
            reducer = TruncatedSVD(  # Best for small datasets,
                n_components,        # a nightmare for large ones
                n_iter=8)            # Damn slow
            return [idf, norm, reducer]
        elif decomposition == 'PCA':
            # When using IPCA, remember to always keep:
            # n_samples > n_components > batch_size
            # reducer = IncrementalPCA(n_components)

            # Sparse -> Dense greedily consumes a large amount of memory
            # to_dense = SparseToDense()
            # return [idf, norm, to_dense, reducer]

            reducer = SparsePCA(n_components)
            return [idf, norm, reducer]
        return [idf, norm]
    else:
        return [idf, norm]
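new() returns a list of transformers rather than a ready-made pipeline, so the caller presumably chains them itself. A plausible way to do that is with sklearn's make_pipeline; the toy documents and this wiring are illustrative assumptions, not part of the original project.

from sklearn.pipeline import make_pipeline

# Chain the transformer list returned by new(): TF-IDF -> max-norm -> TruncatedSVD.
steps = new(stop_words=['the', 'of'], decomposition='SVD', n_components=5)
model = make_pipeline(*steps)

docs = [
    "incremental pca processes data in mini batches",
    "truncated svd works directly on sparse tf idf matrices",
    "latent dirichlet allocation yields a non negative topic matrix",
    "normalizer rescales each document vector to unit max norm",
    "sparse pca gives sparse components for dense input",
    "pipelines chain the vectoriser the normaliser and the reducer",
]
topic_vectors = model.fit_transform(docs)
print(topic_vectors.shape)   # (6, 5): one 5-dimensional code per document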