The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.decomposition.PCA.
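Before the extracted examples, here is a minimal usage sketch of the fit/transform pattern most of these snippets rely on. The random matrix X and the choice of n_components=2 are illustrative assumptions, not taken from any project below.

# Minimal sketch: fit PCA on a small random matrix and project it onto
# its first two principal components.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).rand(100, 5)   # hypothetical data: 100 samples, 5 features
pca = PCA(n_components=2)                   # keep the 2 directions of largest variance
X_reduced = pca.fit_transform(X)            # learn components and project in one step

print(X_reduced.shape)                      # (100, 2)
print(pca.explained_variance_ratio_)        # fraction of variance captured by each component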
def PCA(data, num_components=None):
    # mean center the data
    data -= data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix
    # use 'eigh' rather than 'eig' since R is symmetric,
    # the performance gain is substantial
    V, E = np.linalg.eigh(R)
    # sort eigenvalues in decreasing order
    idx = np.argsort(V)[::-1]
    E = E[:, idx]  # sort eigenvectors according to same index
    V = V[idx]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    E = E[:, :num_components]
    # carry out the transformation on the data using eigenvectors
    # and return the re-scaled data, eigenvalues, and eigenvectors
    return np.dot(E.T, data.T).T, V, E
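For comparison, a rough sklearn equivalent of the hand-rolled eigendecomposition above (the function name sklearn_pca is ours; component signs and the length of the returned variance vector may differ from the version above, since sklearn uses an SVD internally and only keeps n_components values):

def sklearn_pca(data, num_components=None):
    # Same projection via sklearn; explained_variance_ plays the role of the
    # sorted eigenvalues V (truncated to num_components), components_.T the role of E.
    from sklearn.decomposition import PCA as SkPCA
    model = SkPCA(n_components=num_components)
    projected = model.fit_transform(data)
    return projected, model.explained_variance_, model.components_.T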
def apply_lens(df, lens='pca', dist='euclidean', n_dim=2, **kwargs):
    """
    input: N x F dataframe of observations
    output: N x n_dim image of input data under lens function
    """
    if n_dim != 2:
        raise ValueError('error: image of data set must be two-dimensional')
    if dist not in ['euclidean', 'correlation']:
        raise ValueError('error: only euclidean and correlation distance metrics are supported')
    if lens == 'pca' and dist != 'euclidean':
        raise ValueError('error: PCA requires the use of euclidean distance metric')

    if lens == 'pca':
        df_lens = pd.DataFrame(decomposition.PCA(n_components=n_dim, **kwargs).fit_transform(df), df.index)
    elif lens == 'mds':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        df_lens = pd.DataFrame(manifold.MDS(n_components=n_dim, **kwargs).fit_transform(D), df.index)
    elif lens == 'neighbor':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        df_lens = pd.DataFrame(manifold.SpectralEmbedding(n_components=n_dim, **kwargs).fit_transform(D), df.index)
    else:
        raise ValueError('error: only PCA, MDS, neighborhood lenses are supported')

    return df_lens
def fit(self, scenario: ASlibScenario, config: Configuration):
    '''
        fit pca object to ASlib scenario data

        Arguments
        ---------
        scenario: data.aslib_scenario.ASlibScenario
            ASlib Scenario with all data in pandas
        config: ConfigSpace.Configuration
            configuration
    '''
    if config.get("pca"):
        self.pca = PCA(n_components=config.get("pca_n_components"))
        self.pca.fit(scenario.feature_data.values)
        self.active = True
def transform(self, scenario: ASlibScenario):
    '''
        transform ASLib scenario data

        Arguments
        ---------
        scenario: data.aslib_scenario.ASlibScenario
            ASlib Scenario with all data in pandas

        Returns
        -------
        data.aslib_scenario.ASlibScenario
    '''
    if self.pca:
        self.logger.debug("Applying PCA")
        values = self.pca.transform(np.array(scenario.feature_data.values))

        scenario.feature_data = pd.DataFrame(
            data=values,
            index=scenario.feature_data.index,
            columns=["f%d" % (i) for i in range(values.shape[1])])

    return scenario
def get_arguments():
    parser = argparse.ArgumentParser(description='Molecular autoencoder network')
    parser.add_argument('data', type=str, help='HDF5 file to read input data from.')
    parser.add_argument('model', type=str, help='Trained Keras model to use.')
    parser.add_argument('--save_h5', type=str, help='Name of a file to write HDF5 output to.')
    parser.add_argument('--latent_dim', type=int, metavar='N', default=LATENT_DIM,
                        help='Dimensionality of the latent representation.')
    parser.add_argument('--tsne_lr', metavar='LR', type=float, default=TSNE_LEARNING_RATE,
                        help='Learning rate to use for t-SNE.')
    parser.add_argument('--tsne_components', metavar='N', type=int, default=TSNE_COMPONENTS,
                        help='Number of components to use for t-SNE.')
    parser.add_argument('--tsne_perplexity', metavar='P', type=float, default=TSNE_PERPLEXITY)
    parser.add_argument('--tsne_iterations', metavar='N', type=int, default=TSNE_ITERATIONS)
    parser.add_argument('--visualize', dest='visualize', action='store_true',
                        help='Fit manifold and render a visualization. If this flag is not used, the sampled data' +
                             ' will simply be returned with no further processing.')
    parser.add_argument('--skip-pca', dest='use_pca', action='store_false',
                        help='Skip PCA preprocessing of data to feed into t-SNE.')
    parser.add_argument('--pca_components', metavar='N', type=int, default=PCA_COMPONENTS,
                        help='Number of components to use for PCA.')
    parser.set_defaults(use_pca=True)
    parser.set_defaults(visualize=False)
    return parser.parse_args()
def transform_PCA(X_train, X_all, n_components=100):
    """Given some training data, performs a Principal Components Analysis (PCA)
    and modifies the rest of the data based on the learned PCA.

    Args:
        X_train: A matrix containing training data
        X_all: A matrix containing all the data
        n_components: The number of components to use in the PCA

    Returns:
        The transformed data and the PCA object
    """
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    print("Total explained variance:", sum(pca.explained_variance_ratio_))
    return pca.transform(X_all), pca
def test_pca_inverse():
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    dX = da.from_array(X, chunks=(n // 2, p))

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = dd.PCA(n_components=2, svd_solver='full').fit(dX)
    Y = pca.transform(dX)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)

    # same as above with whitening (approximate reconstruction)
    for solver in solver_list:
        pca = dd.PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(dX)
        Y = pca.transform(dX)
        Y_inverse = pca.inverse_transform(Y)
        assert_eq(dX, Y_inverse, atol=1e-3)
def test_randomized_pca_inverse():
    # Test that randomized PCA is inversible on dense data
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    dX = da.from_array(X, chunks=(n, p))

    # same check that we can find the original data from the transformed signal
    # (since the data is almost of rank n_components)
    pca = dd.PCA(n_components=2, svd_solver='randomized', random_state=0).fit(dX)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = dd.PCA(n_components=2, whiten=True, svd_solver='randomized',
                 random_state=0).fit(dX)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X - Y_inverse) / np.abs(X).mean()).max()
    assert_less(relative_max_delta, 1e-5)
def test_infer_dim_1():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) +
         np.array([1, 0, 7, 4, 6]))
    X = da.from_array(X, chunks=(n, p))
    pca = dd.PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = []
    for k in range(p):
        ll.append(_assess_dimension_(spect, k, n, p))
    ll = np.array(ll)
    assert_greater(ll[1], ll.max() - .01 * n)
def test_infer_dim_by_explained_variance():
    X = da.from_array(iris.data, chunks=iris.data.shape)
    pca = dd.PCA(n_components=0.95, svd_solver='full')
    pca.fit(X)
    assert_equal(pca.n_components, 0.95)
    assert_equal(pca.n_components_, 2)

    pca = dd.PCA(n_components=0.01, svd_solver='full')
    pca.fit(X)
    assert_equal(pca.n_components, 0.01)
    assert_equal(pca.n_components_, 1)

    # Can't do this
    rng = np.random.RandomState(0)
    # more features than samples
    X = rng.rand(5, 20)
    pca = dd.PCA(n_components=.5, svd_solver='full').fit(X)
    assert_equal(pca.n_components, 0.5)
    assert_equal(pca.n_components_, 2)
def test_pca_score2():
    # Test that probabilistic PCA correctly separated different datasets
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    dX = da.from_array(X, chunks=(n // 2, p))
    for solver in solver_list:
        pca = dd.PCA(n_components=2, svd_solver=solver)
        pca.fit(dX)
        ll1 = pca.score(dX)
        ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
        assert_greater(ll1, ll2)

        # Test that it gives different scores if whiten=True
        pca = dd.PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(dX)
        ll2 = pca.score(dX)
        assert_true(ll1 > ll2)
def test_pca_score3():
    # Check that probabilistic PCA selects the right model
    n, p = 200, 3
    rng = np.random.RandomState(0)
    Xl = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    Xt = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    ll = np.zeros(p)
    dXl = da.from_array(Xl, chunks=(n // 2, p))
    dXt = da.from_array(Xt, chunks=(n // 2, p))
    for k in range(p):
        pca = dd.PCA(n_components=k, svd_solver='full')
        pca.fit(dXl)
        ll[k] = pca.score(dXt)

    assert_true(ll.argmax() == 1)
def test_pca_zero_noise_variance_edge_cases():
    # ensure that noise_variance_ is 0 in edge cases
    # when n_components == min(n_samples, n_features)
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    dX = da.from_array(X, chunks=(n, p))
    # arpack raises ValueError for n_components == min(n_samples,
    # n_features)
    svd_solvers = ['full', 'randomized']
    for svd_solver in svd_solvers:
        pca = dd.PCA(svd_solver=svd_solver, n_components=p)
        pca.fit(dX)
        assert pca.noise_variance_ == 0

        # Can't handle short and wide
        # pca.fit(X.T)
        # assert pca.noise_variance_ == 0


# removed test_svd_solver_auto, as we don't do that.
# removed test_deprecation_randomized_pca, as we don't do that
def test_pca_float_dtype_preservation(svd_solver):
    # Ensure that PCA does not upscale the dtype when input is float32
    X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64)
    X_32 = X_64.astype(np.float32)
    dX_64 = da.from_array(X_64, chunks=X_64.shape)
    dX_32 = da.from_array(X_32, chunks=X_64.shape)

    pca_64 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_64)
    pca_32 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float32
    assert pca_64.transform(dX_64).dtype == np.float64
    assert pca_32.transform(dX_32).dtype == np.float32

    assert_array_almost_equal(pca_64.components_, pca_32.components_,
                              decimal=5)
def test_pca_int_dtype_upcast_to_double(svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
    X_i64 = X_i64.astype(np.int64)
    X_i32 = X_i64.astype(np.int32)
    dX_i64 = da.from_array(X_i64, chunks=X_i64.shape)
    dX_i32 = da.from_array(X_i32, chunks=X_i32.shape)

    pca_64 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_i64)
    pca_32 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(dX_i64).dtype == np.float64
    assert pca_32.transform(dX_i32).dtype == np.float64

    assert_array_almost_equal(pca_64.components_, pca_32.components_,
                              decimal=5)
def dim_red(self, col, method, params, kws, load_fit=None):
    if method == 'PCA':
        self.do_dim_red = PCA(*params, **kws)
    if method == 'FastICA':
        self.do_dim_red = FastICA(*params, **kws)
    if method == 't-SNE':
        self.do_dim_red = TSNE(*params, **kws)
    if method == 'LLE':
        self.do_dim_red = LocallyLinearEmbedding(*params, **kws)
    if method == 'JADE-ICA':
        self.do_dim_red = JADE(*params, **kws)
    # TODO: Add ICA-JADE here
    if load_fit:
        self.do_dim_red = load_fit
    else:
        if method != 't-SNE':
            self.do_dim_red.fit(self.df[col])
            dim_red_result = self.do_dim_red.transform(self.df[col])
        else:
            dim_red_result = self.do_dim_red.fit_transform(self.df[col])

    for i in list(range(1, dim_red_result.shape[1] + 1)):
        # will need to revisit this for other methods that don't use
        # n_components to make sure column names still make sense
        self.df[(method, str(i))] = dim_red_result[:, i - 1]

    return self.do_dim_red
def __init__(self):
    # The values below can be changed to tweak the recommender algorithm
    self.n_most_similar = 10
    self.n_features_title = 25
    self.n_features_content = 50
    # commented by shwenag
    # self.n_features_tags = 25
    self.n_features_total = 30

    # Do not change the values below
    self.df = None
    self.df_article_vectors = None
    self.similarity_score_dict = {}
    self.X = None
    self.X_title = None
    self.X_content = None
    self.type = 'Cos'
    self.is_pca = 'PCA'
    self.is_weight = 'TF-IDF'
def reduce_dimensionality(self, X, n_features):
    """
    Apply PCA or SVD to reduce dimension to n_features.
    :param X:
    :param n_features:
    :return:
    """
    # Initialize reduction method: PCA or SVD
    if self.is_pca == 'PCA':
        reducer = PCA(n_components=n_features)
        # reducer = PCA(n_components=n_features)
    if self.is_pca == 'SVD':
        reducer = TruncatedSVD(n_components=n_features)
    # Fit and transform data to n_features-dimensional space
    reducer.fit(X)
    self.X = reducer.transform(X)
    logging.debug("Reduced number of features to {0}".format(n_features))
    logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
    return X
def auto_plane_gnd(self):
    # TODO: the condition is not needed
    indices_split_gnd = self.indices_split[np.nonzero(self.gnd_con)]
    data_gnd = self.ptCLoudData[np.nonzero(self.gnd_con)]
    plane_split = self.plane_split
    for i in range(1, len(plane_split)):
        plane = plane_split[i]
        vec = (plane['nx'], plane['ny'], plane['nz'])
        angle_diff = base_process.angle_between(vec, (0, 0, -1))
        if angle_diff < 0.3 or True:
            sel = data_gnd[np.nonzero(indices_split_gnd == i)]
            if len(sel) < 3:
                continue
            pca = PCA(n_components=2)
            data_transfer = pca.fit_transform(sel['a_position'])
            tri = np.array(triangle.delaunay(data_transfer), dtype=np.uint32)
            self.gnd_plane.append({'data': sel, 'tri': tri})
def random_proj_dr(X_train, X_test, rd, X_val=None, rev=None, **kwargs):
    """
    Perform Gaussian Random Projection on X_train, then transform X_train,
    X_test (and X_val). Return transformed data in original space if rev is
    True; otherwise, return transformed data in PCA space.
    """
    train_len = X_train.shape[0]
    test_len = X_test.shape[0]

    # Generating random matrix for projection
    grp = GRP(n_components=rd, random_state=10)
    X_train_dr = grp.fit_transform(X_train)
    X_test_dr = grp.transform(X_test)

    X_train_dr = X_train_dr.reshape((train_len, 1, rd))
    X_test_dr = X_test_dr.reshape((test_len, 1, rd))

    return X_train_dr, X_test_dr, grp
#------------------------------------------------------------------------------#
def PCA_analysis(data, mode, cell_stages=None):
    """Principal Component Analysis.
    """
    assert mode in {'pca', 'pca2'}

    mean_shifter = StandardScaler(with_std=False)

    if mode == 'pca':
        pca = PCA(min(data.shape))
        projected_data = pca.fit_transform(mean_shifter.fit_transform(data))
        components = pca.components_
    else:
        assert isinstance(cell_stages, np.ndarray)
        idx = np.where(cell_stages == np.max(cell_stages))[0]

        pca = PCA(min(idx.size, data.shape[1]))
        pca.fit(mean_shifter.fit_transform(data[idx]))
        components = pca.components_
        projected_data = np.dot(data, components.T)

    return components, projected_data
def test_cv_splitting_ea_search_mldataset(trans, estimator):
    '''Test that an Elm Pipeline using MLDataset X feature matrix input
    can be split into cross validation train / test samples as in
    scikit-learn for numpy.  (As of PR 192 this test is failing)'''
    pipe, X, y = new_pipeline(trans, estimator, flatten_first=False)
    X = X.to_features()
    param_distribution = param_distribution_sgd.copy()
    if 'PCA' in trans._cls.__name__:
        param_distribution.update(param_distribution_pca)
    else:
        param_distribution.update(param_distribution_poly)
    ea = EaSearchCV(estimator=pipe,
                    param_distributions=param_distribution,
                    score_weights=[1],
                    model_selection=model_selection,
                    refit=True,
                    cv=3,
                    error_score='raise',
                    return_train_score=True,
                    scheduler=None,
                    n_jobs=-1,
                    cache_cv=True)
    ea.fit(X, y)
    assert isinstance(ea.predict(X), MLDataset)
def trainingPCA(features, n_components=256, whiten=True, pca_model_name=None):
    print 'loaded features! {}'.format(features.shape)
    print np.sqrt(sum(features[0, :]**2))
    # print 'Features l2 normalization'
    # features = normalize(features)
    # print np.sqrt(sum(features[0, :]**2))
    print 'Feature PCA-whitenning'
    pca_model = PCA(n_components=n_components, whiten=whiten)
    features = pca_model.fit_transform(features)
    print np.sqrt(sum(features[0, :]**2))
    print 'Features l2 normalization'
    features = normalize(features)
    print np.sqrt(sum(features[0, :]**2))
    if pca_model_name is not None:
        print 'saving model...'
        check_path_file(pca_model_name, create_if_missing=True)
        save_obj(pca_model, pca_model_name)
        print 'done! {}'.format(pca_model_name)
    return pca_model
def get_mnist(n_train=5000, n_test=500, pca=True, d=50, dtype=np.float32):
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    n, row, col = X_train.shape
    channel = 1

    X_train = X_train.reshape(-1, channel * row * col)
    X_test = X_test.reshape(-1, channel * row * col)
    X_train = X_train.astype(dtype)
    X_test = X_test.astype(dtype)
    X_train /= 255
    X_test /= 255

    X_train = X_train[:n_train] - X_train[:n_train].mean(axis=0)
    X_test = X_test[:n_test] - X_test[:n_test].mean(axis=0)

    if pca:
        pcfit = PCA(n_components=d)
        X_train = pcfit.fit_transform(X_train)
        X_test = pcfit.transform(X_test)

    y_train = y_train[:n_train]
    y_test = y_test[:n_test]
    return X_train, y_train, X_test, y_test
def dimension_compression():
    X_t_c = make_matrix()
    token_list = []
    contexts_list = []
    for token, contexts in sorted(X_t_c.items()):
        token_list.append(token)
        contexts_list.append(contexts)

    pca = PCA(n_components=300)
    DictoVec = DictVectorizer(sparse=True)
    sparse = DictoVec.fit_transform(contexts_list)
    print(sparse.shape)
    vec_list = pca.fit_transform(sparse.todense())

    word_vec = {}
    for token, vec in zip(token_list, vec_list):
        word_vec[token] = vec
    return word_vec
def extractOptPCADimensionality(trainSamples):
    assert len(trainSamples) >= 2
    nFeatures = len(trainSamples[0])
    pca = PCA(n_components=nFeatures)
    pca.fit(trainSamples)
    percentageExplainedVariance = []
    nComponents = []
    pSum = 0.0
    nComponents99 = -1
    print "nFeatures Here = ", nFeatures
    for i in xrange(0, len(pca.explained_variance_ratio_)):
        pSum = pSum + pca.explained_variance_ratio_[i]
        if pSum >= 0.99 and nComponents99 == -1:
            nComponents99 = i + 1
        nComponents.append(i + 1)
        percentageExplainedVariance.append(pSum * 100.0)
    assert nComponents99 != -1
    return nComponents99
def pca(features, var_expl=0.98, n_com=None):
    """
    Returns features with dimension reduced by PCA method
    implemented in scikit learn (sklearn) module.
    Number of components is matched based on explained variance
    ratio - *var_expl* or can be set by hand as *n_com*.
    """
    pca = decomposition.PCA()
    pca.fit(features)
    if not n_com:
        for p in xrange(1, features.shape[0]):
            if np.sum(pca.explained_variance_ratio_[:p]) >= var_expl:
                n_com = p
                break
    pca = decomposition.PCA(n_components=n_com)
    pca.fit(features)
    features = pca.transform(features)
    return features
def plot_scatter(values, cls):
    # Create a color-map with a different color for each class.
    import matplotlib.cm as cm
    cmap = cm.rainbow(np.linspace(0.0, 1.0, num_classes))

    # Create an index with a random permutation to make a better plot.
    idx = np.random.permutation(len(values))

    # Get the color for each sample.
    colors = cmap[cls[idx]]

    # Extract the x- and y-values.
    x = values[idx, 0]
    y = values[idx, 1]

    # Plot it.
    plt.scatter(x, y, color=colors, alpha=0.5)
    plt.show()


# Plot the transfer-values that have been reduced using PCA. There are 3
# different colors for the different classes in the Knifey-Spoony data-set.
# The colors have very large overlap. This may be because PCA cannot
# properly separate the transfer-values.

# In[41]:
def plot_scatter(values, cls):
    # Create a color-map with a different color for each class.
    import matplotlib.cm as cm
    cmap = cm.rainbow(np.linspace(0.0, 1.0, num_classes))

    # Get the color for each sample.
    colors = cmap[cls]

    # Extract the x- and y-values.
    x = values[:, 0]
    y = values[:, 1]

    # Plot it.
    plt.scatter(x, y, color=colors)
    plt.show()


# Plot the transfer-values that have been reduced using PCA. There are 10
# different colors for the different classes in the CIFAR-10 data-set. The
# colors are grouped together but with very large overlap. This may be
# because PCA cannot properly separate the transfer-values.

# In[35]:
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

    pca = PCA()
    reduced = pca.fit_transform(Xtrain)
    plt.scatter(reduced[:, 0], reduced[:, 1], s=100, c=Ytrain, alpha=0.5)
    plt.show()

    plt.plot(pca.explained_variance_ratio_)
    plt.show()

    # cumulative variance
    # choose k = number of dimensions that gives us 95-99% variance
    cumulative = []
    last = 0
    for v in pca.explained_variance_ratio_:
        cumulative.append(last + v)
        last = cumulative[-1]
    plt.plot(cumulative)
    plt.show()
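An aside on the cumulative-variance loop above: the running sum can also be written with np.cumsum, and sklearn performs this selection itself when n_components is given as a float (as in the n_components=0.95 test earlier in this list). A minimal sketch, assuming the same fitted pca object and an illustrative 95% threshold:

# Terser version of the running sum above
cumulative = np.cumsum(pca.explained_variance_ratio_)
k = int(np.searchsorted(cumulative, 0.95)) + 1   # smallest k reaching 95% variance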
def calculate_residual_correlation_matrix(returns):
    # find the market return constraining on the selected companies (first PCA)
    # regress each stock on that and find correlation of residuals
    returns_matrix = returns.as_matrix().transpose()
    covar_matrix = np.cov(returns_matrix)
    pca = decomposition.PCA(n_components=1)
    pca.fit(covar_matrix)
    X = pca.transform(covar_matrix)
    regr = linear_model.LinearRegression()
    dim = covar_matrix.shape[1]
    res = np.zeros(shape=(dim, dim))
    for x in range(0, dim):
        regr = linear_model.LinearRegression()
        regr = regr.fit(X, covar_matrix[:, x])
        res[:, x] = covar_matrix[:, x] - regr.predict(X)
    res_corr = np.corrcoef(res)
    return pd.DataFrame(res_corr, index=returns.columns, columns=returns.columns)
def get_embeddings_node2vec(g, d, p, q, path_node2vec):
    my_pca = PCA(n_components=d)
    my_edgelist = igraph.Graph.get_edgelist(g)
    # create temp dir to write and read from
    tmpdir = tempfile.mkdtemp()
    # create subdirs for node2vec
    os.makedirs(tmpdir + '/graph/')
    os.makedirs(tmpdir + '/emb/')
    # write edge list
    with open(tmpdir + '/graph/input.edgelist', 'w') as my_file:
        my_file.write('\n'.join('%s %s' % x for x in my_edgelist))
    # execute node2vec
    call([path_node2vec + 'node2vec -i:' + tmpdir + '/graph/input.edgelist' +
          ' -o:' + tmpdir + '/emb/output.emb' + ' -p:' + p + ' -q:' + q], shell=True)
    # read back results
    emb = np.loadtxt(tmpdir + '/emb/output.emb', skiprows=1)
    # sort by increasing node index and keep only coordinates
    emb = emb[emb[:, 0].argsort(), 1:]
    # remove temp dir
    shutil.rmtree(tmpdir)
    # perform PCA on the embeddings to align and reduce dim
    pca_output = my_pca.fit_transform(emb)
    return pca_output
def plot_pca(act, pc_x=1, pc_y=2, labels=None, filename=None):
    act = act.T
    pca = PCA()
    pca.fit(act)
    eig_vec = pca.transform(act)
    data = pd.DataFrame(eig_vec)
    data.columns = ['PC%d' % i for i in range(data.shape[1])]
    data['act_mean'] = act.mean(axis=1)
    pc_x = 'PC%d' % pc_x
    pc_y = 'PC%d' % pc_y

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(data[pc_x], data[pc_y], c=data['act_mean'], cmap='RdBu_r')
    ax.set_xlabel(pc_x)
    ax.set_ylabel(pc_y)
    fig.colorbar(scatter)
    if labels:
        for i, row in data.iterrows():
            ax.annotate('%d' % labels[i], xy=(row[pc_x], row[pc_y]), fontsize=10)
    if filename:
        fig.savefig(filename)
    plt.close()
def inverse_transform(self, X):
    """
    Inverse the PCA rotation step. The cube stays whitened.
    Useful if you want to denoise noisy bands before the rotation.

    X: `numpy array`
        A transformed (MNF) cube (m x n x p).

    Return: `numpy array`
        An inverted cube (m x n x p).
    """
    h, w, numBands = X.shape
    X = np.reshape(X, (w * h, numBands))
    M = self.transform.inverse_transform(X)
    M = np.reshape(M, (h, w, numBands))
    return M
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)

    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)

    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def take_action(self, parsed_args):
    if not parsed_args.input.exists():
        raise IOError("failed to open data set at {}".format(parsed_args.input))
    data_set = load(parsed_args.input)

    features = np.reshape(data_set.features, [data_set.num_instances, -1])
    if features.shape[1] > 50:
        self.log.info("applying PCA")
        pca = PCA(n_components=200)
        pca.fit(features)
        features = pca.transform(features)

    self.log.info("computing T-SNE embedding")
    tsne = TSNE(perplexity=parsed_args.perplexity,
                learning_rate=parsed_args.learning_rate,
                verbose=self.app_args.verbose_level)
    embedding = tsne.fit_transform(features)

    self.log.info("plotting embedding")
    self.plot_with_labels(data_set, embedding)
def gen_instance(self, max_length, dimension, test_mode=True, seed=0):
    if seed != 0:
        np.random.seed(seed)
    # Randomly generate (max_length) cities with (dimension) coordinates in [0,100]
    seq = np.random.randint(100, size=(max_length, dimension))
    # Principal Component Analysis to center & rotate coordinates
    pca = PCA(n_components=dimension)
    sequence = pca.fit_transform(seq)
    # Scale to [0,1[
    input_ = sequence / 100
    if test_mode == True:
        return input_, seq
    else:
        return input_

# Generate random batch for training procedure
def pca(data, n_components=None):
    """
    Param:
        n_components: use 'mle' to guess
    """
    newdata = data.copy()
    model = PCA(n_components=n_components)
    if len(newdata.shape) != 2:
        newdata = newdata.reshape((newdata.shape[0], -1))
    model.fit(newdata)
    ret = model.explained_variance_ratio_
    return ret
def YYT(Y, n_components=None, DEBUG=False):
    """
    Param:
        Y: n x d
        n_components: use 'mle' to guess
    Returns:
        P: d x d'
        QT: d' x d
    """
    newdata = Y.copy()
    model = PCA(n_components=n_components)
    if len(newdata.shape) != 2:
        newdata = newdata.reshape((newdata.shape[0], -1))
    # TODO center data
    model.fit(newdata)
    if DEBUG:
        from IPython import embed; embed()
    return model.components_.T, model.components_


# def GSVD(Z, Y):
#     NotImplementedError
#     return [U, V, X, C, S]
def _apply_transformers_to_X(self, X, transformers):
    for transformer in transformers:
        if not hasattr(transformer, 'fit') or not callable(transformer.fit):
            continue
        X_stacked = np.vstack(X)
        transformer.fit(X_stacked)
        X = [transformer.transform(X_curr) for X_curr in X]

    # The transforms may change the number of features (e.g. PCA). If this is
    # the case, calculate generic feature_names and feature_lengths
    n_features = X[0].shape[1]
    if n_features != self.n_features:
        feature_names = ['transformed_feature_%d' % idx for idx in xrange(n_features)]
        feature_lengths = [1 for idx in xrange(n_features)]
    else:
        feature_names = self.feature_names
        feature_lengths = self.feature_lengths

    return X, feature_names, feature_lengths
def precipitation_pca(window='1H', areas=None, n_compos=3):
    """PCA of rolling precipitation sums."""
    from sklearn.decomposition import PCA
    df = precipitation_rolling_sum(window, areas)
    pca = PCA(n_components=n_compos)
    pca.fit(df)
    if areas is None:
        header = "overall_precipitation_pca_" + window + '_'
    else:
        header = "area_precipitation_pca_" + window + '_'
    pca_df = pd.DataFrame(pca.transform(df), index=df.index,
                          columns=[header + str(i) for i in range(n_compos)])
    return pca_df
def wind_pca(window='1H', areas=None, n_compos=3):
    """PCA"""
    from sklearn.decomposition import PCA
    df = wind_rolling_sum(window=window, areas=areas)
    pca = PCA(n_components=n_compos)
    pca.fit(df)
    if areas is None:
        header = "overall_wind_pca_" + window + '_'
    else:
        header = "area_wind_pca_" + window + '_'
    pca_df = pd.DataFrame(pca.transform(df), index=df.index,
                          columns=[header + str(i) for i in range(n_compos)])
    # pca_df.plot(figsize=(20, 12))
    # plt.show()
    return pca_df
def visualizeData(self, featureData='', fileName=''):
    if featureData == '':
        (label_vector, input_vector) = loadData(self.featureFile)
    else:
        (label_vector, input_vector) = loadData(featureData)

    pca = PCA(n_components=2)
    X_trans = pca.fit_transform(input_vector)

    plt.figure()
    colorArray = []
    for n in range(0, len(input_vector)):
        colorArray.append(COLOR_MAP[label_vector[n]])
    plt.scatter(X_trans[:, 0], X_trans[:, 1], c=colorArray)

    if fileName == '':
        plt.show()
    else:
        plt.savefig(fileName)
        print "Plot saved as " + fileName + ".png"
def __visualizePredictedDataset__(self, data, testIndices, predictedLabels, expectedLabels):
    pca = PCA(n_components=2)
    X_trans = pca.fit_transform(data)

    plt.figure()
    colorArray = []
    print("----- Wrong predictions -----")
    for n in range(0, len(data)):
        if n in testIndices:
            if predictedLabels[testIndices.index(n)] != expectedLabels[testIndices.index(n)]:
                colorArray.append('red')
                print("Expected", expectedLabels[testIndices.index(n)],
                      "Predicted", predictedLabels[testIndices.index(n)])
            else:
                colorArray.append('olivedrab')
        else:
            colorArray.append('white')
    plt.scatter(X_trans[:, 0], X_trans[:, 1], c=colorArray)
    plt.show()
def segment_ch4(self, segment_fn, segment_transform):
    segs = np.zeros_like(self.ch4_images, dtype=np.float32)
    ims = np.copy(self.ch4_images).reshape(-1, 1, self.ch4_images.shape[1], self.ch4_images.shape[2])
    ims = segment_transform(ims)
    for i in xrange(self.ch4_images.shape[0]):
        segs[i:i+1] = segment_fn(ims[i:i+1])
        _, sb = cv2.threshold(np.copy(segs[i]) * 255, 127, 255, cv2.THRESH_BINARY)
        patches = get_patches(sb)
        sb = np.zeros_like(sb, dtype=np.uint8)
        if len(patches) > 0:
            patch = next(p for p in patches if p.shape[0] == max(p1.shape[0] for p1 in patches))
            for x, y in patch:
                sb[x, y] = 255
            pca = decomposition.PCA(n_components=2)
            pca.fit(patch)
            mean, major = pca.mean_, pca.components_[0]
            middle = sb.shape[0] / 2
            sb = cv2.warpAffine(sb, np.float32([[1, 0, middle - mean[1]], [0, 1, middle - mean[0]]]), sb.shape)
            sb = scipy.misc.imrotate(sb, np.arctan2(*major) * 180 / np.pi)
            segs[i:i+1] = sb
    self.ch4seg = segs
    self.ch4counts = np.array([np.count_nonzero(s) for s in self.ch4seg]).reshape(1, -1)
def get_principal_components(flattened_images, n_components='default', default_pct_variance_explained=.96):
    """ Standardizes the data and gets the principal components.
    """
    for img in flattened_images:
        assert isinstance(img, np.ndarray)
        assert img.shape == flattened_images[-1].shape
        assert len(img.shape) == 1
    X = np.asarray(flattened_images)
    X -= X.mean(axis=0)  # Center all of the data around the origin.
    X /= np.std(X, axis=0)

    pca = PCA()
    pca.fit(X)

    if n_components == 'default':
        sorted_eig_vals = pca.explained_variance_
        cum_pct_variance = (sorted_eig_vals / sorted_eig_vals.sum()).cumsum()
        idxs = np.argwhere(cum_pct_variance >= default_pct_variance_explained)
        n_components = np.squeeze(idxs)[0]

    V = pca.components_[:n_components + 1, :].T
    principal_components = np.matmul(X, V)
    return principal_components
def load_wemb(params, vocab):
    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb_orig = wemb.values()[0].shape[0]

    W = 0.01 * np.random.randn(prm.n_words, dim_emb_orig).astype(config.floatX)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos, :] = wemb[word]

    if prm.dim_emb < dim_emb_orig:
        pca = PCA(n_components=prm.dim_emb, copy=False, whiten=True)
        W = pca.fit_transform(W)

    params['W'] = W

    return params