Python sklearn.decomposition module: PCA example source code

The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.decomposition.PCA.
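Before the project snippets, here is a minimal, self-contained sketch of the canonical fit/transform workflow (toy data; the variable names are illustrative and not taken from any project below):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                           # 100 samples, 5 features

pca = PCA(n_components=2)                       # keep the two strongest components
X_reduced = pca.fit_transform(X)                # fit on X, then project it
print(X_reduced.shape)                          # (100, 2)
print(pca.explained_variance_ratio_.sum())      # fraction of variance retained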

Project: blender-scripting | Author: njanakiev
import numpy as np

def PCA(data, num_components=None):
    # mean-center the data (modifies `data` in place)
    data -= data.mean(axis=0)
    # covariance matrix of the features
    R = np.cov(data, rowvar=False)
    # eigenvectors & eigenvalues of the covariance matrix;
    # use 'eigh' rather than 'eig' since R is symmetric,
    # and the performance gain is substantial
    V, E = np.linalg.eigh(R)
    # sort eigenvalues in decreasing order
    idx = np.argsort(V)[::-1]
    V = V[idx]
    # sort eigenvectors by the same index
    E = E[:, idx]
    # keep the first num_components eigenvectors
    # (the desired dimensionality of the projected data)
    E = E[:, :num_components]
    # project the centered data and return the projected data,
    # eigenvalues, and eigenvectors
    return np.dot(E.T, data.T).T, V, E
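As a sanity check (not part of the project above), the manual eigendecomposition should span the same subspace as scikit-learn's PCA, so the two projections agree up to a sign flip per component; a hypothetical comparison on toy data:

import numpy as np
from sklearn.decomposition import PCA as SklearnPCA

rng = np.random.RandomState(0)
data = rng.randn(200, 4)

# pass a copy: the function above centers its argument in place
manual_scores, _, _ = PCA(data.copy(), num_components=2)
sklearn_scores = SklearnPCA(n_components=2).fit_transform(data)
# each column may differ by sign, so compare absolute values
assert np.allclose(np.abs(manual_scores), np.abs(sklearn_scores), atol=1e-6)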
Project: sakmapper | Author: szairis
import pandas as pd
from sklearn import decomposition, manifold, metrics

def apply_lens(df, lens='pca', dist='euclidean', n_dim=2, **kwargs):
    """
    input: N x F dataframe of observations
    output: N x n_dim image of the input data under the lens function
    """
    if n_dim != 2:
        raise ValueError('image of data set must be two-dimensional')
    if dist not in ['euclidean', 'correlation']:
        raise ValueError('only euclidean and correlation distance metrics are supported')
    if lens == 'pca' and dist != 'euclidean':
        raise ValueError('PCA requires the euclidean distance metric')

    if lens == 'pca':
        df_lens = pd.DataFrame(decomposition.PCA(n_components=n_dim, **kwargs).fit_transform(df), index=df.index)
    elif lens == 'mds':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        df_lens = pd.DataFrame(manifold.MDS(n_components=n_dim, **kwargs).fit_transform(D), index=df.index)
    elif lens == 'neighbor':
        D = metrics.pairwise.pairwise_distances(df, metric=dist)
        df_lens = pd.DataFrame(manifold.SpectralEmbedding(n_components=n_dim, **kwargs).fit_transform(D), index=df.index)
    else:
        raise ValueError('only PCA, MDS, and neighborhood lenses are supported')

    return df_lens
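A hypothetical usage sketch for apply_lens (df is any N x F pandas DataFrame; toy data, illustrative only):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.RandomState(0).randn(100, 6))
image = apply_lens(df, lens='pca')     # N x 2 PCA lens
print(image.shape)                     # (100, 2)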
Project: AutoFolio | Author: mlindauer
def fit(self, scenario: ASlibScenario, config: Configuration):
        '''
            fit pca object to ASlib scenario data

            Arguments
            ---------
            scenario: data.aslib_scenario.ASlibScenario
                ASlib Scenario with all data in pandas
            config: ConfigSpace.Configuration
                configuration
        '''

        if config.get("pca"):
            self.pca = PCA(n_components=config.get("pca_n_components"))
            self.pca.fit(scenario.feature_data.values)
            self.active = True
Project: AutoFolio | Author: mlindauer
def transform(self, scenario: ASlibScenario):
        '''
            transform ASLib scenario data

            Arguments
            ---------
            scenario: data.aslib_scenario.ASlibScenario
                ASlib Scenario with all data in pandas

            Returns
            -------
            data.aslib_scenario.ASlibScenario
        '''
        if self.pca:
            self.logger.debug("Applying PCA")
            values = self.pca.transform(
                np.array(scenario.feature_data.values))

            scenario.feature_data = pd.DataFrame(
                data=values, index=scenario.feature_data.index, columns=["f%d" % (i) for i in range(values.shape[1])])

        return scenario
Project: keras-molecules | Author: maxhodak
def get_arguments():
    parser = argparse.ArgumentParser(description='Molecular autoencoder network')
    parser.add_argument('data', type=str, help='HDF5 file to read input data from.')
    parser.add_argument('model', type=str, help='Trained Keras model to use.')
    parser.add_argument('--save_h5', type=str, help='Name of a file to write HDF5 output to.')
    parser.add_argument('--latent_dim', type=int, metavar='N', default=LATENT_DIM,
                        help='Dimensionality of the latent representation.')
    parser.add_argument('--tsne_lr', metavar='LR', type=float, default=TSNE_LEARNING_RATE,
                        help='Learning rate to use for t-SNE.')
    parser.add_argument('--tsne_components', metavar='N', type=int, default=TSNE_COMPONENTS,
                        help='Number of components to use for t-SNE.')
    parser.add_argument('--tsne_perplexity', metavar='P', type=float, default=TSNE_PERPLEXITY)
    parser.add_argument('--tsne_iterations', metavar='N', type=int, default=TSNE_ITERATIONS)
    parser.add_argument('--visualize', dest='visualize', action='store_true',
                        help='Fit manifold and render a visualization. If this flag is not used, the sampled data' +
                        ' will simply be returned with no further processing.')
    parser.add_argument('--skip-pca', dest='use_pca', action='store_false',
                        help='Skip PCA preprocessing of data to feed into t-SNE.')
    parser.add_argument('--pca_components', metavar='N', type=int, default=PCA_COMPONENTS,
                        help='Number of components to use for PCA.')
    parser.set_defaults(use_pca=True)
    parser.set_defaults(visualize=False)

    return parser.parse_args()
Project: MultimodalAutoencoder | Author: natashamjaques
def transform_PCA(X_train,X_all,n_components=100):
    """Given some training data, performs a Principle Components Analysis (PCA)
    and modifies the rest of the data based on the learned PCA.

    Args:
        X_train: A matrix containing training data
        X_all: A matrix containing all the data
        n_components: The number of components to use in the PCA

    Returns:
        The transformed data and the PCA object
    """
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    print("Total explained variance:", sum(pca.explained_variance_ratio_))

    return pca.transform(X_all), pca
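A hypothetical usage sketch, assuming the snippet's own PCA import from sklearn.decomposition is in scope: fit on the training rows only, then project the full matrix with the learned components.

import numpy as np

X_all = np.random.RandomState(0).randn(500, 300)
X_train = X_all[:400]                  # fit PCA on the training split only

X_all_reduced, pca = transform_PCA(X_train, X_all, n_components=100)
print(X_all_reduced.shape)             # (500, 100)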
Project: dask-ml | Author: dask
def test_pca_inverse():
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean
    dX = da.from_array(X, chunks=(n // 2, p))

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = dd.PCA(n_components=2, svd_solver='full').fit(dX)
    Y = pca.transform(dX)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)

    # same as above with whitening (approximate reconstruction)
    for solver in solver_list:
        pca = dd.PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(dX)
        Y = pca.transform(dX)
        Y_inverse = pca.inverse_transform(Y)
        assert_eq(dX, Y_inverse, atol=1e-3)
Project: dask-ml | Author: dask
def test_randomized_pca_inverse():
    # Test that randomized PCA is invertible on dense data
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean
    dX = da.from_array(X, chunks=(n, p))

    # same check that we can find the original data from the transformed signal
    # (since the data is almost of rank n_components)
    pca = dd.PCA(n_components=2, svd_solver='randomized',
                 random_state=0).fit(dX)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = dd.PCA(n_components=2, whiten=True, svd_solver='randomized',
                 random_state=0).fit(dX)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X - Y_inverse) / np.abs(X).mean()).max()
    assert_less(relative_max_delta, 1e-5)
Project: dask-ml | Author: dask
def test_infer_dim_1():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) +
         np.array([1, 0, 7, 4, 6]))
    X = da.from_array(X, chunks=(n, p))
    pca = dd.PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = []
    for k in range(p):
        ll.append(_assess_dimension_(spect, k, n, p))
    ll = np.array(ll)
    assert_greater(ll[1], ll.max() - .01 * n)
Project: dask-ml | Author: dask
def test_infer_dim_by_explained_variance():
    X = da.from_array(iris.data, chunks=iris.data.shape)
    pca = dd.PCA(n_components=0.95, svd_solver='full')
    pca.fit(X)
    assert_equal(pca.n_components, 0.95)
    assert_equal(pca.n_components_, 2)

    pca = dd.PCA(n_components=0.01, svd_solver='full')
    pca.fit(X)
    assert_equal(pca.n_components, 0.01)
    assert_equal(pca.n_components_, 1)

    # Can't do this
    rng = np.random.RandomState(0)
    # more features than samples
    X = rng.rand(5, 20)
    pca = dd.PCA(n_components=.5, svd_solver='full').fit(X)
    assert_equal(pca.n_components, 0.5)
    assert_equal(pca.n_components_, 2)
Project: dask-ml | Author: dask
def test_pca_score2():
    # Test that probabilistic PCA correctly separates different datasets
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    dX = da.from_array(X, chunks=(n // 2, p))
    for solver in solver_list:
        pca = dd.PCA(n_components=2, svd_solver=solver)
        pca.fit(dX)
        ll1 = pca.score(dX)
        ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
        assert_greater(ll1, ll2)

        # Test that it gives different scores if whiten=True
        pca = dd.PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(dX)
        ll2 = pca.score(dX)
        assert_true(ll1 > ll2)
Project: dask-ml | Author: dask
def test_pca_score3():
    # Check that probabilistic PCA selects the right model
    n, p = 200, 3
    rng = np.random.RandomState(0)
    Xl = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    Xt = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    ll = np.zeros(p)
    dXl = da.from_array(Xl, chunks=(n // 2, p))
    dXt = da.from_array(Xt, chunks=(n // 2, p))
    for k in range(p):
        pca = dd.PCA(n_components=k, svd_solver='full')
        pca.fit(dXl)
        ll[k] = pca.score(dXt)

    assert_true(ll.argmax() == 1)
Project: dask-ml | Author: dask
def test_pca_zero_noise_variance_edge_cases():
    # ensure that noise_variance_ is 0 in edge cases
    # when n_components == min(n_samples, n_features)
    n, p = 100, 3

    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    dX = da.from_array(X, chunks=(n, p))
    # arpack raises ValueError for n_components == min(n_samples,
    # n_features)
    svd_solvers = ['full', 'randomized']

    for svd_solver in svd_solvers:
        pca = dd.PCA(svd_solver=svd_solver, n_components=p)
        pca.fit(dX)
        assert pca.noise_variance_ == 0

        # Can't handle short and wide
        # pca.fit(X.T)
        # assert pca.noise_variance_ == 0


# removed test_svd_solver_auto, as we don't do that.
# removed test_deprecation_randomized_pca, as we don't do that
Project: dask-ml | Author: dask
def test_pca_float_dtype_preservation(svd_solver):
    # Ensure that PCA does not upscale the dtype when input is float32
    X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64)
    X_32 = X_64.astype(np.float32)

    dX_64 = da.from_array(X_64, chunks=X_64.shape)
    dX_32 = da.from_array(X_32, chunks=X_64.shape)

    pca_64 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_64)
    pca_32 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float32
    assert pca_64.transform(dX_64).dtype == np.float64
    assert pca_32.transform(dX_32).dtype == np.float32

    assert_array_almost_equal(pca_64.components_, pca_32.components_,
                              decimal=5)
Project: dask-ml | Author: dask
def test_pca_int_dtype_upcast_to_double(svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
    X_i64 = X_i64.astype(np.int64)
    X_i32 = X_i64.astype(np.int32)

    dX_i64 = da.from_array(X_i64, chunks=X_i64.shape)
    dX_i32 = da.from_array(X_i32, chunks=X_i32.shape)

    pca_64 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_i64)
    pca_32 = dd.PCA(n_components=3, svd_solver=svd_solver,
                    random_state=0).fit(dX_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(dX_i64).dtype == np.float64
    assert pca_32.transform(dX_i32).dtype == np.float64

    assert_array_almost_equal(pca_64.components_, pca_32.components_,
                              decimal=5)
Project: PySAT | Author: USGS-Astrogeology
def dim_red(self, col, method, params, kws, load_fit=None):
        if method == 'PCA':
            self.do_dim_red = PCA(*params, **kws)
        elif method == 'FastICA':
            self.do_dim_red = FastICA(*params, **kws)
        elif method == 't-SNE':
            self.do_dim_red = TSNE(*params, **kws)
        elif method == 'LLE':
            self.do_dim_red = LocallyLinearEmbedding(*params, **kws)
        elif method == 'JADE-ICA':
            self.do_dim_red = JADE(*params, **kws)
        if load_fit:
            # reuse a previously fitted reducer and apply it directly
            self.do_dim_red = load_fit
            dim_red_result = self.do_dim_red.transform(self.df[col])
        elif method != 't-SNE':
            self.do_dim_red.fit(self.df[col])
            dim_red_result = self.do_dim_red.transform(self.df[col])
        else:
            # t-SNE has no separate transform step
            dim_red_result = self.do_dim_red.fit_transform(self.df[col])

        # Will need to revisit the column naming for methods that don't use
        # n_components, to make sure the names still make sense.
        for i in range(1, dim_red_result.shape[1] + 1):
            self.df[(method, str(i))] = dim_red_result[:, i - 1]

        return self.do_dim_red
Project: newsrecommender | Author: Newsrecommender
def __init__(self):
        # The values below can be changed to tweak the recommender algorithm
        self.n_most_similar = 10
        self.n_features_title = 25
        self.n_features_content = 50
        #commented by shwenag
        #self.n_features_tags = 25
        self.n_features_total = 30

        # Do not change the values below
        self.df = None
        self.df_article_vectors = None
        self.similarity_score_dict = {}
        self.X = None
        self.X_title = None
        self.X_content = None
        self.type = 'Cos'
        self.is_pca = 'PCA'
        self.is_weight = 'TF-IDF'
Project: newsrecommender | Author: Newsrecommender
def reduce_dimensionality(self, X, n_features):
        """
        Apply PCA or SVD to reduce dimension to n_features.
        :param X: feature matrix
        :param n_features: target number of features
        :return: the reduced feature matrix
        """
        # Initialize reduction method: PCA or SVD
        if self.is_pca == 'PCA':
            reducer = PCA(n_components=n_features)
        elif self.is_pca == 'SVD':
            reducer = TruncatedSVD(n_components=n_features)

        # Fit and transform data to n_features-dimensional space
        reducer.fit(X)
        self.X = reducer.transform(X)
        logging.debug("Reduced number of features to {0}".format(n_features))
        logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
        return self.X
Project: StreetView_lite | Author: ydnaandy123
def auto_plane_gnd(self):
        # TODO: the angle check below is currently disabled ('or True')
        indices_split_gnd = self.indices_split[np.nonzero(self.gnd_con)]
        data_gnd = self.ptCLoudData[np.nonzero(self.gnd_con)]
        plane_split = self.plane_split

        for i in range(1, len(plane_split)):
            plane = plane_split[i]
            vec = (plane['nx'], plane['ny'], plane['nz'])
            angle_diff = base_process.angle_between(vec, (0, 0, -1))
            if angle_diff < 0.3 or True:  # always true; see the TODO above
                sel = data_gnd[np.nonzero(indices_split_gnd == i)]
                if len(sel) < 3:
                    continue
                pca = PCA(n_components=2)
                data_transfer = pca.fit_transform(sel['a_position'])
                tri = np.array(triangle.delaunay(data_transfer), dtype=np.uint32)
                self.gnd_plane.append({'data': sel, 'tri': tri})
Project: ml_defense | Author: arjunbhagoji
def random_proj_dr(X_train, X_test, rd, X_val=None, rev=None, **kwargs):
    """
    Perform Gaussian Random Projection on X_train, then transform X_train and
    X_test (and X_val). Return transformed data in the original space if rev
    is True; otherwise, return transformed data in the projected space.
    """

    train_len = len(X_train)
    test_len = len(X_test)

    # Generate the random matrix and project the data
    grp = GRP(n_components=rd, random_state=10)
    X_train_dr = grp.fit_transform(X_train)
    X_test_dr = grp.transform(X_test)

    X_train_dr = X_train_dr.reshape((train_len, 1, rd))
    X_test_dr = X_test_dr.reshape((test_len, 1, rd))

    return X_train_dr, X_test_dr, grp
#------------------------------------------------------------------------------#
Project: PySCUBA | Author: GGiecold
def PCA_analysis(data, mode, cell_stages = None):
    """Principal Component Analysis.
    """

    assert mode in {'pca', 'pca2'}

    mean_shifter = StandardScaler(with_std = False)

    if mode == 'pca':
        pca = PCA(min(data.shape))
        # mean-center, then fit and project in one step
        projected_data = pca.fit_transform(mean_shifter.fit_transform(data))
        components = pca.components_
    else:
        assert isinstance(cell_stages, np.ndarray)

        idx = np.where(cell_stages == np.max(cell_stages))[0]

        pca = PCA(min(idx.size, data.shape[1]))
        pca.fit(mean_shifter.fit_transform(data[idx]))
        components = pca.components_
        projected_data = np.dot(data, components.T)

    return components, projected_data
Project: elm | Author: ContinuumIO
def test_cv_splitting_ea_search_mldataset(trans, estimator):
    '''Test that an Elm Pipeline using MLDataset X feature
    matrix input can be split into cross validation train / test
    samples as in scikit-learn for numpy.  (As of PR 192 this test
    is failing)'''
    pipe, X, y = new_pipeline(trans, estimator, flatten_first=False)
    X = X.to_features()
    param_distribution = param_distribution_sgd.copy()
    if 'PCA' in trans._cls.__name__:
        param_distribution.update(param_distribution_pca)
    else:
        param_distribution.update(param_distribution_poly)
    ea = EaSearchCV(estimator=pipe,
                    param_distributions=param_distribution,
                    score_weights=[1],
                    model_selection=model_selection,
                    refit=True,
                    cv=3,
                    error_score='raise',
                    return_train_score=True,
                    scheduler=None,
                    n_jobs=-1,
                    cache_cv=True)
    ea.fit(X,y)
    assert isinstance(ea.predict(X), MLDataset)
Project: blcf | Author: willard-yuan
def trainingPCA(features, n_components=256, whiten=True, pca_model_name=None):
    print('loaded features! {}'.format(features.shape))
    print(np.sqrt(sum(features[0, :]**2)))

    #print('Features l2 normalization')
    #features = normalize(features)
    #print(np.sqrt(sum(features[0, :]**2)))

    print('Feature PCA-whitening')
    pca_model = PCA(n_components=n_components, whiten=whiten)
    features = pca_model.fit_transform(features)
    print(np.sqrt(sum(features[0, :]**2)))

    print('Features l2 normalization')
    features = normalize(features)
    print(np.sqrt(sum(features[0, :]**2)))

    if pca_model_name is not None:
        print('saving model...')
        check_path_file(pca_model_name, create_if_missing=True)
        save_obj(pca_model, pca_model_name)

    print('done! {}'.format(pca_model_name))

    return pca_model
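The same whiten-then-L2-normalize pipeline as a standalone sketch using only scikit-learn (toy data; names are illustrative):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

feats = np.random.RandomState(0).rand(1000, 512).astype(np.float32)
pca_model = PCA(n_components=256, whiten=True).fit(feats)
out = normalize(pca_model.transform(feats))    # each row now has unit L2 norm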
Project: tensorsne | Author: gokceneraslan
def get_mnist(n_train=5000, n_test=500, pca=True, d=50, dtype=np.float32):
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    n, row, col = X_train.shape
    channel = 1

    X_train = X_train.reshape(-1, channel * row * col)
    X_test = X_test.reshape(-1, channel * row * col)
    X_train = X_train.astype(dtype)
    X_test = X_test.astype(dtype)
    X_train /= 255
    X_test /= 255

    X_train = X_train[:n_train] - X_train[:n_train].mean(axis=0)
    X_test = X_test[:n_test] - X_test[:n_test].mean(axis=0)

    if pca:
        pcfit = PCA(n_components=d)

        X_train = pcfit.fit_transform(X_train)
        X_test = pcfit.transform(X_test)

    y_train = y_train[:n_train]
    y_test = y_test[:n_test]

    return X_train, y_train, X_test, y_test
Project: 100knock2016 | Author: tmu-nlp
def dimension_compression():
    X_t_c = make_matrix()
    token_list = []
    contexts_list = []
    for token, contexts in sorted(X_t_c.items()):
        token_list.append(token)
        contexts_list.append(contexts)

    pca = PCA(n_components=300)
    DictoVec = DictVectorizer(sparse=True)

    sparse = DictoVec.fit_transform(contexts_list)

    print(sparse.shape)

    vec_list = pca.fit_transform(sparse.todense())

    word_vec = {}
    for token, vec in zip(token_list, vec_list):
        word_vec[token] = vec

    return word_vec
Project: NetPower_TestBed | Author: Vignesh2208
def extractOptPCADimensionality(trainSamples):
    assert len(trainSamples) >= 2
    nFeatures = len(trainSamples[0])
    pca = PCA(n_components=nFeatures)
    pca.fit(trainSamples)

    percentageExplainedVariance = []
    nComponents = []
    pSum = 0.0
    nComponents99 = -1

    print("nFeatures Here = ", nFeatures)

    for i in range(len(pca.explained_variance_ratio_)):
        pSum = pSum + pca.explained_variance_ratio_[i]
        if pSum >= 0.99 and nComponents99 == -1:
            nComponents99 = i + 1
        nComponents.append(i + 1)
        percentageExplainedVariance.append(pSum * 100.0)

    assert nComponents99 != -1
    return nComponents99
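The manual accumulation above can be condensed with NumPy; a sketch under the same assumptions (trainSamples is a 2-D array of row vectors), using a hypothetical helper name:

import numpy as np
from sklearn.decomposition import PCA

def n_components_for_variance(train_samples, threshold=0.99):
    pca = PCA().fit(train_samples)
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    # first index whose cumulative ratio reaches the threshold, as a 1-based count
    return int(np.searchsorted(cum_var, threshold) + 1)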
Project: MusicAnalyser | Author: ShivayaDevs
def pca(features, var_expl = 0.98, n_com=None):
    """
    Returns features with dimension reduced by PCA method
    implemented in scikit learn (sklearn) module. Number of components
    is matched based on explained variance ratio - *var_expl* or 
    can be set by hand as *n_com*.
    """
    pca = decomposition.PCA()
    pca.fit(features)
    if not n_com:
        for p in range(1, features.shape[0]):
            if np.sum(pca.explained_variance_ratio_[:p]) >= var_expl:
                n_com = p
                break
    pca = decomposition.PCA(n_components=n_com)
    pca.fit(features)
    features = pca.transform(features)
    return features
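Recent scikit-learn versions can do this selection directly: a float in (0, 1) passed as n_components (with svd_solver='full') keeps just enough components to reach that explained-variance ratio. A minimal sketch:

import numpy as np
from sklearn import decomposition

X = np.random.RandomState(0).randn(300, 40)
pca = decomposition.PCA(n_components=0.98, svd_solver='full')
X_reduced = pca.fit_transform(X)
print(pca.n_components_)               # number of components actually kept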
Project: LIE | Author: EmbraceLife
def plot_scatter(values, cls):
    # Create a color-map with a different color for each class.
    import matplotlib.cm as cm
    cmap = cm.rainbow(np.linspace(0.0, 1.0, num_classes))

    # Create an index with a random permutation to make a better plot.
    idx = np.random.permutation(len(values))

    # Get the color for each sample.
    colors = cmap[cls[idx]]

    # Extract the x- and y-values.
    x = values[idx, 0]
    y = values[idx, 1]

    # Plot it.
    plt.scatter(x, y, color=colors, alpha=0.5)
    plt.show()


# Plot the transfer-values that have been reduced using PCA. There are 3 different colors for the different classes in the Knifey-Spoony data-set. The colors have very large overlap. This may be because PCA cannot properly separate the transfer-values.

# In[41]:
Project: LIE | Author: EmbraceLife
def plot_scatter(values, cls):
    # Create a color-map with a different color for each class.
    import matplotlib.cm as cm
    cmap = cm.rainbow(np.linspace(0.0, 1.0, num_classes))

    # Get the color for each sample.
    colors = cmap[cls]

    # Extract the x- and y-values.
    x = values[:, 0]
    y = values[:, 1]

    # Plot it.
    plt.scatter(x, y, color=colors)
    plt.show()


# Plot the transfer-values that have been reduced using PCA. There are 10 different colors for the different classes in the CIFAR-10 data-set. The colors are grouped together but with very large overlap. This may be because PCA cannot properly separate the transfer-values.

# In[35]:
Project: lazyprogrammer | Author: inhwane
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

    pca = PCA()
    reduced = pca.fit_transform(Xtrain)
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain, alpha=0.5)
    plt.show()

    plt.plot(pca.explained_variance_ratio_)
    plt.show()

    # cumulative variance
    # choose k = number of dimensions that gives us 95-99% variance
    cumulative = []
    last = 0
    for v in pca.explained_variance_ratio_:
        cumulative.append(last + v)
        last = cumulative[-1]
    plt.plot(cumulative)
    plt.show()
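For reference, the running-sum loop above is equivalent to a single NumPy call (a sketch against the pca object fitted inside main(); illustrative only):

    # equivalent to the manual cumulative loop:
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    k = int(np.searchsorted(cumulative, 0.95) + 1)   # dimensions for ~95% variance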
Project: covar_me_app | Author: CovarMe
def calculate_residual_correlation_matrix(returns):
    # find the market return constraining on the selected companies (first PCA)
    # regress each stock on that and find correlation of residuals
    returns_matrix = returns.values.transpose()
    covar_matrix = np.cov(returns_matrix)
    pca = decomposition.PCA(n_components=1)
    pca.fit(covar_matrix)
    X = pca.transform(covar_matrix)
    dim = covar_matrix.shape[1]
    res = np.zeros(shape=(dim,dim))
    for x in range(0, dim):
        regr = linear_model.LinearRegression()
        regr = regr.fit(X, covar_matrix[:,x])
        res[:,x] = covar_matrix[:,x] - regr.predict(X)

    res_corr = np.corrcoef(res)
    return pd.DataFrame(res_corr, index = returns.columns, columns = returns.columns)
Project: graph_2D_CNN | Author: Tixierae
def get_embeddings_node2vec(g,d,p,q,path_node2vec):
    my_pca = PCA(n_components=d)
    my_edgelist = igraph.Graph.get_edgelist(g)
    # create temp dir to write and read from
    tmpdir = tempfile.mkdtemp()
    # create subdirs for node2vec
    os.makedirs(tmpdir + '/graph/')
    os.makedirs(tmpdir + '/emb/')
    # write edge list
    with open(tmpdir + '/graph/input.edgelist', 'w') as my_file:
        my_file.write('\n'.join('%s %s' % x for x in my_edgelist))
    # execute node2vec
    call([path_node2vec + 'node2vec -i:' + tmpdir + '/graph/input.edgelist' + ' -o:' + tmpdir + '/emb/output.emb' + ' -p:' + p + ' -q:' + q],shell=True)
    # read back results
    emb = np.loadtxt(tmpdir + '/emb/output.emb',skiprows=1)
    # sort by increasing node index and keep only coordinates
    emb = emb[emb[:,0].argsort(),1:]
    # remove temp dir
    shutil.rmtree(tmpdir)
    # perform PCA on the embeddings to align and reduce dim
    pca_output = my_pca.fit_transform(emb)
    return pca_output
Project: deepcpg | Author: cangermueller
def plot_pca(act, pc_x=1, pc_y=2, labels=None, filename=None):
    act = act.T
    pca = PCA()
    pca.fit(act)
    eig_vec = pca.transform(act)
    data = pd.DataFrame(eig_vec)
    data.columns = ['PC%d' % i for i in range(data.shape[1])]
    data['act_mean'] = act.mean(axis=1)

    pc_x = 'PC%d' % pc_x
    pc_y = 'PC%d' % pc_y
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(data[pc_x], data[pc_y],
                         c=data['act_mean'], cmap='RdBu_r')
    ax.set_xlabel(pc_x)
    ax.set_ylabel(pc_y)
    fig.colorbar(scatter)
    if labels:
        for i, row in data.iterrows():
            ax.annotate('%d' % labels[i], xy=(row[pc_x], row[pc_y]),
                        fontsize=10)
    if filename:
        fig.savefig(filename)
        plt.close()
Project: pysptools | Author: ctherien
def inverse_transform(self, X):
        """
        Invert the PCA rotation step. The cube stays
        whitened. Useful if you want to denoise noisy
        bands before the rotation.

        X: `numpy array`
            A transformed (MNF) cube (m x n x p).

        Return: `numpy array`
            An inverted cube (m x n x p).
        """
        h, w, numBands = X.shape
        X = np.reshape(X, (w*h, numBands))
        M = self.transform.inverse_transform(X)
        M = np.reshape(M, (h, w, numBands))
        return M
Project: OpinionSpam | Author: Coder-Yu
def fitAndPredict(self):
        corpus = self.trainingSet+self.testSet
        dictionary = corpora.Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]
        text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        classifier = LogisticRegression()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print('Logistic:')
        print(classification_report(self.testLabel, pred_labels))

        classifier = SVC()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print('SVM:')
        print(classification_report(self.testLabel, pred_labels))
Project: OpinionSpam | Author: Coder-Yu
def fitAndPredict(self):
        corpus = self.trainingSet+self.testSet
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        model = models.TfidfModel(corpus)
        corpus = [text for text in model[corpus]]
        text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        classifier = LogisticRegression()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print('Logistic:')
        print(classification_report(self.testLabel, pred_labels))

        classifier = SVC()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print('SVM:')
        print(classification_report(self.testLabel, pred_labels))
Project: auDeep | Author: auDeep
def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(parsed_args.input))

        data_set = load(parsed_args.input)

        features = np.reshape(data_set.features, [data_set.num_instances, -1])

        if features.shape[1] > 50:
            self.log.info("applying PCA")

            pca = PCA(n_components=200)
            pca.fit(features)
            features = pca.transform(features)

        self.log.info("computing T-SNE embedding")
        tsne = TSNE(perplexity=parsed_args.perplexity,
                    learning_rate=parsed_args.learning_rate,
                    verbose=self.app_args.verbose_level)

        embedding = tsne.fit_transform(features)

        self.log.info("plotting embedding")
        self.plot_with_labels(data_set, embedding)
Project: neural-combinatorial-optimization-rl-tensorflow | Author: MichelDeudon
def gen_instance(self, max_length, dimension, test_mode=True, seed=0):
        if seed!=0: np.random.seed(seed)

        # Randomly generate (max_length) cities with (dimension) coordinates in [0,100]
        seq = np.random.randint(100, size=(max_length, dimension))

        # Principal Component Analysis to center & rotate coordinates
        pca = PCA(n_components=dimension)
        sequence = pca.fit_transform(seq)

        # Scale to [0, 1)
        input_ = sequence / 100

        if test_mode:
            return input_, seq
        else:
            return input_

    # Generate random batch for training procedure
Project: channel-pruning | Author: yihui-he
def pca(data, n_components=None):
    """
    Param:
        n_components: use 'mle' to guess
    """
    newdata = data.copy()
    model = PCA(n_components=n_components)

    if len(newdata.shape) != 2:
        newdata = newdata.reshape((newdata.shape[0], -1))

    model.fit(newdata)

    ret = model.explained_variance_ratio_

    return ret
Project: channel-pruning | Author: yihui-he
def YYT(Y, n_components=None, DEBUG=False):
    """
    Param:
        Y: n x d
        n_components: use 'mle' to guess
    Returns:
        P: d x d'
        QT: d' x d
    """
    newdata = Y.copy()
    model = PCA(n_components=n_components)

    if len(newdata.shape) != 2:
        newdata = newdata.reshape((newdata.shape[0], -1))
    #TODO center data
    model.fit(newdata)
    if DEBUG: from IPython import embed; embed()

    return model.components_.T, model.components_

#def GSVD(Z, Y):
#    NotImplementedError
#    return [U,V,X,C,S]
Project: motion-classification | Author: matthiasplappert
def _apply_transformers_to_X(self, X, transformers):
        for transformer in transformers:
            if not hasattr(transformer, 'fit') or not callable(transformer.fit):
                continue
            X_stacked = np.vstack(X)
            transformer.fit(X_stacked)
            X = [transformer.transform(X_curr) for X_curr in X]

        # The transforms may change the number of features (e.g. PCA). If this is the case, calculate generic
        # feature_names and feature_lengths
        n_features = X[0].shape[1]
        if n_features != self.n_features:
            feature_names = ['transformed_feature_%d' % idx for idx in range(n_features)]
            feature_lengths = [1 for idx in range(n_features)]
        else:
            feature_names = self.feature_names
            feature_lengths = self.feature_lengths

        return X, feature_names, feature_lengths
Project: jsaicup2017 | Author: SS1031
def precipitation_pca(window='1H', areas=None, n_compos=3):
    """??????????????
    """
    from sklearn.decomposition import PCA

    df = precipitation_rolling_sum(window, areas)

    pca = PCA(n_components=n_compos)
    pca.fit(df)

    if areas is None:
        header = "overall_precipitation_pca_" + window + '_'
    else:
        header = "area_precipitation_pca_" + window + '_'

    pca_df = pd.DataFrame(pca.transform(df),
                          index=df.index,
                          columns=[header + str(i) for i in range(n_compos)])

    return pca_df
Project: jsaicup2017 | Author: SS1031
def wind_pca(window='1H', areas=None, n_compos=3):
    """PCA"""
    from sklearn.decomposition import PCA

    df = wind_rolling_sum(window=window, areas=areas)

    pca = PCA(n_components=n_compos)
    pca.fit(df)

    if areas is None:
        header = "overall_wind_pca_" + window + '_'
    else:
        header = "area_wind_pca_" + window + '_'
    pca_df = pd.DataFrame(pca.transform(df),
                          index=df.index,
                          columns=[header + str(i) for i in range(n_compos)])

    # pca_df.plot(figsize=(20, 12))
    # plt.show()

    return pca_df
Project: static-gesture-recognition | Author: windmark
def visualizeData(self, featureData = '', fileName = ''):
    if featureData == '':
      (label_vector, input_vector) = loadData(self.featureFile)
    else:
      (label_vector, input_vector) = loadData(featureData)

    pca = PCA(n_components = 2)
    X_trans = pca.fit_transform(input_vector)

    plt.figure()
    colorArray = []
    for n in range(0, len(input_vector)):
      colorArray.append(COLOR_MAP[label_vector[n]])

    plt.scatter(X_trans[:,0], X_trans[:,1], c = colorArray)
    if fileName == '':
      plt.show()
    else:
      plt.savefig(fileName)
      print "Plot saved as " + fileName + ".png"
Project: static-gesture-recognition | Author: windmark
def __visualizePredictedDataset__(self, data, testIndices, predictedLabels, expectedLabels):
    pca = PCA(n_components = 2)
    X_trans = pca.fit_transform(data)

    plt.figure()
    colorArray = []

    print("----- Wrong predictions -----")
    for n in range(0, len(data)):
      if n in testIndices:
        if predictedLabels[testIndices.index(n)] != expectedLabels[testIndices.index(n)]:
          colorArray.append('red')
          print("Expected", expectedLabels[testIndices.index(n)], 
                    "Predicted", predictedLabels[testIndices.index(n)])
        else:
          colorArray.append('olivedrab')
      else:
        colorArray.append('white')

    plt.scatter(X_trans[:,0], X_trans[:,1], c = colorArray)
    plt.show()
Project: diagnose-heart | Author: woshialex
def segment_ch4(self, segment_fn, segment_transform):
        segs = np.zeros_like(self.ch4_images, dtype=np.float32)
        ims = np.copy(self.ch4_images).reshape(-1, 1, self.ch4_images.shape[1],
                self.ch4_images.shape[2])
        ims = segment_transform(ims)
        for i in range(self.ch4_images.shape[0]):
            segs[i:i+1] = segment_fn(ims[i:i+1])
            _,sb = cv2.threshold(np.copy(segs[i])*255, 127, 255, cv2.THRESH_BINARY)
            patches = get_patches(sb)
            sb = np.zeros_like(sb, dtype=np.uint8)
            if len(patches) > 0:
                patch = next(p for p in patches if p.shape[0] == max(p1.shape[0]
                    for p1 in patches))
                for x,y in patch:
                    sb[x,y]=255
                pca = decomposition.PCA(n_components=2)
                pca.fit(patch)
                mean, major = pca.mean_, pca.components_[0]
                middle = sb.shape[0]/2
                sb = cv2.warpAffine(sb, np.float32([[1,0,middle-mean[1]],
                    [0,1,middle-mean[0]]]), sb.shape)
                sb = scipy.misc.imrotate(sb, np.arctan2(*major)*180/np.pi)
            segs[i:i+1]=sb
        self.ch4seg = segs
        self.ch4counts = np.array([np.count_nonzero(s) for s in self.ch4seg]).reshape(1,-1)
Project: plda | Author: RaviSoji
def get_principal_components(flattened_images, n_components='default',
                             default_pct_variance_explained=.96):
    """ Standardizes the data and gets the principal components.
    """
    for img in flattened_images:
        assert isinstance(img, np.ndarray)
        assert img.shape == flattened_images[-1].shape
        assert len(img.shape) == 1
    X = np.asarray(flattened_images)
    X -= X.mean(axis=0)  # Center all of the data around the origin.
    X /= np.std(X, axis=0)

    pca = PCA()
    pca.fit(X)

    if n_components == 'default':
        sorted_eig_vals = pca.explained_variance_
        cum_pct_variance = (sorted_eig_vals / sorted_eig_vals.sum()).cumsum()
        idxs = np.argwhere(cum_pct_variance >= default_pct_variance_explained)
        n_components = np.squeeze(idxs)[0]

    V = pca.components_[:n_components + 1, :].T
    principal_components = np.matmul(X, V)

    return principal_components
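A roughly equivalent sketch using scikit-learn's built-in variance-based selection instead of the manual cumulative sum (same standardization as above; hypothetical helper, illustrative only):

import numpy as np
from sklearn.decomposition import PCA

def principal_components(flattened_images, pct_variance=0.96):
    X = np.asarray(flattened_images, dtype=float)
    X = (X - X.mean(axis=0)) / X.std(axis=0)    # standardize, as above
    pca = PCA(n_components=pct_variance, svd_solver='full')
    return pca.fit_transform(X)                 # projects X onto the selected components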
Project: WebNav | Author: nyu-dl
def load_wemb(params, vocab):
    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb_orig = next(iter(wemb.values())).shape[0]

    W = 0.01 * np.random.randn(prm.n_words, dim_emb_orig).astype(config.floatX)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos, :] = wemb[word]

    if prm.dim_emb < dim_emb_orig:
        pca = PCA(n_components=prm.dim_emb, copy=False, whiten=True)
        W = pca.fit_transform(W)

    params['W'] = W

    return params