Python sklearn.utils.extmath 模块,randomized_svd() 实例源码

我们从Python开源项目中,提取了以下24个代码示例,用于说明如何使用sklearn.utils.extmath.randomized_svd()

项目:histwords    作者:williamleif    | 项目源码 | 文件源码
def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    """Take years from ``queue`` and save a truncated SVD of each year's embedding.

    For every year the explicit embedding is loaded, restricted to the words
    that are returned by ``words_above_count`` and also appear among the first
    ``num_words`` entries of ``words[year]``, factorised with
    ``randomized_svd`` and written to disk as ``-u.npy``, ``-v.npy``,
    ``-s.npy`` and ``-vocab.pkl`` files.

    :param proc_num: worker index; not used inside this function.
    :param queue: queue of years; the worker returns once ``queue.empty()``.
    :param out_dir: prefix prepended to ``OUT_FORMAT`` to build output paths.
    :param in_dir: prefix prepended to ``INPUT_FORMAT`` to build the input path.
    :param count_dir: forwarded to ``words_above_count``.
    :param words: indexable by year, giving a sliceable sequence of words.
        It is only read; it must stay intact across loop iterations.
    :param dim: number of SVD components to compute.
    :param num_words: number of leading entries of ``words[year]`` considered.
    :param min_count: threshold forwarded to ``words_above_count``.
    """
    while True:
        # NOTE(review): empty() followed by get() is not atomic; with several
        # workers sharing the queue, get() may block -- confirm this is acceptable.
        if queue.empty():
            break
        year = queue.get()
        print("Loading embeddings for year %s" % (year,))
        # Random pause of up to two minutes (presumably to stagger the
        # workers' disk access -- TODO confirm).
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print(len(valid_words))
        # Use a separate local name: rebinding ``words`` here would replace the
        # per-year mapping with a list and break ``words[year]`` for the next year.
        year_words = list(valid_words.intersection(words[year][:num_words]))
        print(len(year_words))
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(year_words, restrict_context=True)
        print("SVD for year %s" % (year,))
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print("Saving year %s" % (year,))
        out_prefix = (out_dir + OUT_FORMAT).format(year=year, dim=dim)
        np.save(out_prefix + "-u.npy", u)
        np.save(out_prefix + "-v.npy", v)
        np.save(out_prefix + "-s.npy", s)
        write_pickle(base_embed.iw, out_prefix + "-vocab.pkl")
项目:Lyssandra    作者:ektormak    | 项目源码 | 文件源码
def ksvd(Y, D, X, n_cycles=1, verbose=True):
    """Update every atom of ``D`` and its coefficients in ``X`` by rank-1 SVDs.

    For each atom ``k`` the residual restricted to the samples that use the
    atom (non-zero entries of ``X[k, :]``) is approximated by its leading
    singular triplet; the left vector becomes the new atom and the scaled
    right vector the new coefficients.  ``D`` and ``X`` are modified in place.

    :param Y: data matrix; the residual is ``Y - D.X``.
    :param D: dictionary, one atom per column.
    :param X: coefficient matrix, one row per atom.
    :param n_cycles: number of full sweeps over all atoms.
    :param verbose: when true, write a progress percentage to stdout.
    :return: ``(D, X, unused_atoms)`` where ``unused_atoms`` lists the indices
        of atoms no sample used; an index is appended once per cycle in which
        the atom was unused, so it can repeat when ``n_cycles > 1``.
    """
    n_atoms = D.shape[1]
    unused_atoms = []
    R = Y - fast_dot(D, X)

    for _ in range(n_cycles):
        for k in range(n_atoms):
            if verbose:
                sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100))
                sys.stdout.flush()
            # find all the datapoints that use the kth atom
            omega_k = X[k, :] != 0
            if not np.any(omega_k):
                unused_atoms.append(k)
                continue
            # the residual due to all the other atoms but k
            Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k])
            U, S, V = randomized_svd(Rk, n_components=1, n_iter=10, flip_sign=False)
            D[:, k] = U[:, 0]
            X[k, omega_k] = V[0, :] * S[0]
            # update the residual
            R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k])
        if verbose:
            # Terminate the "\r" progress line; stay silent when not verbose.
            print("")
    return D, X, unused_atoms
项目:Class_Evaluation_Summarization    作者:arunrm87    | 项目源码 | 文件源码
def sparse_dense(summary):
    """Run ``dense`` on a deep copy of ``summary`` with data-derived settings.

    The hyperparameter handed to ``dense`` is the leading singular value of
    ``summary`` divided by 50, and the iteration count is the matrix rank of
    ``summary`` divided by 10, truncated to an int.
    """
    working_copy = copy.deepcopy(summary)

    # Choosing the hyperparameter: candidates are a fixed value such as 0.5,
    # or a heuristic such as (rank of original matrix / 10) or
    # (max singular value of the original matrix / 20).
    # The code below uses (max singular value / 50).
    singular_values = randomized_svd(summary, 1, n_iter=5)[1]
    hyperparameter = singular_values[0] / 50

    rank = np.linalg.matrix_rank(summary)
    iterations = int(rank / 10)

    return dense(working_copy, hyperparameter, 0.02, iterations)
项目:abc    作者:daemon    | 项目源码 | 文件源码
def _init_svd(self, dictionary, definitions):
  """Build a weighted term-document matrix and factorise it with a rank-50 SVD.

  One row per entry of ``definitions`` is filled from
  ``self.compute_freq_vec(dictionary[defn])``; the matrix is then transposed
  to CSR, each non-empty row is scaled by ``log(len(dictionary) / nnz)``,
  and ``randomized_svd`` stores its factors in ``self.u``, ``self.s`` and
  ``self.vt``.  ``self.doc_matrix`` is set to ``(diag(s) . vt)`` transposed.
  """
  self.td_matrix = lil_matrix((len(dictionary), self.n_terms))
  for row, defn in enumerate(definitions):
    if row % 100 == 0:
      print("Building term-document matrix: {} / {}".format(row, len(dictionary)), end="\r")
    freq_vec = self.compute_freq_vec(dictionary[defn])
    self.td_matrix[row, :] = freq_vec
  # After the transpose there is one row per term.
  self.td_matrix = self.td_matrix.transpose().tocsr()
  print()
  for term in range(self.n_terms):
    nnz = float(self.td_matrix[term, :].getnnz())
    if term % 100 == 0:
      print("Applying td-idf: {} / {}".format(term, self.n_terms), end="\r")
    if nnz > 0:
      # Rows with no stored entries are left untouched.
      self.td_matrix[term, :] *= np.log(len(dictionary) / nnz)
  print()
  print("Performing rank reduction...")
  factors = randomized_svd(self.td_matrix, 50, transpose=False)
  self.u, self.s, self.vt = factors
  scaled_vt = np.matmul(np.diag(self.s), self.vt)
  self.doc_matrix = scaled_vt.transpose()
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def svd_timing(X, n_comps, n_iter, n_oversamples,
               power_iteration_normalizer='auto', method=None):
    """
    Measure time for decomposition

    Runs one truncated SVD of ``X`` and reports how long the call took.

    :param X: matrix to decompose.
    :param n_comps: number of components to extract.
    :param n_iter: number of power iterations.
    :param n_oversamples: number of additional random vectors.
    :param power_iteration_normalizer: forwarded to ``randomized_svd``;
        ignored when ``method == 'fbpca'``.
    :param method: ``'fbpca'`` selects ``fbpca.pca``; any other value
        (including ``None``) selects ``randomized_svd``.
    :return: ``(U, mu, V, call_time)`` -- the three factors returned by the
        chosen routine and the elapsed wall-clock time of the call.
    """
    print("... running SVD ...")
    gc.collect()
    t0 = time()
    # Compare by value: ``is not`` on a string literal tests object identity,
    # which is not guaranteed to hold for equal strings.
    if method != 'fbpca':
        # NOTE(review): ``random_state`` is not a parameter of this function;
        # it is expected to exist at module level -- confirm.
        U, mu, V = randomized_svd(
            X, n_comps,
            n_oversamples=n_oversamples,
            n_iter=n_iter,
            power_iteration_normalizer=power_iteration_normalizer,
            random_state=random_state, transpose=False)
    else:
        # There is a different convention for l here
        U, mu, V = fbpca.pca(X, n_comps, raw=True, n_iter=n_iter,
                             l=n_oversamples + n_comps)
    call_time = time() - t0

    return U, mu, V, call_time
项目:thunder-factorization    作者:thunder-project    | 项目源码 | 文件源码
def test_svd(eng):
    """Check the 'direct' and 'em' SVD methods against sklearn's randomized_svd."""
    from sklearn.utils.extmath import randomized_svd

    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    # Reference factors computed on the local array.
    u_ref, s_ref, v_ref = randomized_svd(x.toarray(), n_components=2,  random_state=0)

    # (constructor arguments, extra tolerance arguments for the comparisons)
    cases = (
        (dict(k=2, method='direct'), {}),
        (dict(k=2, method='em', max_iter=100, seed=0), dict(atol=1e-1)),
    )
    for svd_kwargs, tolerance in cases:
        u, s, v = SVD(**svd_kwargs).fit(x)
        assert allclose_sign(u_ref, u, **tolerance)
        assert allclose(s_ref, s, **tolerance)
        assert allclose_sign(v_ref.T, v.T, **tolerance)
项目:pySTATIS    作者:mfalkiewicz    | 项目源码 | 文件源码
def gsvd(X, M, A, n_comps=10):
    """
    Generalized SVD

    Computes a truncated SVD of ``sqrt(M) . X . sqrt(A)`` (element-wise square
    roots) and rescales the factors with the inverse square roots of the
    diagonals of ``M`` and ``A``.

    :param X: data matrix.
    :param M: row weight matrix; only its diagonal is used for the rescaling.
    :param A: column weight matrix; only its diagonal is used for the rescaling.
    :param n_comps: number of components requested from ``randomized_svd``.
    :return: ``(P, D, Q, ev)`` -- rescaled left factors, singular values,
        rescaled (transposed) right factors, and the squared singular values.
    """

    print("GSVD")
    print("GSVD: Weights... ", end='')
    sqrt_m = np.sqrt(M)
    weighted = np.dot(sqrt_m, np.dot(X, np.sqrt(A)))
    print("Done!")

    print("GSVD: SVD... ", end='')
    left, D, right = randomized_svd(weighted, n_comps)
    print('Done!')

    print("GSVD: Factor scores and eigenvalues... ", end='')
    # diag(M)^(-1/2) and diag(A)^(-1/2) as vectors
    m_inv_sqrt = np.power(np.diag(M), -0.5)
    a_inv_sqrt = np.power(np.diag(A), -0.5)

    P = np.dot(np.diag(m_inv_sqrt), left)
    Q = np.dot(np.diag(a_inv_sqrt), right.T)
    ev = np.power(D, 2)

    print('Done!')

    return P, D, Q, ev
项目:nlp-lt    作者:minven    | 项目源码 | 文件源码
def randomizedSVD(self):
        """Factorise the transposed bag-of-words matrix and store the results.

        Sets ``self.U``, ``self.S``, ``self.V`` to the factors returned by
        ``randomized_svd`` and derives ``self.tokens_representation``
        (``U . diag(S)``) and ``self.documents_representation``
        (``(diag(S) . V)`` transposed), both as ``np.matrix`` products.
        """
        # References:
        # http://scikit-learn.org/stable/modules/decomposition.html#truncated-singular-value-decomposition-and-latent-semantic-analysis
        # http://stackoverflow.com/questions/31523575/get-u-sigma-v-matrix-from-truncated-svd-in-scikit-learn
        left, singular_values, right = randomized_svd(
            self.bag_of_words_matrix.T,
            n_components=self.dimensions,
            n_iter=5,
            random_state=None,
        )
        self.U = left
        self.S = singular_values
        self.V = right
        # ``*`` with an np.matrix operand is a matrix product, not element-wise.
        self.tokens_representation = np.matrix(left) * np.diag(singular_values)
        scaled_right = np.diag(singular_values) * np.matrix(right)
        self.documents_representation = scaled_right.T
项目:sparseMF    作者:jeh0753    | 项目源码 | 文件源码
def _svd(self, X, max_rank=None):
        """Return the SVD factors of ``X``.

        With a truthy ``max_rank`` the truncated ``randomized_svd`` is used
        with ``self.n_power_iterations`` power iterations; otherwise the
        reduced (``full_matrices=False``) SVD from ``np.linalg.svd`` is
        returned.
        """
        if not max_rank:
            # no rank limit: exact reduced SVD via numpy
            return np.linalg.svd(X, full_matrices=False, compute_uv=True)
        # a rank limit was given: use the faster randomized SVD
        return randomized_svd(X, max_rank, n_iter=self.n_power_iterations)
项目:sparseMF    作者:jeh0753    | 项目源码 | 文件源码
def _svd(self, X, max_rank=None):
        """Compute SVD factors of ``X``, truncated when ``max_rank`` is truthy.

        The truncated path calls ``randomized_svd`` with
        ``n_iter=self.n_power_iterations``; the other path calls
        ``np.linalg.svd`` with ``full_matrices=False``.
        """
        use_randomized = bool(max_rank)
        if use_randomized:
            # rank-limited: faster randomized SVD
            result = randomized_svd(
                X, max_rank, n_iter=self.n_power_iterations)
        else:
            # no rank limit: exact reduced SVD via numpy
            result = np.linalg.svd(
                X, full_matrices=False, compute_uv=True)
        return result
项目:preconditioned_GPs    作者:mauriziofilippone    | 项目源码 | 文件源码
def __init__(self, X, kern, M):
        """Build a preconditioner from a rank-``M`` randomized SVD of the kernel matrix.

        Stores the SVD factors of ``kern.K(X, X)`` in ``self.U``,
        ``self.Sigma`` and ``self.VT``, sets ``self.precon`` to
        ``U . diag(Sigma) . VT + kern.noise * I`` and records the elapsed
        construction time in ``self.duration``.
        """
        super(SVD, self).__init__("SVD")

        t_begin = time.time()
        self.X = X
        self.kern = kern

        gram = kern.K(X, X)
        n_points = np.shape(X)[0]

        self.U, self.Sigma, self.VT = randomized_svd(gram, M)
        low_rank = np.dot(self.U, np.dot(np.diag(self.Sigma), self.VT))
        self.precon = low_rank + self.kern.noise*np.identity(n_points)
        self.duration = time.time() - t_begin
项目:preconditioned_GPs    作者:mauriziofilippone    | 项目源码 | 文件源码
def __init__(self, X, kern, M):
        """Set up the SVD-based preconditioner for inputs ``X`` and kernel ``kern``.

        The kernel matrix ``kern.K(X, X)`` is factorised with
        ``randomized_svd`` using ``M`` components; the factors are kept on
        the instance and recombined, plus ``kern.noise`` on the diagonal,
        into ``self.precon``.  ``self.duration`` holds the time spent here.
        """
        super(SVD, self).__init__("SVD")

        started = time.time()
        self.X = X
        self.kern = kern

        kernel_matrix = kern.K(X, X)
        size = np.shape(X)[0]

        factors = randomized_svd(kernel_matrix, M)
        self.U, self.Sigma, self.VT = factors
        scaled_vt = np.dot(np.diag(self.Sigma), self.VT)
        self.precon = np.dot(self.U, scaled_vt) + self.kern.noise*np.identity(size)
        self.duration = time.time() - started
项目:themarketingtechnologist    作者:thomhopmans    | 项目源码 | 文件源码
def apply_uv_decomposition(self):
        """Factorise ``self.behaviour_matrix`` and store ``U . VT`` in ``self.X_hat``.

        Uses ``randomized_svd`` with 15 components and 10 power iterations,
        and prints the shapes of both factors.
        """
        left, _sigma, right_t = randomized_svd(
            self.behaviour_matrix,
            n_components=15,
            n_iter=10,
            random_state=None,
        )
        for factor in (left, right_t):
            print(factor.shape)
        # NOTE(review): the singular values are not applied here, so X_hat is
        # U . VT rather than U . diag(Sigma) . VT -- confirm this is intended.
        self.X_hat = np.dot(left, right_t)
项目:pyrec    作者:mesuvash    | 项目源码 | 文件源码
def fit(self, train_input, train):
        """Factorise ``train`` with ``self.nfactor`` components.

        Stores ``U`` multiplied by the sparse diagonal matrix of singular
        values in ``self.U`` and the transposed right factor in ``self.V``.
        ``train_input`` is not used.
        """
        left, singular_values, right_t = randomized_svd(train, self.nfactor)
        # Sparse diagonal matrix with the singular values on the main diagonal.
        sigma_diag = scipy.sparse.diags(singular_values, 0)
        self.U = left * sigma_diag
        self.V = right_t.T
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def compute_bench(samples_range, features_range, n_iter=3, rank=50):
    """Time a full SVD against ``randomized_svd`` over a grid of matrix shapes.

    For every ``(n_samples, n_features)`` pair a low-rank matrix is generated
    and three calls are timed: ``svd`` with ``full_matrices=False``,
    ``randomized_svd`` with ``n_iter=0`` and ``randomized_svd`` with the
    given ``n_iter``.  Returns a mapping from a label to the list of
    timings, one entry per grid point.
    """
    timings = defaultdict(list)

    def record(label, banner, run):
        # Collect garbage first so earlier allocations do not skew the timing.
        gc.collect()
        print(banner)
        started = time()
        run()
        timings[label].append(time() - started)

    step = 0
    total = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            step += 1
            print('====================')
            print('Iteration %03d of %03d' % (step, total))
            print('====================')
            X = make_low_rank_matrix(n_samples, n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            record('scipy svd',
                   "benchmarking scipy svd: ",
                   lambda: svd(X, full_matrices=False))

            record('scikit-learn randomized_svd (n_iter=0)',
                   "benchmarking scikit-learn randomized_svd: n_iter=0",
                   lambda: randomized_svd(X, rank, n_iter=0))

            record('scikit-learn randomized_svd (n_iter=%d)' % n_iter,
                   "benchmarking scikit-learn randomized_svd: n_iter=%d "
                   % n_iter,
                   lambda: randomized_svd(X, rank, n_iter=n_iter))

    return timings
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_randomized_svd_low_rank():
    # Check that extmath.randomized_svd is consistent with linalg.svd,
    # for both the dense and the sparse (CSR) representation of the input.
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # Keep the sparse copy under its own name: rebinding X inside the loop
    # would make every normalizer after the first run on the sparse matrix
    # only, leaving the dense path untested for them.
    X_sparse = sparse.csr_matrix(X)

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        assert_equal(Ua.shape, (n_samples, k))
        assert_equal(sa.shape, (k,))
        assert_equal(Va.shape, (k, n_features))

        # ensure that the first k singular values of both methods are equal
        assert_almost_equal(s[:k], sa)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

        # check the sparse matrix representation: compute the singular
        # values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X_sparse, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        # only compared up to the effective rank of the matrix
        assert_almost_equal(s[:rank], sa[:rank])
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # generate a matrix X with structure of approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.1,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # reference singular values from the slow exact method
    exact = linalg.svd(X, full_matrices=False)[1]

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        shared = dict(power_iteration_normalizer=normalizer, random_state=0)

        # fast approximate method without the iterated power method
        _, plain, _ = randomized_svd(X, k, n_iter=0, **shared)

        # the approximation does not tolerate the noise:
        worst_gap = np.abs(exact[:k] - plain).max()
        assert_greater(worst_gap, 0.01)

        # fast approximate method with the (default) iterated power method
        _, powered, _ = randomized_svd(X, k, **shared)

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(exact[:k], powered, decimal=3)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle matrices whose singular
    # values decay slowly (no clear low-rank component)
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method; the seed is fixed so the test
        # does not depend on the random projection drawn in a given run
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)

        # the approximation does not tolerate the noise:
        assert_greater(np.abs(s[:k] - sa).max(), 0.1)

        # compute the singular values of X using the fast approximate method
        # with iterated power method
        _, sap, _ = randomized_svd(X, k, n_iter=5,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalized='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    def reconstruction_error(n_iter, normalizer):
        # Frobenius norm of X minus its rank-n_components approximation.
        # The seed is fixed so every call draws the same random projection
        # and the test is reproducible from run to run.
        U, s, V = randomized_svd(X, n_components, n_iter=n_iter,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        return linalg.norm(A, ord='fro')

    # Check that it diverges with many (non-normalized) power iterations
    error_2 = reconstruction_error(2, 'none')
    error_20 = reconstruction_error(20, 'none')
    assert_greater(np.abs(error_2 - error_20), 100)

    # With a normalizer the error must stay close to the n_iter=2 error
    for normalizer in ['LU', 'QR', 'auto']:
        error_2 = reconstruction_error(2, normalizer)

        for i in [5, 10, 50]:
            error = reconstruction_error(i, normalizer)
            assert_greater(15, np.abs(error_2 - error))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_randomized_svd_sign_flip():
    # With flip_sign=True the factors must not depend on the random seed,
    # must reconstruct the input and must be orthonormal.
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    identity = np.eye(2)
    u_ref, _, v_ref = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u, s, v = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        # same factors as the reference seed
        assert_almost_equal(u_ref, u)
        assert_almost_equal(v_ref, v)
        # u * diag(s) * v gives back the input
        assert_almost_equal(np.dot(u * s, v), a)
        # both factors are orthonormal
        assert_almost_equal(np.dot(u.T, u), identity)
        assert_almost_equal(np.dot(v.T, v), identity)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_randomized_svd_sign_flip_with_transpose():
    # Check if the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        returns bool tuple indicating if the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # The seed is fixed in both calls so the outcome does not depend on the
    # random projection drawn in a particular run.

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True,
                                             random_state=0)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert_true(u_based)
    assert_false(v_based)

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True, random_state=0)
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose)
    assert_true(u_based)
    assert_false(v_based)
项目:pca    作者:vighneshbirodkar    | 项目源码 | 文件源码
def sv_thresh(X, t, k):
    """Soft-threshold the leading ``k`` singular values of ``X`` at ``t``.

    Returns the matrix rebuilt from the thresholded singular values together
    with the number of (pre-threshold) singular values greater than ``t``.
    """
    _n_rows, _n_cols = X.shape
    left, sing_vals, right_t = randomized_svd(X, k)
    # Count before thresholding: number of singular values greater than `t`
    n_above = np.sum(sing_vals > t)
    shrunk = soft_thresh(sing_vals, t)
    rebuilt = np.dot(left, np.dot(np.diag(shrunk), right_t))
    assert rebuilt.shape == X.shape
    return rebuilt, n_above
项目:thunder-factorization    作者:thunder-project    | 项目源码 | 文件源码
def _fit_local(self, mat):
        """Return the ``(U, S, V)`` factors of ``mat`` from sklearn's randomized_svd.

        Uses ``self.k`` components, ``self.max_iter`` power iterations and
        ``self.seed`` as the random state.
        """
        from sklearn.utils.extmath import randomized_svd

        options = dict(n_components=self.k, n_iter=self.max_iter, random_state=self.seed)
        left, singular_values, right = randomized_svd(mat, **options)
        return left, singular_values, right
项目:Lyssandra    作者:ektormak    | 项目源码 | 文件源码
def nn_ksvd(Y, D, X, n_cycles=1, verbose=True):
    """Non-negative variant of the K-SVD atom update; modifies ``D`` and ``X`` in place.

    Each atom is initialised from the leading singular triplet of its
    restricted residual, projected onto the non-negative orthant and then
    refined by ``n_cycles`` alternating least-squares steps with clipping at
    zero.  The atom is finally normalised and the norm moved into the
    coefficients.

    :param Y: data matrix; the residual is ``Y - D.X``.
    :param D: dictionary, one atom per column.
    :param X: coefficient matrix, one row per atom.
    :param n_cycles: number of alternating refinement steps per atom.
    :param verbose: when true, write a progress percentage to stdout.
    :return: ``(D, X, unused_atoms)`` where ``unused_atoms`` lists the indices
        of atoms that no sample used.
    """
    n_atoms = D.shape[1]
    unused_atoms = []
    R = Y - fast_dot(D, X)
    eps = np.finfo('float').eps

    for k in range(n_atoms):
        if verbose:
            sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100))
            sys.stdout.flush()
        # find all the datapoints that use the kth atom
        omega_k = X[k, :] != 0
        if not np.any(omega_k):
            unused_atoms.append(k)
            continue
        # the residual due to all the other atoms but k
        Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k])
        try:
            U, S, V = randomized_svd(Rk, n_components=1, n_iter=50, flip_sign=False)
        except Exception:
            # Best effort: leave this atom unchanged when the SVD fails, but
            # let KeyboardInterrupt / SystemExit propagate.
            warnings.warn('SVD error')
            continue

        d = U[:, 0]
        x = V[0, :] * S[0]
        # projection to the constraint set
        d[d < 0] = 0
        x[x < 0] = 0

        # skip the atom when the projection wiped out either factor
        dTd = np.dot(d, d)
        xTx = np.dot(x, x)
        if dTd <= eps or xTx <= eps:
            continue

        # alternating non-negative least-squares refinement
        for _ in range(n_cycles):
            d = np.dot(Rk, x) / np.dot(x, x)
            d[d < 0] = 0
            x = np.dot(d.T, Rk) / np.dot(d, d)
            x[x < 0] = 0

        # unit-norm atom; the scale is moved into the coefficients
        _norm = norm(d)
        d = d / _norm
        x = x * _norm
        D[:, k] = d
        X[k, omega_k] = x
        # update the residual
        R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k])
    if verbose:
        # Terminate the "\r" progress line; stay silent when not verbose.
        print("")
    return D, X, unused_atoms