The following 24 code examples, extracted from open-source Python projects, show how to use sklearn.utils.extmath.randomized_svd().
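For reference, here is a minimal usage sketch (assuming only NumPy and scikit-learn are installed; the matrix is random data made up for illustration): randomized_svd takes a 2-D array and the number of components, and returns the truncated factors U, Sigma, VT.

import numpy as np
from sklearn.utils.extmath import randomized_svd

# small random matrix to decompose (illustrative data only)
rng = np.random.RandomState(0)
M = rng.rand(100, 40)

# truncated SVD: keep the top 5 singular triplets
U, Sigma, VT = randomized_svd(M, n_components=5, n_iter=5, random_state=0)

print(U.shape, Sigma.shape, VT.shape)  # (100, 5) (5,) (5, 40)

# low-rank reconstruction from the truncated factors
M_approx = U @ np.diag(Sigma) @ VT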
def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print("Loading embeddings for year", year)
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print(len(valid_words))
        words = list(valid_words.intersection(words[year][:num_words]))
        print(len(words))
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(words, restrict_context=True)
        print("SVD for year", year)
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print("Saving year", year)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-u.npy", u)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-v.npy", v)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-s.npy", s)
        write_pickle(base_embed.iw, (out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-vocab.pkl")
def ksvd(Y, D, X, n_cycles=1, verbose=True):
    n_atoms = D.shape[1]
    n_features, n_samples = Y.shape
    unused_atoms = []
    R = Y - fast_dot(D, X)

    for c in range(n_cycles):
        for k in range(n_atoms):
            if verbose:
                sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100))
                sys.stdout.flush()
            # find all the datapoints that use the kth atom
            omega_k = X[k, :] != 0
            if not np.any(omega_k):
                unused_atoms.append(k)
                continue
            # the residual due to all the other atoms but k
            Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k])
            U, S, V = randomized_svd(Rk, n_components=1, n_iter=10, flip_sign=False)
            D[:, k] = U[:, 0]
            X[k, omega_k] = V[0, :] * S[0]
            # update the residual
            R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k])
    print("")
    return D, X, unused_atoms
def sparse_dense(summary):
    text_copy = copy.deepcopy(summary)
    """
    Find a suitable value for the hyperparameter: some random value like 0.5,
    or based on some heuristic like (rank of original matrix / 10), or
    (max_singular_value of the original matrix / 20)
    """
    _, s, _ = randomized_svd(summary, 1, n_iter=5)
    hyperparameter = s[0] / 50
    term_document_matrix_rank = np.linalg.matrix_rank(summary)
    iterations = int(term_document_matrix_rank / 10)
    A_new = dense(text_copy, hyperparameter, 0.02, iterations)
    return A_new
def _init_svd(self, dictionary, definitions):
    self.td_matrix = lil_matrix((len(dictionary), self.n_terms))
    for defn, i in zip(definitions, range(len(definitions))):
        if i % 100 == 0:
            print("Building term-document matrix: {} / {}".format(i, len(dictionary)), end="\r")
        self.td_matrix[i, :] = self.compute_freq_vec(dictionary[defn])
    self.td_matrix = self.td_matrix.transpose().tocsr()
    print()
    for i in range(self.n_terms):
        n = float(self.td_matrix[i, :].getnnz())
        if i % 100 == 0:
            print("Applying td-idf: {} / {}".format(i, self.n_terms), end="\r")
        if n > 0:
            self.td_matrix[i, :] *= np.log(len(dictionary) / n)
    print()
    print("Performing rank reduction...")
    self.u, self.s, self.vt = randomized_svd(self.td_matrix, 50, transpose=False)
    self.doc_matrix = np.matmul(np.diag(self.s), self.vt).transpose()
def svd_timing(X, n_comps, n_iter, n_oversamples,
               power_iteration_normalizer='auto', method=None):
    """
    Measure time for decomposition
    """
    print("... running SVD ...")
    if method != 'fbpca':
        gc.collect()
        t0 = time()
        U, mu, V = randomized_svd(X, n_comps, n_oversamples, n_iter,
                                  power_iteration_normalizer,
                                  random_state=random_state, transpose=False)
        call_time = time() - t0
    else:
        gc.collect()
        t0 = time()
        # There is a different convention for l here
        U, mu, V = fbpca.pca(X, n_comps, raw=True, n_iter=n_iter,
                             l=n_oversamples + n_comps)
        call_time = time() - t0
    return U, mu, V, call_time
def test_svd(eng):
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.utils.extmath import randomized_svd
    u1, s1, v1 = randomized_svd(x.toarray(), n_components=2, random_state=0)

    u2, s2, v2 = SVD(k=2, method='direct').fit(x)
    assert allclose_sign(u1, u2)
    assert allclose(s1, s2)
    assert allclose_sign(v1.T, v2.T)

    u2, s2, v2 = SVD(k=2, method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(u1, u2, atol=tol)
    assert allclose(s1, s2, atol=tol)
    assert allclose_sign(v1.T, v2.T, atol=tol)
def gsvd(X, M, A, n_comps=10):
    """
    Generalized SVD
    :param X:
    :param M:
    :param A:
    :return:
    """
    print("GSVD")
    print("GSVD: Weights... ", end='')
    Xw = np.dot(np.sqrt(M), np.dot(X, np.sqrt(A)))
    print("Done!")

    print("GSVD: SVD... ", end='')
    [P_, D, Q_] = randomized_svd(Xw, n_comps)
    #P_ = P_[:,0:n_comps]
    #D = D[0:n_comps]
    #Q_ = Q_[0:n_comps,:]
    print('Done!')

    print("GSVD: Factor scores and eigenvalues... ", end='')
    Mp = np.power(np.diag(M), -0.5)
    Ap = np.power(np.diag(A), -0.5)
    P = np.dot(np.diag(Mp), P_)
    Q = np.dot(np.diag(Ap), Q_.T)
    ev = np.power(D, 2)
    print('Done!')
    return P, D, Q, ev
def randomizedSVD(self):
    # http://scikit-learn.org/stable/modules/decomposition.html#truncated-singular-value-decomposition-and-latent-semantic-analysis
    # http://stackoverflow.com/questions/31523575/get-u-sigma-v-matrix-from-truncated-svd-in-scikit-learn
    U, S, V = randomized_svd(self.bag_of_words_matrix.T,
                             n_components=self.dimensions,
                             n_iter=5,
                             random_state=None)
    self.U = U
    self.S = S
    self.V = V
    self.tokens_representation = np.matrix(U) * np.diag(S)
    self.documents_representation = (np.diag(S) * np.matrix(V)).T
def _svd(self, X, max_rank=None):
    if max_rank:
        # if we have a max rank then perform the faster randomized SVD
        return randomized_svd(
            X,
            max_rank,
            n_iter=self.n_power_iterations)
    else:
        # perform a full rank SVD using ARPACK
        return np.linalg.svd(
            X,
            full_matrices=False,
            compute_uv=True)
def __init__(self, X, kern, M):
    super(SVD, self).__init__("SVD")
    start = time.time()
    self.X = X
    self.kern = kern
    K = kern.K(X, X)
    N = np.shape(X)[0]
    #(self.U, self.Sigma, self.VT) = fb.pca(K, M)#, n_iter=1, l=M)
    self.U, self.Sigma, self.VT = randomized_svd(K, M)
    self.precon = np.dot(self.U, np.dot(np.diag(self.Sigma), self.VT)) + self.kern.noise * np.identity(N)
    self.duration = time.time() - start
def apply_uv_decomposition(self):
    U, Sigma, VT = randomized_svd(self.behaviour_matrix,
                                  n_components=15,
                                  n_iter=10,
                                  random_state=None)
    print(U.shape)
    print(VT.shape)
    self.X_hat = np.dot(U, VT)  # U * np.diag(Sigma)
def fit(self, train_input, train):
    U, sigma, VT = randomized_svd(train, self.nfactor)
    sigma = scipy.sparse.diags(sigma, 0)
    self.U = U * sigma
    self.V = VT.T
def compute_bench(samples_range, features_range, n_iter=3, rank=50):
    it = 0
    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples, n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            gc.collect()
            print("benchmarking scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=0")
            tstart = time()
            randomized_svd(X, rank, n_iter=0)
            results['scikit-learn randomized_svd (n_iter=0)'].append(
                time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=%d "
                  % n_iter)
            tstart = time()
            randomized_svd(X, rank, n_iter=n_iter)
            results['scikit-learn randomized_svd (n_iter=%d)'
                    % n_iter].append(time() - tstart)

    return results
def test_randomized_svd_low_rank():
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        assert_equal(Ua.shape, (n_samples, k))
        assert_equal(sa.shape, (k,))
        assert_equal(Va.shape, (k, n_features))

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        assert_almost_equal(s[:rank], sa[:rank])
def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structure approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.1,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)

        # the approximation does not tolerate the noise:
        assert_greater(np.abs(s[:k] - sa).max(), 0.01)

        # compute the singular values of X using the fast approximate
        # method with iterated power method
        _, sap, _ = randomized_svd(X, k,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer)

        # the approximation does not tolerate the noise:
        assert_greater(np.abs(s[:k] - sa).max(), 0.1)

        # compute the singular values of X using the fast approximate method
        # with iterated power method
        _, sap, _ = randomized_svd(X, k, n_iter=5,
                                   power_iteration_normalizer=normalizer)

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalizer='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, V = randomized_svd(X, n_components, n_iter=2,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_2 = linalg.norm(A, ord='fro')
    U, s, V = randomized_svd(X, n_components, n_iter=20,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_20 = linalg.norm(A, ord='fro')
    assert_greater(np.abs(error_2 - error_20), 100)

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, V = randomized_svd(X, n_components, n_iter=2,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, V = randomized_svd(X, n_components, n_iter=i,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            A = X - U.dot(np.diag(s).dot(V))
            error = linalg.norm(A, ord='fro')
            assert_greater(15, np.abs(error_2 - error))
def test_randomized_svd_sign_flip():
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        assert_almost_equal(u1, u2)
        assert_almost_equal(v1, v2)
        assert_almost_equal(np.dot(u2 * s2, v2), a)
        assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
        assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
def test_randomized_svd_sign_flip_with_transpose():
    # Check if the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        returns bool tuple indicating if the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert_true(u_based)
    assert_false(v_based)

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True)
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose)
    assert_true(u_based)
    assert_false(v_based)
def sv_thresh(X, t, k):
    m, n = X.shape
    U, s, V = randomized_svd(X, k)  # pca(X, raw=True, k=25)
    # Number of singular values greater than `t`
    greater_sv = np.sum(s > t)
    s = soft_thresh(s, t)
    S = np.diag(s)
    ret = np.dot(U, np.dot(S, V))
    assert ret.shape == X.shape
    return ret, greater_sv
def _fit_local(self, mat):
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(mat, n_components=self.k,
                             n_iter=self.max_iter, random_state=self.seed)
    return U, S, V
def nn_ksvd(Y, D, X, n_cycles=1, verbose=True):
    # the non-negative variant
    n_atoms = D.shape[1]
    n_features, n_samples = Y.shape
    unused_atoms = []
    R = Y - fast_dot(D, X)

    for k in range(n_atoms):
        if verbose:
            sys.stdout.write("\r" + "k-svd..." + ":%3.2f%%" % ((k / float(n_atoms)) * 100))
            sys.stdout.flush()
        # find all the datapoints that use the kth atom
        omega_k = X[k, :] != 0
        if not np.any(omega_k):
            unused_atoms.append(k)
            continue
        # the residual due to all the other atoms but k
        Rk = R[:, omega_k] + np.outer(D[:, k], X[k, omega_k])
        try:
            U, S, V = randomized_svd(Rk, n_components=1, n_iter=50, flip_sign=False)
        except:
            warnings.warn('SVD error')
            continue
        d = U[:, 0]
        x = V[0, :] * S[0]

        # projection to the constraint set
        d[d < 0] = 0
        x[x < 0] = 0
        dTd = np.dot(d, d)
        xTx = np.dot(x, x)
        if dTd <= np.finfo('float').eps or xTx <= np.finfo('float').eps:
            continue

        for j in range(n_cycles):
            d = np.dot(Rk, x) / np.dot(x, x)
            d[d < 0] = 0
            x = np.dot(d.T, Rk) / np.dot(d, d)
            x[x < 0] = 0

        _norm = norm(d)
        d = d / _norm
        x = x * _norm

        D[:, k] = d
        X[k, omega_k] = x

        # update the residual
        R[:, omega_k] = Rk - np.outer(D[:, k], X[k, omega_k])
    print("")
    return D, X, unused_atoms