The following 4 code examples, extracted from open-source Python projects, illustrate how to use sklearn.datasets.fetch_20newsgroups_vectorized().
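Before the extracted examples, a minimal sketch of the call itself may be useful (an illustration written for this page, not one of the four project snippets). The fetcher downloads the 20 newsgroups corpus on first use, caches it under data_home, and returns a Bunch whose data attribute is a sparse CSR matrix:

from sklearn.datasets import fetch_20newsgroups_vectorized

# The first call downloads and caches the corpus; later calls reuse the cache.
bunch = fetch_20newsgroups_vectorized(subset='train')
print(bunch.data.shape)        # (n_documents, n_features), scipy.sparse CSR
print(bunch.target.shape)      # (n_documents,), integer labels in 0..19
print(bunch.target_names[:3])  # topic names corresponding to the labels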
Example 1:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.utils import shuffle


def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == 'regression':
        # NOTE: load_boston was deprecated in scikit-learn 1.0 and removed
        # in 1.2; this branch only works on older releases.
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    # Fixed 80/20 train/test split after shuffling.
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    return data
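A hypothetical call of the helper above, keeping the text features sparse (the 80/20 split is fixed inside the function):

data = generate_data('classification', sparse=True)
print(data['X_train'].shape, data['X_test'].shape)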
Example 2:

import numpy as np
import scipy.sparse as sp
from sklearn import datasets
# Test helpers from older scikit-learn; sklearn.utils.testing was later
# deprecated and removed.
from sklearn.utils.testing import SkipTest, assert_true, assert_equal


def test_20news_vectorized():
    # This test is slow.
    raise SkipTest("Test too slow.")

    # Everything below is unreachable unless the raise above is removed.
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 107428))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 107428))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 107428))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
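Two things worth noting here: the unconditional raise SkipTest on the first line means the assertions never execute unless it is removed, and the expected shapes (11314 train documents, 7532 test documents, 107428 features) are hard-coded for the scikit-learn version this test was written against; newer releases use different vectorizer defaults, so the feature count may differ.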
Example 3:

import sklearn.datasets as sk_dt

# Dataset, Datasets, to_one_hot_enc, redivide_data and SCIKIT_LEARN_DATA are
# helpers/constants defined in the surrounding project, not in scikit-learn.


def load_20newsgroup_vectorized(folder=SCIKIT_LEARN_DATA, one_hot=True,
                                partitions_proportions=None, shuffle=False,
                                binary_problem=False, as_tensor=True,
                                minus_value=-1.):
    data_train = sk_dt.fetch_20newsgroups_vectorized(data_home=folder,
                                                     subset='train')
    data_test = sk_dt.fetch_20newsgroups_vectorized(data_home=folder,
                                                    subset='test')
    X_train = data_train.data
    X_test = data_test.data
    y_train = data_train.target
    y_test = data_test.target
    if binary_problem:
        # Collapse the 20 topics into two classes: 0-9 vs. 10-19.
        y_train[data_train.target < 10] = minus_value
        y_train[data_train.target >= 10] = 1.
        y_test[data_test.target < 10] = minus_value
        y_test[data_test.target >= 10] = 1.
    if one_hot:
        y_train = to_one_hot_enc(y_train)
        y_test = to_one_hot_enc(y_test)
    # if shuffle and sk_shuffle:
    #     xtr = X_train.tocoo()
    #     xts = X_test.tocoo()
    d_train = Dataset(data=X_train, target=y_train,
                      info={'target names': data_train.target_names})
    d_test = Dataset(data=X_test, target=y_test,
                     info={'target names': data_train.target_names})
    res = [d_train, d_test]
    if partitions_proportions:
        res = redivide_data([d_train, d_test],
                            partition_proportions=partitions_proportions,
                            shuffle=False)
    if as_tensor:
        [dat.convert_to_tensor() for dat in res]
    return Datasets.from_list(res)
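Design note: with binary_problem=True the 20 topics are split into two halves (labels 0-9 become minus_value, 10-19 become 1.). Since y_train aliases data_train.target, the second mask is computed on the already-modified array; the relabelling is only safe while minus_value stays below 10. Because Dataset, Datasets, to_one_hot_enc and redivide_data are project-local, this snippet is not runnable against scikit-learn alone.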
Example 4:

import numpy as np
from sklearn import linear_model, cross_validation
from sklearn.datasets import fetch_20newsgroups_vectorized

# LogisticRegressionCV here is the surrounding project's own estimator, not
# sklearn.linear_model.LogisticRegressionCV. Note that sklearn.cross_validation
# and linear_model.logistic._logistic_loss are APIs of older scikit-learn
# releases.


def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target
    # Binarize the 20 labels around their mean (below -> -1, above -> 1).
    y[y < y.mean()] = -1
    y[y >= y.mean()] = 1
    Xt, Xh, yt, yh = cross_validation.train_test_split(
        X, y, test_size=.5, random_state=0)

    # Compute the held-out loss on a grid of regularization strengths.
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(
            solver='lbfgs', C=np.exp(-a), fit_intercept=False,
            tol=1e-6, max_iter=100)
        lr.fit(Xt, yt)
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)
    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(verbose=True)
    clf.fit(Xt, yt, Xh, yh)
    assert np.abs(clf.alpha_ - best_alpha) < 0.5
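The final assertion checks that the tuner's chosen regularization strength (clf.alpha_) lands within 0.5 of the best value found by the five-point grid search over the held-out logistic loss. The fit(Xt, yt, Xh, yh) signature, which takes the validation split explicitly, belongs to the project's LogisticRegressionCV rather than to scikit-learn's estimator of the same name.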