Python sklearn.datasets 模块，make_regression() 实例源码

我们从Python开源项目中，提取了以下37个代码示例，用于说明如何使用sklearn.datasets.make_regression()。

项目：stacker 作者：bamine | 项目源码 | 文件源码

def setUp(self):
        """Build the shared fixtures: datasets, optimizers, scorers and tasks.

        Classification tasks are built from the ``make_classification`` data
        and regression tasks from the ``make_regression`` data, each once with
        a hold-out split (``test_size=0.1``) and once with 5-fold CV.
        """
        # os.putenv changes the process environment without updating os.environ.
        # NOTE(review): presumably this works around the duplicate-OpenMP-runtime
        # abort -- confirm it is still needed.
        os.putenv("KMP_DUPLICATE_LIB_OK", "TRUE")
        self.X_class, self.y_class = datasets.make_classification(random_state=42)
        self.X_reg, self.y_reg = datasets.make_regression(random_state=42)
        self.classification_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
        self.regression_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
        # NOTE(review): roc_auc_score expects (y_true, y_score); the lambda's
        # parameter names suggest the opposite order -- confirm how Scorer
        # invokes its callable.
        self.class_scorer = Scorer("auc_error", lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
        self.reg_scorer = Scorer("mse", metrics.mean_squared_error)

        self.classification_task_split = \
            Task("class_split", self.X_class, self.y_class, "classification", test_size=0.1, random_state=42)
        # Regression tasks must be fed the regression data (was X_class/y_class).
        self.regression_task_split = \
            Task("reg_split", self.X_reg, self.y_reg, "regression", test_size=0.1, random_state=42)

        # Classification tasks must be fed the classification data (was X_reg/y_reg).
        self.classification_task_cv = \
            Task("class_cv", self.X_class, self.y_class, "classification", cv=5, random_state=42)
        self.regression_task_cv = \
            Task("reg_cv", self.X_reg, self.y_reg, "regression", cv=5, random_state=42)

项目：pyglmnet 作者：glm-tools | 项目源码 | 文件源码

def test_cv():
    """Smoke-test GLM with scikit-learn's cross-validation helpers."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    folds = KFold(X.shape[0], 5)

    estimator = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)

    # One score is expected per fold.
    fold_scores = cross_val_score(estimator, X, y, cv=folds)
    assert_equal(len(fold_scores), 5)

    # Two independent grids: one over alpha, one over reg_lambda
    # (10 values spaced on a natural-log scale from 0.5 down to 0.01).
    alpha_grid = {'alpha': np.linspace(0.01, 0.99, 2)}
    lambda_grid = {
        'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), 10,
                                  base=np.exp(1)),
    }
    search = GridSearchCV(estimator, [alpha_grid, lambda_grid], cv=folds)
    search.fit(X, y)

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def test_min_samples_split():
    """Mondrian trees fitted in two halves must honour min_samples_split.

    For each threshold, every node that has a left child must hold at least
    ``min_samples_split`` samples, for both the regressor and the classifier.
    """
    X_c, y_c = load_digits(return_X_y=True)
    X_r, y_r = make_regression(n_samples=10000, random_state=0)

    # (estimator class, features, targets), checked in this order per threshold.
    cases = (
        (MondrianTreeRegressor, X_r, y_r),
        (MondrianTreeClassifier, X_c, y_c),
    )

    for mss in [2, 4, 10, 20]:
        for estimator_cls, X, y in cases:
            half = X.shape[0] // 2
            est = estimator_cls(random_state=0, min_samples_split=mss)
            est.partial_fit(X[:half], y[:half])
            est.partial_fit(X[half:], y[half:])

            has_left_child = est.tree_.children_left != -1
            split_node_counts = est.tree_.n_node_samples[has_left_child]
            assert_greater(np.min(split_node_counts) + 1, mss)

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def test_min_samples_split():
    """Every tree of a Mondrian forest must honour min_samples_split.

    The forests are fitted in two halves via ``partial_fit``; afterwards each
    estimator's nodes that have a left child must hold at least
    ``min_samples_split`` samples.
    """
    X_c, y_c = load_digits(return_X_y=True)
    X_r, y_r = make_regression(n_samples=10000, random_state=0)

    # (forest class, features, targets), checked in this order per threshold.
    cases = (
        (MondrianForestRegressor, X_r, y_r),
        (MondrianForestClassifier, X_c, y_c),
    )

    for mss in [2, 4, 10, 20]:
        for forest_cls, X, y in cases:
            half = X.shape[0] // 2
            forest = forest_cls(random_state=0, min_samples_split=mss)
            forest.partial_fit(X[:half], y[:half])
            forest.partial_fit(X[half:], y[half:])

            for member in forest.estimators_:
                has_left_child = member.tree_.children_left != -1
                split_node_counts = member.tree_.n_node_samples[has_left_child]
                assert_greater(np.min(split_node_counts) + 1, mss)

项目：MLAlgorithms 作者：rushter | 项目源码 | 文件源码

def regression():
    """Train a small dense network on synthetic data and print its test MSE."""
    # Generate a random regression problem
    features, targets = make_regression(
        n_samples=5000,
        n_features=25,
        n_informative=25,
        n_targets=1,
        random_state=100,
        noise=0.05,
    )
    # Scale the targets down in place before splitting.
    targets *= 0.01
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.1, random_state=1111)

    # Two linear hidden layers followed by a single output unit.
    stack = [
        Dense(64, Parameters(init='normal')),
        Activation('linear'),
        Dense(32, Parameters(init='normal')),
        Activation('linear'),
        Dense(1),
    ]
    net = NeuralNet(
        layers=stack,
        loss='mse',
        optimizer=Adam(),
        metric='mse',
        batch_size=256,
        max_epochs=15,
    )
    net.fit(X_train, y_train)

    test_predictions = net.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions.flatten())
    print("regression mse", test_mse)

项目：yellowbrick 作者：DistrictDataLabs | 项目源码 | 文件源码

def test_get_errors_param(self):
        """
        Test known models we can get the cv errors for alpha selection
        """

        # Test original CV models
        for model_cls in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
            # Keep the estimator class and the visualizer under separate names
            # so the failure message can always identify the model, even when
            # the visualizer was never successfully constructed.
            try:
                visualizer = AlphaSelection(model_cls())

                X, y = make_regression()
                visualizer.fit(X, y)

                errors = visualizer._find_errors_param()
                self.assertTrue(len(errors) > 0)
            except YellowbrickValueError:
                self.fail(
                    "could not find errors on {}".format(model_cls.__name__)
                )

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_cross_val_score_with_score_func_regression():
    """Check cross_val_score against reference values for several scorers."""
    X, y = make_regression(
        n_samples=30, n_features=20, n_informative=5, random_state=0)
    ridge = Ridge()

    # Per-fold reference shared by the default, R2 and explained-variance checks.
    reference = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cross_val_score(ridge, X, y, cv=5)
    assert_array_almost_equal(default_scores, reference, 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_per_fold = cross_val_score(ridge, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_per_fold, reference, 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_per_fold = cross_val_score(
        ridge, X, y, cv=5, scoring="mean_squared_error")
    mse_reference = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_per_fold, mse_reference, 2)

    # Explained variance
    ev_scorer = make_scorer(explained_variance_score)
    ev_per_fold = cross_val_score(ridge, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_per_fold, reference, 2)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_cross_val_score_with_score_func_regression():
    """Check cval.cross_val_score against reference values for several scorers."""
    X, y = make_regression(
        n_samples=30, n_features=20, n_informative=5, random_state=0)
    ridge = Ridge()

    # Per-fold reference shared by the default, R2 and explained-variance checks.
    reference = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cval.cross_val_score(ridge, X, y, cv=5)
    assert_array_almost_equal(default_scores, reference, 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_per_fold = cval.cross_val_score(ridge, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_per_fold, reference, 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_per_fold = cval.cross_val_score(
        ridge, X, y, cv=5, scoring="mean_squared_error")
    mse_reference = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_per_fold, mse_reference, 2)

    # Explained variance
    ev_scorer = make_scorer(explained_variance_score)
    ev_per_fold = cval.cross_val_score(ridge, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_per_fold, reference, 2)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_multi_target_regression():
    """MultiOutputRegressor must match regressors fitted one target at a time."""
    X, y = datasets.make_regression(n_targets=3)
    X_train, X_test = X[:50], X[50:]
    y_train, y_test = y[:50], y[50:]

    # Reference predictions: one independent regressor per target column.
    expected = np.zeros_like(y_test)
    for target_idx in range(3):
        single = GradientBoostingRegressor(random_state=0)
        single.fit(X_train, y_train[:, target_idx])
        expected[:, target_idx] = single.predict(X_test)

    multi = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    multi.fit(X_train, y_train)
    predicted = multi.predict(X_test)

    assert_almost_equal(expected, predicted)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_ridge_fit_intercept_sparse():
    """Ridge with fit_intercept must agree on dense and CSR input."""
    X, y = make_regression(
        n_samples=1000, n_features=2, n_informative=2, bias=10.,
        random_state=42)
    X_csr = sp.csr_matrix(X)

    sag_options = dict(alpha=1., tol=1.e-15, solver='sag', fit_intercept=True)
    dense = Ridge(**sag_options)
    sparse = Ridge(**sag_options)
    dense.fit(X, y)
    sparse.fit(X_csr, y)
    assert_almost_equal(dense.intercept_, sparse.intercept_)
    assert_array_almost_equal(dense.coef_, sparse.coef_)

    # test the solver switch and the corresponding warning
    sparse = Ridge(alpha=1., tol=1.e-15, solver='lsqr', fit_intercept=True)
    assert_warns(UserWarning, sparse.fit, X_csr, y)
    assert_almost_equal(dense.intercept_, sparse.intercept_)
    assert_array_almost_equal(dense.coef_, sparse.coef_)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_make_regression():
    """Check shapes, sparsity of coefficients and noise level of make_regression."""
    X, y, coefs = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=3,
        effective_rank=5,
        coef=True,
        bias=0.0,
        noise=1.0,
        random_state=0,
    )

    assert_equal(X.shape, (100, 10), "X shape mismatch")
    assert_equal(y.shape, (100,), "y shape mismatch")
    assert_equal(coefs.shape, (10,), "coef shape mismatch")
    assert_equal(sum(coefs != 0.0), 3,
                 "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    residuals = y - np.dot(X, coefs)
    assert_almost_equal(np.std(residuals), 1.0, decimal=1)

    # Test with small number of features.
    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
    assert_equal(X.shape, (100, 1))

项目：base_function 作者：Rockyzsu | 项目源码 | 文件源码

def lession_5():
    """Scatter-plot a noisy single-feature synthetic regression dataset."""
    # A disabled earlier variant loaded the Boston dataset, fitted a
    # LinearRegression on it and printed the first 8 predictions and targets.
    features, target = datasets.make_regression(
        n_samples=100,
        n_features=1,
        n_targets=1,
        noise=10,
    )

    plt.scatter(features, target)
    plt.show()

项目：microbiome-summer-school-2017 作者：aldro61 | 项目源码 | 文件源码

def make_regression_example(axis, random_state):
    """Draw a one-feature regression example with a fitted LinearSVR.

    Generates 100 noisy samples, scatters them on ``axis``, overlays the
    predictions of a ``LinearSVR`` fitted on the same data, then hides the
    tick labels and sets axis labels and a legend.

    :param axis: matplotlib axes to draw on.
    :param random_state: seed forwarded to ``make_regression``.
    """
    X, y = make_regression(n_samples=100, n_features=1, noise=30.0, random_state=random_state)

    axis.scatter(X[:, 0], y, color="blue", s=10, label="Patients")

    clf = LinearSVR().fit(X, y)
    axis.plot(X[:, 0], clf.predict(X), color="black", label="Model")

    # Style the axes that were passed in (the original touched an unrelated
    # ``ax2`` name). Booleans are used because the 'off' string is no longer
    # interpreted as False by current matplotlib.
    axis.tick_params(labelbottom=False, labelleft=False)
    axis.set_xlabel("Gene 1")
    axis.set_ylabel("Survived (years)")
    axis.legend()

项目：ML-From-Scratch 作者：eriklindernoren | 项目源码 | 文件源码

def main():
    """Fit LinearRegression on synthetic data, print the test MSE and plot.

    Two figures are shown: the model's recorded training errors against the
    iteration index, then the train/test points with the fitted line.
    """
    X, y = make_regression(n_samples=100, n_features=1, noise=20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    model = LinearRegression(n_iterations=100)
    model.fit(X_train, y_train)

    # Training error plot
    n = len(model.training_errors)
    training, = plt.plot(range(n), model.training_errors, label="Training Error")
    plt.legend(handles=[training])
    plt.title("Error Plot")
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Iterations')
    plt.show()

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Mean squared error: %s" % (mse))

    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    # Plot the results
    # NOTE(review): the 366 factor presumably stretches the feature onto a
    # day-of-year scale to match the 'Day' label -- confirm.
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    plt.plot(366 * X, y_pred_line, color='black', linewidth=2, label="Prediction")
    plt.suptitle("Linear Regression")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2), ("Training data", "Test data"), loc='lower right')
    plt.show()

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def test_tau():
    """
    Test time of split for the root.

    Over 100 seeds, the mean root ``tau`` of a depth-1 estimator should be
    close to the inverse of the summed per-feature ranges of ``X``.
    """
    X, y = make_regression(random_state=0, n_features=10)
    y = np.round(y)
    feature_ranges = np.max(X, axis=0) - np.min(X, axis=0)
    rate = np.sum(feature_ranges)

    for est in estimators:
        stump = est.set_params(max_depth=1)
        root_taus = []
        for seed in np.arange(100):
            stump.set_params(random_state=seed).fit(X, y)
            root_taus.append(stump.tree_.tau[0])
        assert_almost_equal(np.mean(root_taus), 1.0 / rate, 2)

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def test_mondrian_tree_n_node_samples():
    """A tree fitted on two samples must report node sample counts [1, 1, 2]."""
    expected_counts = [1, 1, 2]
    for seed in range(1000):
        X, y = make_regression(n_samples=2, random_state=seed)
        tree = MondrianTreeRegressor(random_state=0)
        tree.partial_fit(X, y)
        assert_array_equal(tree.tree_.n_node_samples, expected_counts)

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def test_partial_fit_equivalence():
    """Run check_partial_fit_equivalence over several batch sizes.

    Covers a regressor and a classifier, each first fitted on the full data
    with a single ``partial_fit`` call.
    """
    batch_sizes = [10, 20, 25, 50, 90]

    X, y = make_regression(random_state=0, n_samples=100)
    regressor = MondrianTreeRegressor(random_state=0)
    regressor.partial_fit(X, y)
    for size in batch_sizes:
        check_partial_fit_equivalence(size, regressor, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    classifier = MondrianTreeClassifier(random_state=0)
    classifier.partial_fit(X, y)
    for size in batch_sizes:
        check_partial_fit_equivalence(size, classifier, 0, X, y, is_clf=True)

项目：scikit-garden 作者：scikit-garden | 项目源码 | 文件源码

def test_partial_fit_n_samples_1000():
    """Run check_online_fit on fresh Mondrian trees with two batch sizes each."""
    # Classifier on the digits data: batch sizes 20, then 100.
    X, y = load_digits(return_X_y=True)
    for batch in (20, 100):
        classifier = MondrianTreeClassifier(random_state=0)
        check_online_fit(classifier, X, y, batch)

    # Regressor on synthetic data: batch sizes 100, then 20.
    X, y = make_regression(random_state=0, n_samples=10000)
    for batch in (100, 20):
        regressor = MondrianTreeRegressor(random_state=0)
        check_online_fit(regressor, X, y, batch, is_clf=False)

项目：extreme-learning-machines 作者：IssamLaradji | 项目源码 | 文件源码

def test_multioutput_regression():
    """Test whether multi-output regression works as expected."""
    X, y = make_regression(
        n_samples=200, n_targets=5, random_state=random_state)

    # Each activation must reach a training score above 0.95.
    for activation in ACTIVATION_TYPES:
        regressor = ELMRegressor(
            n_hidden=300, activation=activation, random_state=random_state)
        regressor.fit(X, y)
        training_score = regressor.score(X, y)
        assert_greater(training_score, 0.95)

项目：STK-INF4000-templates 作者：dhesse | 项目源码 | 文件源码

def test_known_values(self):
        """relevant_features must match the mask of non-zero true coefficients."""
        from sklearn.datasets import make_regression
        # n_informative is passed by keyword: current scikit-learn releases
        # accept only n_samples and n_features positionally.
        X, y, coef = make_regression(
            n_samples=200, n_features=15, n_informative=15, coef=True)
        np.testing.assert_equal(relevant_features(X, y),
                                coef != 0.0)

项目：MLAlgorithms 作者：rushter | 项目源码 | 文件源码

def regression():
    """Fit a 5-nearest-neighbour regressor on synthetic data and print test MSE."""
    # Generate a random regression problem
    features, targets = make_regression(
        n_samples=500,
        n_features=5,
        n_informative=5,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.25, random_state=1111)

    knn_model = knn.KNNRegressor(k=5, distance_func=distance.euclidean)
    knn_model.fit(X_train, y_train)

    test_predictions = knn_model.predict(X_test)
    print('regression mse', mean_squared_error(y_test, test_predictions))

项目：MLAlgorithms 作者：rushter | 项目源码 | 文件源码

def regression():
    """Fit a gradient boosting regressor on synthetic data and print test MSE."""
    # Generate a random regression problem
    features, targets = make_regression(
        n_samples=500,
        n_features=5,
        n_informative=5,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.1, random_state=1111)

    booster = GradientBoostingRegressor(
        n_estimators=25, max_depth=5, max_features=3)
    booster.fit(X_train, y_train)

    test_predictions = booster.predict(X_test)
    test_mse = mean_squared_error(y_test.flatten(), test_predictions.flatten())
    print('regression, mse: %s' % test_mse)

项目：MLAlgorithms 作者：rushter | 项目源码 | 文件源码

def regression():
    """Fit an L2-penalised linear regression on synthetic data and print test MSE."""
    # Generate a random regression problem
    features, targets = make_regression(
        n_samples=10000,
        n_features=100,
        n_informative=75,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.25, random_state=1111)

    linear_model = LinearRegression(
        lr=0.01, max_iters=2000, penalty='l2', C=0.03)
    linear_model.fit(X_train, y_train)

    test_predictions = linear_model.predict(X_test)
    print('regression mse', mean_squared_error(y_test, test_predictions))

项目：MLAlgorithms 作者：rushter | 项目源码 | 文件源码

def regression():
    """Fit a random forest regressor on synthetic data and print test MSE."""
    # Generate a random regression problem
    features, targets = make_regression(
        n_samples=500,
        n_features=5,
        n_informative=5,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.1, random_state=1111)

    forest = RandomForestRegressor(
        n_estimators=50, max_depth=10, max_features=3)
    forest.fit(X_train, y_train)

    test_predictions = forest.predict(X_test)
    test_mse = mean_squared_error(y_test.flatten(), test_predictions.flatten())
    print('regression, mse: %s' % test_mse)

项目：yellowbrick 作者：DistrictDataLabs | 项目源码 | 文件源码

def test_get_alphas_param_lassolars(self):
        """
        Assert that we can get alphas from lasso lars.
        """
        features, targets = make_regression()
        visualizer = AlphaSelection(LassoLarsCV())
        visualizer.fit(features, targets)
        try:
            found_alphas = visualizer._find_alphas_param()
        except YellowbrickValueError:
            self.fail("could not find alphas on {}".format(visualizer.name))
        else:
            # At least one alpha must have been discovered.
            self.assertTrue(len(found_alphas) > 0)

项目：snape 作者：mbernico | 项目源码 | 文件源码

def create_regression_dataset(n_samples, n_features, n_informative, effective_rank, tail_strength,
                              noise, random_state=None):
    """
    Build a single-target synthetic regression dataset as a data frame.

    The features come from ``make_regression`` (always with ``n_targets=1``),
    are wrapped in a DataFrame whose columns are passed through
    ``rename_columns``, and the target is appended as column ``'y'``.

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param effective_rank: approximate number of singular vectors required to explain data
    :param tail_strength: relative importance of the fat noisy tail of the singular values profile
    :param noise: standard deviation of the gaussian noise applied to the output
    :param random_state: seed or state, normalised by ``get_random_state`` before use
    :return: the requested dataframe
    """
    rng = get_random_state(random_state)
    features, target = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_targets=1,
        effective_rank=effective_rank,
        tail_strength=tail_strength,
        noise=noise,
        random_state=rng,
    )

    # Wrap the features, normalise the column names, then attach the target.
    frame = rename_columns(pd.DataFrame(features))
    frame['y'] = target
    return frame

项目：cartesian 作者：Ohjeah | 项目源码 | 文件源码

def test_Symbolic_fit(n_out):
    """Predictions of a fitted Symbolic must have the same shape as the targets."""
    features, targets = make_regression(
        n_features=2, n_informative=1, n_targets=n_out)
    fitted = Symbolic(max_nfev=1, lambda_=1).fit(features, targets)
    predictions = fitted.predict(features)
    assert predictions.shape == targets.shape

项目：cartesian 作者：Ohjeah | 项目源码 | 文件源码

def test_Symbolic_joblib():
    """Symbolic with n_jobs=-1 must still predict with the targets' shape."""
    features, targets = make_regression(
        n_features=2, n_informative=1, n_targets=1)
    estimator = Symbolic(n_jobs=-1, max_nfev=1, lambda_=1)
    predictions = estimator.fit(features, targets).predict(features)
    assert predictions.shape == targets.shape

项目：skorch 作者：dnouri | 项目源码 | 文件源码

def data(self):
        """Return standardised float32 features and a standardised column target."""
        features, target = make_regression(
            n_samples=1000, n_features=20, n_informative=10, bias=0,
            random_state=0)
        features = features.astype(np.float32)
        # The target becomes a single column so it can be scaled like a matrix.
        target = target.astype(np.float32).reshape(-1, 1)
        scaled_features = StandardScaler().fit_transform(features)
        scaled_target = StandardScaler().fit_transform(target)
        return scaled_features, scaled_target

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_multi_target_regression_one_target():
    """Fitting MultiOutputRegressor on a single target must raise ValueError."""
    # Test multi target regression raises
    X, y = datasets.make_regression(n_targets=1)
    X_train, X_test = X[:50], X[50:]
    y_train, y_test = y[:50], y[50:]

    wrapper = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    assert_raises(ValueError, wrapper.fit, X_train, y_train)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def make_regression_with_outliers(n_samples=50, n_features=20):
    """Return a regression dataset in which some feature rows are pure noise.

    ``int(0.1 * n_samples)`` row indices are drawn with a fixed-seed
    generator and those rows of ``X`` are overwritten with scaled Gaussian
    noise; ``y`` is left untouched.
    """
    # Generate data with outliers by replacing 10% of the samples with noise.
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           random_state=0, noise=0.05)

    rng = np.random.RandomState(0)
    n_outliers = int(0.1 * n_samples)
    # Indices first, then the noise block: the draw order fixes the result.
    outlier_rows = rng.randint(0, n_samples, n_outliers)
    noise_block = rng.normal(0, 1, (n_outliers, X.shape[1]))
    X[outlier_rows, :] = 2.0 * noise_block
    return X, y

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_make_regression_multitarget():
    """Check shapes, coefficient sparsity and noise for a 3-target dataset."""
    X, y, coefs = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_targets=3,
        coef=True,
        noise=1.,
        random_state=0,
    )

    assert_equal(X.shape, (100, 10), "X shape mismatch")
    assert_equal(y.shape, (100, 3), "y shape mismatch")
    assert_equal(coefs.shape, (10, 3), "coef shape mismatch")
    assert_array_equal(sum(coefs != 0.0), 3,
                       "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    residuals = y - np.dot(X, coefs)
    assert_almost_equal(np.std(residuals), 1.0, decimal=1)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_mse_solving():
    """The mean predicted MSE of the fitted GaussianProcess must stay below 1000."""
    # test the MSE estimate to be sane.
    # non-regression test for ignoring off-diagonals of feature covariance,
    # testing with nugget that renders covariance useless, only
    # using the mean function, with low effective rank of data
    process = GaussianProcess(
        corr='absolute_exponential',
        theta0=1e-4,
        thetaL=1e-12,
        thetaU=1e-2,
        nugget=1e-2,
        optimizer='Welch',
        regr="linear",
        random_state=0,
    )

    X, y = make_regression(
        n_informative=3, n_features=60, noise=50, random_state=0,
        effective_rank=1)

    process.fit(X, y)
    predicted_mse = process.predict(X, eval_MSE=True)[1]
    assert_greater(1000, predicted_mse.mean())

项目：karura 作者：chakki-works | 项目源码 | 文件源码

def test_insight_regression(self):
        """ModelSelectionInsight must yield a positive score on regression data."""
        candidates = 4
        X, y = make_regression(
            n_samples=1000, n_features=15, n_informative=candidates,
            n_targets=1)

        # Features become c_0..c_14 and the target goes in the last column.
        column_names = ["c_{}".format(i) for i in range(X.shape[1])] + ["target"]
        table = np.hstack((X, y.reshape([-1, 1])))
        df = pd.DataFrame(table, columns=column_names)
        dfe = DataFrameExtension(df, numericals=["target"], target="target")

        insight = ModelSelectionInsight()
        insight.adopt(dfe)

        self.assertTrue(insight.score > 0)
        print(insight.score)

项目：karura 作者：chakki-works | 项目源码 | 文件源码

def test_insight_regression(self):
        """FeatureSelectionInsight must keep a plausible number of features."""
        candidates = 4
        X, y = make_regression(
            n_samples=1000, n_features=15, n_informative=candidates,
            n_targets=1)

        # Features become c_0..c_14 and the target goes in the last column.
        column_names = ["c_{}".format(i) for i in range(X.shape[1])] + ["target"]
        table = np.hstack((X, y.reshape([-1, 1])))
        df = pd.DataFrame(table, columns=column_names)
        dfe = DataFrameExtension(df, numericals=["target"], target="target")

        insight = FeatureSelectionInsight()
        insight.adopt(dfe)

        print("selected regressor features {}".format(dfe.ftypes.keys()))
        kept_features = len(dfe.ftypes) - 1  # -1 is target ftype
        self.assertTrue(candidates <= kept_features < candidates * 2)

项目：fri 作者：lpfann | 项目源码 | 文件源码

def genRegressionData(n_samples: int = 100, n_features: int = 2, n_redundant: int = 0, strRel: int = 1,
                      n_repeated: int = 0, noise: float = 1, random_state: object = None,
                      partition = None) -> object:
    """Generate synthetic regression data

    Parameters
    ----------
    n_samples : int, optional
        Number of samples
    n_features : int, optional
        Number of features
    n_redundant : int, optional
        Number of features which are part of redundant subsets (weakly relevant)
    strRel : int, optional
        Number of features which are mandatory for the underlying model (strongly relevant)
    n_repeated : int, optional
        Number of features which are clones of existing ones.
    noise : float, optional
        Noise of the created samples around ground truth.
    random_state : object, optional
        Randomstate object used for generation.
    partition : list of int, optional
        Sizes of the weakly relevant subsets. When None, the legacy default
        of ``int(n_redundant / 2)`` subsets of size 2 is used.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.
    y : array of shape [n_samples]
        The output values (target).

    Raises
    ------
    ValueError
    Wrong parameters for specified amount of features/samples.
    """ 

    # Both helpers below receive every local variable by keyword through
    # locals(); renaming or adding locals in this function changes what they
    # are called with.
    _checkParam(**locals())
    random_state = check_random_state(random_state)

    # Placeholder matrix; it is part of locals() handed to _fillVariableSpace,
    # whose result replaces it below.
    X = np.zeros((int(n_samples), int(n_features)))

    # Find partitions which define the weakly relevant subsets
    if partition is None:
        # Legacy behaviour yielding subsets of size 2
        partition =  int(n_redundant / 2) * [2]
    part_size = len(partition) 

    # One generated column per strongly relevant feature plus one per
    # partition subset; only the first strRel columns are informative.
    X_informative, Y = make_regression(n_features=int(strRel + part_size),
                                        n_samples=int(n_samples),
                                        noise=noise,
                                        n_informative=int(strRel),
                                        random_state=random_state,
                                        shuffle=False)

    X = _fillVariableSpace(**locals())

    return X, Y

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_sparse_regression():
    """AdaBoostRegressor must behave identically on sparse and dense input."""
    # Check regression with sparse input.

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit as SVR does, then remember the container type of ``X``."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    def make_booster():
        # Fresh, identically configured ensemble for each fit.
        return AdaBoostRegressor(base_estimator=CustomSVR(), random_state=1)

    X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    sparse_formats = [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                      dok_matrix]
    for to_sparse in sparse_formats:
        X_train_sparse = to_sparse(X_train)
        X_test_sparse = to_sparse(X_test)

        # Trained on sparse format
        sparse_model = make_booster().fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_model = make_booster().fit(X_train, y_train)

        # predict
        sparse_results = sparse_model.predict(X_test_sparse)
        dense_results = dense_model.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_stages = sparse_model.staged_predict(X_test_sparse)
        dense_stages = dense_model.staged_predict(X_test)
        for sparse_stage, dense_stage in zip(sparse_stages, dense_stages):
            assert_array_equal(sparse_stage, dense_stage)

        # Every base estimator must have been trained on CSC or CSR data.
        seen_types = [member.data_type_ for member in sparse_model.estimators_]
        assert all([seen in (csc_matrix, csr_matrix) for seen in seen_types])