The following 50 code examples, extracted from open-source Python projects, illustrate how sklearn.datasets.make_classification() is used in practice.
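Before the project examples, here is a minimal standalone sketch of the function itself: make_classification() returns a feature matrix X and a label vector y whose size and difficulty are controlled by parameters such as n_samples, n_features, n_informative, and class_sep. The train/test split and the classifier below are illustrative choices, not taken from any of the projects listed here.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 500 samples, 20 features (5 of them informative); fixed seed for reproducibility
X, y = make_classification(n_samples=500, n_features=20, n_informative=5,
                           n_redundant=2, n_classes=2, class_sep=1.0,
                           random_state=0)

# Hold out 20% of the data and fit a simple baseline classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))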
def test_feature_union_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])
    grid = {'union__bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=None)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)
    check_scores_all_nan(gs, 'union__bad__parameter')

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    hyperopt = HyperoptOptimizer(model, [p1], clf_score)
    best_params, best_model = hyperopt.fit(X_train=data, y_train=target, n_iters=10)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
    for status in hyperopt.trials.statuses():
        self.assertEqual(status, 'ok')

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    grid_sizes = {'max_depth': 5}
    grid_search = GridSearchOptimizer(model, [p1], clf_score, grid_sizes)
    best_params, best_model = grid_search.fit(X_train=data, y_train=target)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_objective_function(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=10,
                                        n_informative=10, n_redundant=0,
                                        class_sep=100, n_clusters_per_class=1,
                                        flip_y=0.0)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    fun = partial(objective, model, 'sklearn', clf_score, data, target, data, target)
    # model should fit the data perfectly
    final_score = fun(model.get_params())[0]
    self.assertEqual(final_score, 1)

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    n_init_samples = 4
    mutation_noise = {'max_depth': 0.4, 'learning_rate': 0.05, 'reg_lambda': 0.5}
    geneticOpt = GeneticOptimizer(model, [p1], clf_score, n_init_samples,
                                  'RouletteWheel', mutation_noise)
    best_params, best_model = geneticOpt.fit(X_train=data, y_train=target, n_iters=30)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_expected_improvement_tractable(self):
    np.random.seed(5)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='expected_improvement')
    best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
    self.assertTrue(bayesOpt.success)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_upper_confidence_bound_tractable(self):
    np.random.seed(5)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='upper_confidence_bound')
    best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
    self.assertTrue(bayesOpt.success)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    rand_search = RandomSearchOptimizer(model, [p1], clf_score)
    best_params, best_model = rand_search.fit(X_train=data, y_train=target, n_iters=10)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def setUp(self):
    os.putenv("KMP_DUPLICATE_LIB_OK", "TRUE")
    self.X_class, self.y_class = datasets.make_classification(random_state=42)
    self.X_reg, self.y_reg = datasets.make_regression(random_state=42)
    self.classification_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
    self.regression_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
    self.class_scorer = Scorer("auc_error",
                               lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    self.reg_scorer = Scorer("mse", metrics.mean_squared_error)

    self.classification_task_split = \
        Task("class_split", self.X_class, self.y_class, "classification",
             test_size=0.1, random_state=42)
    self.regression_task_split = \
        Task("reg_split", self.X_class, self.y_class, "regression",
             test_size=0.1, random_state=42)

    self.classification_task_cv = \
        Task("class_cv", self.X_reg, self.y_reg, "classification",
             cv=5, random_state=42)
    self.regression_task_cv = \
        Task("reg_cv", self.X_reg, self.y_reg, "regression",
             cv=5, random_state=42)

def case2():
    from sklearn.datasets import make_classification
    x, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                               n_informative=1, n_clusters_per_class=1)
    print(len(x))
    print(len(y))
    print(x)
    print(y)
    for i in range(len(x)):
        print(x[i], y[i])
    x_data_train = x[:800, :]
    x_data_test = x[800:, :]
    y_data_train = y[:800]
    y_data_test = y[800:]
    print('*' * 20)
    print(x_data_train)
    print(x_data_test)
    print(y_data_train)
    print(y_data_test)
    print(x[0, 0])

def test_visualize():
    pytest.importorskip('graphviz')
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    clf = SVC(random_state=0)
    grid = {'C': [.1, .5, .9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)

    assert hasattr(gs, 'dask_graph_')

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()

def test_feature_union_fit_failure_multiple_metrics():
    scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])
    grid = {'union__bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=scoring)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    for key in scoring:
        check_scores_all_nan(gs, 'union__bad__parameter', score_key=key)

def test_pipeline_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('bad', FailingClassifier()),
                     ('good1', MockClassifier()),
                     ('good2', MockClassifier())])
    grid = {'bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    check_scores_all_nan(gs, 'bad__parameter')

def test_feature_union_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    union = FeatureUnion([('tr0', MockClassifier()),
                          ('tr1', MockClassifier())])
    pipe = Pipeline([('union', union), ('est', MockClassifier())])

    grid = {'union__tr2__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {'union__transformer_list': [[('one', MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)

def dataset_generator():
    """
    generate dataset for binary classification
    :return:
    """
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable]
    X, y = datasets[0]
    y[y == 0] = -1
    X = StandardScaler().fit_transform(X)
    return X, y

def classification():
    # Generate a random binary classification problem.
    X, y = make_classification(n_samples=350, n_features=15, n_informative=10,
                               random_state=1111, n_classes=2,
                               class_sep=1., n_redundant=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                        random_state=1111)

    model = GradientBoostingClassifier(n_estimators=50, max_depth=4,
                                       max_features=8, learning_rate=0.1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)
    print(predictions.min())
    print(predictions.max())
    print('classification, roc auc score: %s' % roc_auc_score(y_test, predictions))

def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for binary output variable
    X, y = datasets.make_classification(n_samples=2000, n_features=10,
                                        n_informative=3, n_redundant=0,
                                        n_repeated=0, shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict this maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)

def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000, n_features=10,
                                        n_informative=3, n_redundant=0,
                                        n_repeated=0, shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)

def test_grid_search_labels():
    # Check if ValueError (when labels is None) propagates to GridSearchCV
    # And also check if labels is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for cv in label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             gs.fit, X, y)
        gs.fit(X, y, labels)

    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)

def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_true(np.mean(y_pred == y_pred2) >= .9)
    assert_equal(C, C2)

def test_learning_curve():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    with warnings.catch_warnings(record=True) as w:
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
    if len(w) > 0:
        raise RuntimeError("Unexpected warning: %r" % w[0].message)
    assert_equal(train_scores.shape, (10, 3))
    assert_equal(test_scores.shape, (10, 3))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))

def test_learning_curve_verbose():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        train_sizes, train_scores, test_scores = \
            learning_curve(estimator, X, y, cv=3, verbose=1)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    assert("[learning_curve]" in out)

def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)

    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(
            estimator, X, y, train_sizes=train_sizes,
            cv=3, exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(
            estimator, X, y, cv=3, train_sizes=train_sizes,
            exploit_incremental_learning=False)

    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))

def test_l1_ratio():
    # Test if l1 ratio extremes match L1 and L2 penalty settings.
    X, y = datasets.make_classification(n_samples=1000, n_features=100,
                                        n_informative=20, random_state=1234)

    # test if elasticnet with l1_ratio near 1 gives same result as pure l1
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet',
                           l1_ratio=0.9999999999, random_state=42).fit(X, y)
    est_l1 = SGDClassifier(alpha=0.001, penalty='l1', random_state=42).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l1.coef_)

    # test if elasticnet with l1_ratio near 0 gives same result as pure l2
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet',
                           l1_ratio=0.0000000001, random_state=42).fit(X, y)
    est_l2 = SGDClassifier(alpha=0.001, penalty='l2', random_state=42).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l2.coef_)

def test_liblinear_dual_random_state():
    # random_state is relevant for liblinear solver only if dual=True
    X, y = make_classification(n_samples=20)
    lr1 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
    lr1.fit(X, y)
    lr2 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
    lr2.fit(X, y)
    lr3 = LogisticRegression(random_state=8, dual=True, max_iter=1, tol=1e-15)
    lr3.fit(X, y)

    # same result for same random state
    assert_array_almost_equal(lr1.coef_, lr2.coef_)
    # different results for different random states
    msg = "Arrays are not almost equal to 6 decimals"
    assert_raise_message(AssertionError, msg,
                         assert_array_almost_equal, lr1.coef_, lr3.coef_)

def test_logistic_regression_solvers():
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    lib = LogisticRegression(fit_intercept=False)
    sag = LogisticRegression(solver='sag', fit_intercept=False,
                             random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=3)

def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-6
    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False, tol=tol)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    lib = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                             max_iter=1000, random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=4)

def test_logreg_predict_proba_multinomial():
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    clf_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    clf_multi.fit(X, y)
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    clf_ovr.fit(X, y)
    clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X))
    assert_greater(clf_ovr_loss, clf_multi_loss)

    # Predicted probabilities using the softmax function should give a
    # smaller loss than those using the logistic function.
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X))
    assert_greater(clf_wrong_loss, clf_multi_loss)

def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)

    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None)

def test_model_assessment():
    X, y = make_classification(n_samples=40, n_features=100, n_informative=2,
                               n_classes=2, n_redundant=0)
    pipe = Pipeline([('enet', ElasticNetFeatureSelection()),
                     ('ridge', RidgeClassifier())])
    ma = ModelAssessment(GridSearchCV(pipe, {'enet__l1_ratio': [2]})).fit(X, y)
    assert len(ma.cv_results_) == 0

def test_db_logger(self):
    X, y = datasets.make_classification(random_state=42)
    task = Task("class_split", X, y, "classification",
                test_size=0.1, random_state=42)
    scorer = Scorer("auc_error",
                    lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    logger = DBLogger(task, self.engine)
    optimizer = XGBoostOptimizer(task, scorer, logger)
    optimizer.start_optimization(max_evals=10)
    self.assertEqual(len(list(logger.load_all_results())), 10)

def test_file_logger(self):
    X, y = datasets.make_classification(random_state=42)
    task = Task("class_split", X, y, "classification",
                test_size=0.1, random_state=42)
    scorer = Scorer("auc_error",
                    lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    logger = FileLogger(task)
    optimizer = XGBoostOptimizer(task, scorer, logger)
    optimizer.start_optimization(max_evals=10)
    self.assertEqual(len(list(logger.load_all_results())), 10)
    os.remove(task.name + ".log")

def sk_generate_random_classification_set(self, samples, features, classes,
                                          informative, rds, dbs, debug=False):
    record = {"Test": {"X": {}, "Y": {}},
              "Train": {"X": {}, "Y": {}}}
    results = self.build_def_hash("Display Error", "Not Run", record)
    try:
        from sklearn.datasets import make_classification
        self.lg("Processing ROC", 6)
        X, Y = make_classification(n_samples=samples,
                                   n_features=features,
                                   n_classes=classes,
                                   n_informative=informative)
        record["Test"]["X"] = X[9000:]
        record["Test"]["Y"] = Y[9000:]
        record["Train"]["X"] = X[:9000]
        record["Train"]["Y"] = Y[:9000]
        results = self.build_def_hash("SUCCESS", "", record)
    except Exception as k:
        status = "FAILED"
        err_msg = "Unable to Generate Random Classification set with Ex(" + str(k) + ")"
        self.lg("ERROR: " + str(err_msg), 0)
        results = self.build_def_hash("Display Error", err_msg, {})
    # end of try/ex
    return results
# end of sk_generate_random_classification_set

def generate_multiclass_dataset(n_samples=100, n_features=10,
                                n_informative=5, n_redundant=3, n_repeated=2,
                                n_classes=2, n_clusters_per_class=2,
                                weights=None, flip_y=0.01, class_sep=1.0,
                                hypercube=True, shift=0.0, scale=1.0,
                                shuffle=True, random_state=None,
                                hot_encoded=True, partitions_proportions=None,
                                negative_labels=-1.):
    X, y = sk_dt.make_classification(n_samples=n_samples, n_features=n_features,
                                     n_informative=n_informative, n_redundant=n_redundant,
                                     n_repeated=n_repeated, n_classes=n_classes,
                                     n_clusters_per_class=n_clusters_per_class,
                                     weights=weights, flip_y=flip_y, class_sep=class_sep,
                                     hypercube=hypercube, shift=shift, scale=scale,
                                     shuffle=True, random_state=random_state)
    if hot_encoded:
        y = to_one_hot_enc(y)
    else:
        y[y == 0] = negative_labels
    res = Dataset(data=np.array(X, dtype=np.float32),
                  target=np.array(y, dtype=np.float32),
                  info={'n_informative': n_informative, 'n_redundant': n_redundant,
                        'n_repeated': n_repeated, 'n_classes': n_classes,
                        'n_clusters_per_class': n_clusters_per_class, 'weights': weights,
                        'flip_y': flip_y, 'class_sep': class_sep, 'hypercube': hypercube,
                        'shift': shift, 'scale': scale, 'shuffle': True,
                        'random_state': random_state})
    np.random.seed(random_state)
    if partitions_proportions:
        res = redivide_data([res], shuffle=shuffle,
                            partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    return res

def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)

    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}

    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    for X, y, groups in product([np_X, da_X, del_X],
                                [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = dcv.GridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)

def test_bad_error_score():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          error_score='badparam')

    with pytest.raises(ValueError):
        gs.fit(X, y)

def test_cache_cv():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X2 = X.view(CountTakes)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          cv=3, cache_cv=False, scheduler='sync')
    gs.fit(X2, y)
    assert X2.count == 2 * 3 * 3  # (1 train + 1 test) * n_params * n_splits

    X2 = X.view(CountTakes)
    assert X2.count == 0
    gs.cache_cv = True
    gs.fit(X2, y)
    assert X2.count == 2 * 3  # (1 test + 1 train) * n_splits

def test_scheduler_param(scheduler, n_jobs, get):
    if scheduler == 'multiprocessing':
        mp = pytest.importorskip('dask.multiprocessing')
        get = mp.get

    assert _normalize_scheduler(scheduler, n_jobs) is get

    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]}, cv=3,
                          scheduler=scheduler, n_jobs=n_jobs)
    gs.fit(X, y)

def test_scheduler_param_distributed(loop):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop, set_as_default=False) as client:
            gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                                  cv=3, scheduler=client)
            gs.fit(X, y)

def test_cv_multiplemetrics_requires_refit_metric():
    X, y = make_classification(random_state=0)
    param_grid = {'max_depth': [1, 5]}
    a = dcv.GridSearchCV(RandomForestClassifier(), param_grid, refit=True,
                         scoring={'score1': 'accuracy', 'score2': 'accuracy'})

    with pytest.raises(ValueError):
        a.fit(X, y)

def test_cv_multiplemetrics_no_refit():
    X, y = make_classification(random_state=0)
    param_grid = {'max_depth': [1, 5]}
    a = dcv.GridSearchCV(RandomForestClassifier(), param_grid, refit=False,
                         scoring={'score1': 'accuracy', 'score2': 'accuracy'})
    b = GridSearchCV(RandomForestClassifier(), param_grid, refit=False,
                     scoring={'score1': 'accuracy', 'score2': 'accuracy'})

    assert hasattr(a, 'best_index_') is hasattr(b, 'best_index_')
    assert hasattr(a, 'best_estimator_') is hasattr(b, 'best_estimator_')
    assert hasattr(a, 'best_score_') is hasattr(b, 'best_score_')

def make_test_data():
    from sklearn.datasets import make_classification
    import pandas as pd
    data = make_classification(n_samples=3, n_features=4)
    data = data[0]
    df = pd.DataFrame(data, columns=list("ABCD"))
    prepare_path(test_data_file)
    df.to_csv(test_data_file, sep='\t', index=False)

def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mtc = MondrianTreeClassifier(random_state=0)
    mtc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)

def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mfr = MondrianForestRegressor(random_state=0)
    mfr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mfr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mtc = MondrianForestClassifier(random_state=0)
    mtc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)

def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties
            of the dataset used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array
        y (array-like): Labels array
        splits (iterator): This is an iterator that returns train test splits
            for cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits