The following 50 code examples, extracted from open-source Python projects, illustrate how sklearn.datasets.make_classification() is used in practice.
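Before the project examples, here is a minimal standalone sketch of the function itself: make_classification() returns a feature matrix X and a label vector y whose size and difficulty are controlled by parameters such as n_samples, n_features, n_informative, and class_sep. The train/test split and the classifier below are illustrative choices, not taken from any of the projects listed here.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 500 samples, 20 features (5 of them informative); fixed seed for reproducibility
X, y = make_classification(n_samples=500, n_features=20, n_informative=5,
                           n_redundant=2, n_classes=2, class_sep=1.0,
                           random_state=0)

# Hold out 20% of the data and fit a simple baseline classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))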
def test_feature_union_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])
    grid = {'union__bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=None)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)
    check_scores_all_nan(gs, 'union__bad__parameter')

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    hyperopt = HyperoptOptimizer(model, [p1], clf_score)
    best_params, best_model = hyperopt.fit(X_train=data, y_train=target, n_iters=10)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
    for status in hyperopt.trials.statuses():
        self.assertEqual(status, 'ok')

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    grid_sizes = {'max_depth': 5}
    grid_search = GridSearchOptimizer(model, [p1], clf_score, grid_sizes)
    best_params, best_model = grid_search.fit(X_train=data, y_train=target)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_objective_function(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=10,
                                        n_informative=10, n_redundant=0,
                                        class_sep=100, n_clusters_per_class=1,
                                        flip_y=0.0)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    fun = partial(objective, model, 'sklearn', clf_score, data, target, data, target)
    # model should fit the data perfectly
    final_score = fun(model.get_params())[0]
    self.assertEqual(final_score, 1)

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    n_init_samples = 4
    mutation_noise = {'max_depth': 0.4, 'learning_rate': 0.05, 'reg_lambda': 0.5}
    geneticOpt = GeneticOptimizer(model, [p1], clf_score, n_init_samples,
                                  'RouletteWheel', mutation_noise)
    best_params, best_model = geneticOpt.fit(X_train=data, y_train=target, n_iters=30)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_expected_improvement_tractable(self):
    np.random.seed(5)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='expected_improvement')
    best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
    self.assertTrue(bayesOpt.success)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_upper_confidence_bound_tractable(self):
    np.random.seed(5)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='upper_confidence_bound')
    best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
    self.assertTrue(bayesOpt.success)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100, n_features=45,
                                        n_informative=15, n_redundant=5,
                                        class_sep=1, n_clusters_per_class=4,
                                        flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    rand_search = RandomSearchOptimizer(model, [p1], clf_score)
    best_params, best_model = rand_search.fit(X_train=data, y_train=target, n_iters=10)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)

def setUp(self):
    os.putenv("KMP_DUPLICATE_LIB_OK", "TRUE")
    self.X_class, self.y_class = datasets.make_classification(random_state=42)
    self.X_reg, self.y_reg = datasets.make_regression(random_state=42)
    self.classification_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
    self.regression_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
    self.class_scorer = Scorer("auc_error",
                               lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    self.reg_scorer = Scorer("mse", metrics.mean_squared_error)

    self.classification_task_split = \
        Task("class_split", self.X_class, self.y_class, "classification",
             test_size=0.1, random_state=42)
    self.regression_task_split = \
        Task("reg_split", self.X_class, self.y_class, "regression",
             test_size=0.1, random_state=42)

    self.classification_task_cv = \
        Task("class_cv", self.X_reg, self.y_reg, "classification",
             cv=5, random_state=42)
    self.regression_task_cv = \
        Task("reg_cv", self.X_reg, self.y_reg, "regression",
             cv=5, random_state=42)

def case2():
    from sklearn.datasets import make_classification
    x, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                               n_informative=1, n_clusters_per_class=1)
    print(len(x))
    print(len(y))
    print(x)
    print(y)
    for i in range(len(x)):
        print(x[i], y[i])
    x_data_train = x[:800, :]
    x_data_test = x[800:, :]
    y_data_train = y[:800]
    y_data_test = y[800:]
    print('*' * 20)
    print(x_data_train)
    print(x_data_test)
    print(y_data_train)
    print(y_data_test)
    print(x[0, 0])

def test_visualize():
    pytest.importorskip('graphviz')
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    clf = SVC(random_state=0)
    grid = {'C': [.1, .5, .9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)

    assert hasattr(gs, 'dask_graph_')

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()

def test_feature_union_fit_failure_multiple_metrics():
    scoring = {"score_1": _passthrough_scorer, "score_2": _passthrough_scorer}
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('union', FeatureUnion([('good', MockClassifier()),
                                             ('bad', FailingClassifier())],
                                            transformer_weights={'bad': 0.5})),
                     ('clf', MockClassifier())])
    grid = {'union__bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False, scoring=scoring)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    for key in scoring:
        check_scores_all_nan(gs, 'union__bad__parameter', score_key=key)

def test_pipeline_fit_failure():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    pipe = Pipeline([('bad', FailingClassifier()),
                     ('good1', MockClassifier()),
                     ('good2', MockClassifier())])
    grid = {'bad__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)

    # Check that failure raises if error_score is `'raise'`
    with pytest.raises(ValueError):
        gs.fit(X, y)

    # Check that grid scores were set to error_score on failure
    gs.error_score = float('nan')
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    check_scores_all_nan(gs, 'bad__parameter')

def test_feature_union_raises():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    union = FeatureUnion([('tr0', MockClassifier()),
                          ('tr1', MockClassifier())])
    pipe = Pipeline([('union', union), ('est', MockClassifier())])

    grid = {'union__tr2__parameter': [0, 1, 2]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(ValueError):
        gs.fit(X, y)

    grid = {'union__transformer_list': [[('one', MockClassifier())]]}
    gs = dcv.GridSearchCV(pipe, grid, refit=False)
    with pytest.raises(NotImplementedError):
        gs.fit(X, y)

def dataset_generator():
    """
    generate dataset for binary classification
    :return:
    """
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable]
    X, y = datasets[0]
    y[y == 0] = -1
    X = StandardScaler().fit_transform(X)
    return X, y

def classification():
    # Generate a random binary classification problem.
    X, y = make_classification(n_samples=350, n_features=15, n_informative=10,
                               random_state=1111, n_classes=2,
                               class_sep=1., n_redundant=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                        random_state=1111)

    model = GradientBoostingClassifier(n_estimators=50, max_depth=4,
                                       max_features=8, learning_rate=0.1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)
    print(predictions.min())
    print(predictions.max())
    print('classification, roc auc score: %s' % roc_auc_score(y_test, predictions))

def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for binary output variable
    X, y = datasets.make_classification(n_samples=2000, n_features=10,
                                        n_informative=3, n_redundant=0,
                                        n_repeated=0, shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict this maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)

def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000, n_features=10,
                                        n_informative=3, n_redundant=0,
                                        n_repeated=0, shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)

def test_grid_search_labels():
    # Check if ValueError (when labels is None) propagates to GridSearchCV
    # And also check if labels is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for cv in label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             gs.fit, X, y)
        gs.fit(X, y, labels)

    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)

def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_true(np.mean(y_pred == y_pred2) >= .9)
    assert_equal(C, C2)

def test_learning_curve():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    with warnings.catch_warnings(record=True) as w:
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
    if len(w) > 0:
        raise RuntimeError("Unexpected warning: %r" % w[0].message)
    assert_equal(train_scores.shape, (10, 3))
    assert_equal(test_scores.shape, (10, 3))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))

def test_learning_curve_verbose():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        train_sizes, train_scores, test_scores = \
            learning_curve(estimator, X, y, cv=3, verbose=1)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    assert("[learning_curve]" in out)

def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)

    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(
            estimator, X, y, train_sizes=train_sizes,
            cv=3, exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(
            estimator, X, y, cv=3, train_sizes=train_sizes,
            exploit_incremental_learning=False)

    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))

def test_l1_ratio():
    # Test if l1 ratio extremes match L1 and L2 penalty settings.
    X, y = datasets.make_classification(n_samples=1000, n_features=100,
                                        n_informative=20, random_state=1234)

    # test if elasticnet with l1_ratio near 1 gives same result as pure l1
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet',
                           l1_ratio=0.9999999999, random_state=42).fit(X, y)
    est_l1 = SGDClassifier(alpha=0.001, penalty='l1', random_state=42).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l1.coef_)

    # test if elasticnet with l1_ratio near 0 gives same result as pure l2
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet',
                           l1_ratio=0.0000000001, random_state=42).fit(X, y)
    est_l2 = SGDClassifier(alpha=0.001, penalty='l2', random_state=42).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l2.coef_)

def test_liblinear_dual_random_state():
    # random_state is relevant for liblinear solver only if dual=True
    X, y = make_classification(n_samples=20)
    lr1 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
    lr1.fit(X, y)
    lr2 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
    lr2.fit(X, y)
    lr3 = LogisticRegression(random_state=8, dual=True, max_iter=1, tol=1e-15)
    lr3.fit(X, y)

    # same result for same random state
    assert_array_almost_equal(lr1.coef_, lr2.coef_)
    # different results for different random states
    msg = "Arrays are not almost equal to 6 decimals"
    assert_raise_message(AssertionError, msg,
                         assert_array_almost_equal, lr1.coef_, lr3.coef_)

def test_logistic_regression_solvers():
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    lib = LogisticRegression(fit_intercept=False)
    sag = LogisticRegression(solver='sag', fit_intercept=False,
                             random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=3)

def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-6
    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False, tol=tol)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    lib = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                             max_iter=1000, random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=4)

def test_logreg_predict_proba_multinomial():
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    clf_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    clf_multi.fit(X, y)
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    clf_ovr.fit(X, y)
    clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X))
    assert_greater(clf_ovr_loss, clf_multi_loss)

    # Predicted probabilities using the softmax function should give a
    # smaller loss than those using the logistic function.
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X))
    assert_greater(clf_wrong_loss, clf_multi_loss)

def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)

    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None)

def test_model_assessment():
    X, y = make_classification(n_samples=40, n_features=100, n_informative=2,
                               n_classes=2, n_redundant=0)
    pipe = Pipeline([('enet', ElasticNetFeatureSelection()),
                     ('ridge', RidgeClassifier())])
    ma = ModelAssessment(GridSearchCV(pipe, {'enet__l1_ratio': [2]})).fit(X, y)
    assert len(ma.cv_results_) == 0

def test_db_logger(self):
    X, y = datasets.make_classification(random_state=42)
    task = Task("class_split", X, y, "classification",
                test_size=0.1, random_state=42)
    scorer = Scorer("auc_error",
                    lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    logger = DBLogger(task, self.engine)
    optimizer = XGBoostOptimizer(task, scorer, logger)
    optimizer.start_optimization(max_evals=10)
    self.assertEqual(len(list(logger.load_all_results())), 10)

def test_file_logger(self):
    X, y = datasets.make_classification(random_state=42)
    task = Task("class_split", X, y, "classification",
                test_size=0.1, random_state=42)
    scorer = Scorer("auc_error",
                    lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    logger = FileLogger(task)
    optimizer = XGBoostOptimizer(task, scorer, logger)
    optimizer.start_optimization(max_evals=10)
    self.assertEqual(len(list(logger.load_all_results())), 10)
    os.remove(task.name + ".log")

def sk_generate_random_classification_set(self, samples, features, classes,
                                          informative, rds, dbs, debug=False):
    record = {"Test": {"X": {}, "Y": {}},
              "Train": {"X": {}, "Y": {}}}
    results = self.build_def_hash("Display Error", "Not Run", record)
    try:
        from sklearn.datasets import make_classification
        self.lg("Processing ROC", 6)
        X, Y = make_classification(n_samples=samples,
                                   n_features=features,
                                   n_classes=classes,
                                   n_informative=informative)
        record["Test"]["X"] = X[9000:]
        record["Test"]["Y"] = Y[9000:]
        record["Train"]["X"] = X[:9000]
        record["Train"]["Y"] = Y[:9000]
        results = self.build_def_hash("SUCCESS", "", record)
    except Exception as k:
        status = "FAILED"
        err_msg = "Unable to Generate Random Classification set with Ex(" + str(k) + ")"
        self.lg("ERROR: " + str(err_msg), 0)
        results = self.build_def_hash("Display Error", err_msg, {})
    # end of try/ex
    return results
# end of sk_generate_random_classification_set

def generate_multiclass_dataset(n_samples=100, n_features=10,
                                n_informative=5, n_redundant=3, n_repeated=2,
                                n_classes=2, n_clusters_per_class=2,
                                weights=None, flip_y=0.01, class_sep=1.0,
                                hypercube=True, shift=0.0, scale=1.0,
                                shuffle=True, random_state=None,
                                hot_encoded=True, partitions_proportions=None,
                                negative_labels=-1.):
    X, y = sk_dt.make_classification(n_samples=n_samples, n_features=n_features,
                                     n_informative=n_informative, n_redundant=n_redundant,
                                     n_repeated=n_repeated, n_classes=n_classes,
                                     n_clusters_per_class=n_clusters_per_class,
                                     weights=weights, flip_y=flip_y, class_sep=class_sep,
                                     hypercube=hypercube, shift=shift, scale=scale,
                                     shuffle=True, random_state=random_state)
    if hot_encoded:
        y = to_one_hot_enc(y)
    else:
        y[y == 0] = negative_labels
    res = Dataset(data=np.array(X, dtype=np.float32),
                  target=np.array(y, dtype=np.float32),
                  info={'n_informative': n_informative, 'n_redundant': n_redundant,
                        'n_repeated': n_repeated, 'n_classes': n_classes,
                        'n_clusters_per_class': n_clusters_per_class, 'weights': weights,
                        'flip_y': flip_y, 'class_sep': class_sep, 'hypercube': hypercube,
                        'shift': shift, 'scale': scale, 'shuffle': True,
                        'random_state': random_state})
    np.random.seed(random_state)
    if partitions_proportions:
        res = redivide_data([res], shuffle=shuffle,
                            partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    return res

def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)

    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}

    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    for X, y, groups in product([np_X, da_X, del_X],
                                [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = dcv.GridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)

def test_bad_error_score():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          error_score='badparam')

    with pytest.raises(ValueError):
        gs.fit(X, y)

def test_cache_cv():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X2 = X.view(CountTakes)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          cv=3, cache_cv=False, scheduler='sync')
    gs.fit(X2, y)
    assert X2.count == 2 * 3 * 3  # (1 train + 1 test) * n_params * n_splits

    X2 = X.view(CountTakes)
    assert X2.count == 0
    gs.cache_cv = True
    gs.fit(X2, y)
    assert X2.count == 2 * 3  # (1 test + 1 train) * n_splits

def test_scheduler_param(scheduler, n_jobs, get):
    if scheduler == 'multiprocessing':
        mp = pytest.importorskip('dask.multiprocessing')
        get = mp.get

    assert _normalize_scheduler(scheduler, n_jobs) is get

    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]}, cv=3,
                          scheduler=scheduler, n_jobs=n_jobs)
    gs.fit(X, y)

def test_scheduler_param_distributed(loop):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop, set_as_default=False) as client:
            gs = dcv.GridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                                  cv=3, scheduler=client)
            gs.fit(X, y)

def test_cv_multiplemetrics_requires_refit_metric():
    X, y = make_classification(random_state=0)
    param_grid = {'max_depth': [1, 5]}
    a = dcv.GridSearchCV(RandomForestClassifier(), param_grid, refit=True,
                         scoring={'score1': 'accuracy', 'score2': 'accuracy'})

    with pytest.raises(ValueError):
        a.fit(X, y)

def test_cv_multiplemetrics_no_refit():
    X, y = make_classification(random_state=0)
    param_grid = {'max_depth': [1, 5]}
    a = dcv.GridSearchCV(RandomForestClassifier(), param_grid, refit=False,
                         scoring={'score1': 'accuracy', 'score2': 'accuracy'})
    b = GridSearchCV(RandomForestClassifier(), param_grid, refit=False,
                     scoring={'score1': 'accuracy', 'score2': 'accuracy'})

    assert hasattr(a, 'best_index_') is hasattr(b, 'best_index_')
    assert hasattr(a, 'best_estimator_') is hasattr(b, 'best_estimator_')
    assert hasattr(a, 'best_score_') is hasattr(b, 'best_score_')

def make_test_data():
    from sklearn.datasets import make_classification
    import pandas as pd
    data = make_classification(n_samples=3, n_features=4)
    data = data[0]
    df = pd.DataFrame(data, columns=list("ABCD"))
    prepare_path(test_data_file)
    df.to_csv(test_data_file, sep='\t', index=False)

def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mtr = MondrianTreeRegressor(random_state=0)
    mtr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mtc = MondrianTreeClassifier(random_state=0)
    mtc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)

def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mfr = MondrianForestRegressor(random_state=0)
    mfr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mfr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mtc = MondrianForestClassifier(random_state=0)
    mtc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)

def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties
            of the dataset used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array
        y (array-like): Labels array
        splits (iterator): This is an iterator that returns train test splits
            for cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits