def setUp(self):
    """Build the datasets, optimizer lists, scorers and tasks used by the tests.

    Four tasks are prepared: a hold-out split (``test_size=0.1``) and a
    5-fold CV variant, each for the classification data and for the
    regression data. Every task type is paired with the dataset of the
    matching kind.
    """
    # Exported before any optimizer runs; the variable is consumed outside
    # this block (presumably by the OpenMP runtime - confirm).
    os.putenv("KMP_DUPLICATE_LIB_OK", "TRUE")

    self.X_class, self.y_class = datasets.make_classification(random_state=42)
    self.X_reg, self.y_reg = datasets.make_regression(random_state=42)

    self.classification_optimizers = [XGBoostOptimizer, RandomForestOptimizer]
    self.regression_optimizers = [XGBoostOptimizer, RandomForestOptimizer]

    # NOTE(review): the lambda forwards (y_pred, y_true) in that order, while
    # scikit-learn's roc_auc_score takes (y_true, y_score); confirm the
    # argument order Scorer uses when it invokes the callable.
    self.class_scorer = Scorer(
        "auc_error",
        lambda y_pred, y_true: 1 - metrics.roc_auc_score(y_pred, y_true))
    self.reg_scorer = Scorer("mse", metrics.mean_squared_error)

    self.classification_task_split = Task(
        "class_split", self.X_class, self.y_class, "classification",
        test_size=0.1, random_state=42)
    # Regression tasks must be fed the regression data (previously this task
    # was built from the classification arrays).
    self.regression_task_split = Task(
        "reg_split", self.X_reg, self.y_reg, "regression",
        test_size=0.1, random_state=42)
    # Classification tasks must be fed the classification data (previously
    # this task was built from the continuous regression targets).
    self.classification_task_cv = Task(
        "class_cv", self.X_class, self.y_class, "classification",
        cv=5, random_state=42)
    self.regression_task_cv = Task(
        "reg_cv", self.X_reg, self.y_reg, "regression",
        cv=5, random_state=42)
def test_cv():
    """Smoke-test GLM with cross_val_score and GridSearchCV."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    folds = KFold(X.shape[0], 5)
    estimator = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)

    # Exactly five scores must come back.
    fold_scores = cross_val_score(estimator, X, y, cv=folds)
    assert_equal(len(fold_scores), 5)

    # Two separate grids: one over alpha, one over reg_lambda.
    alpha_grid = {'alpha': np.linspace(0.01, 0.99, 2)}
    lambda_grid = {
        'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), 10,
                                  base=np.exp(1)),
    }
    search = GridSearchCV(estimator, [alpha_grid, lambda_grid], cv=folds)
    search.fit(X, y)
def test_min_samples_split():
    """Check min_samples_split on Mondrian trees fitted in two partial_fit calls.

    For every node whose ``children_left`` entry is not -1, the smallest
    ``n_node_samples`` value plus one must exceed ``min_samples_split``.
    """
    X_c, y_c = load_digits(return_X_y=True)
    X_r, y_r = make_regression(n_samples=10000, random_state=0)
    half_r = X_r.shape[0] // 2
    half_c = X_c.shape[0] // 2

    for mss in [2, 4, 10, 20]:
        regressor = MondrianTreeRegressor(random_state=0, min_samples_split=mss)
        regressor.partial_fit(X_r[:half_r], y_r[:half_r])
        regressor.partial_fit(X_r[half_r:], y_r[half_r:])
        has_left_child = regressor.tree_.children_left != -1
        counts = regressor.tree_.n_node_samples[has_left_child]
        assert_greater(np.min(counts) + 1, mss)

        classifier = MondrianTreeClassifier(random_state=0, min_samples_split=mss)
        classifier.partial_fit(X_c[:half_c], y_c[:half_c])
        classifier.partial_fit(X_c[half_c:], y_c[half_c:])
        has_left_child = classifier.tree_.children_left != -1
        counts = classifier.tree_.n_node_samples[has_left_child]
        assert_greater(np.min(counts) + 1, mss)
def test_min_samples_split():
    """Check min_samples_split on every tree of Mondrian forests.

    Each forest is fitted with two partial_fit calls (first half, then second
    half of the data). For every estimator, nodes whose ``children_left``
    entry is not -1 must satisfy ``min(n_node_samples) + 1 > mss``.
    """
    X_c, y_c = load_digits(return_X_y=True)
    X_r, y_r = make_regression(n_samples=10000, random_state=0)
    half_r = X_r.shape[0] // 2
    half_c = X_c.shape[0] // 2

    for mss in [2, 4, 10, 20]:
        forest_reg = MondrianForestRegressor(random_state=0, min_samples_split=mss)
        forest_reg.partial_fit(X_r[:half_r], y_r[:half_r])
        forest_reg.partial_fit(X_r[half_r:], y_r[half_r:])
        for member in forest_reg.estimators_:
            has_left_child = member.tree_.children_left != -1
            counts = member.tree_.n_node_samples[has_left_child]
            assert_greater(np.min(counts) + 1, mss)

        forest_clf = MondrianForestClassifier(random_state=0, min_samples_split=mss)
        forest_clf.partial_fit(X_c[:half_c], y_c[:half_c])
        forest_clf.partial_fit(X_c[half_c:], y_c[half_c:])
        for member in forest_clf.estimators_:
            has_left_child = member.tree_.children_left != -1
            counts = member.tree_.n_node_samples[has_left_child]
            assert_greater(np.min(counts) + 1, mss)
def regression():
    """Train a small dense network on synthetic data and print the test MSE."""
    # Generate a random regression problem
    X, y = make_regression(
        n_samples=5000,
        n_features=25,
        n_informative=25,
        n_targets=1,
        random_state=100,
        noise=0.05,
    )
    # Scale the targets down in place before splitting.
    y *= 0.01
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1111)

    layers = [
        Dense(64, Parameters(init='normal')),
        Activation('linear'),
        Dense(32, Parameters(init='normal')),
        Activation('linear'),
        Dense(1),
    ]
    model = NeuralNet(
        layers=layers,
        loss='mse',
        optimizer=Adam(),
        metric='mse',
        batch_size=256,
        max_epochs=15,
    )
    model.fit(X_train, y_train)

    # Predictions are flattened before being compared with y_test.
    flat_predictions = model.predict(X_test).flatten()
    print("regression mse", mean_squared_error(y_test, flat_predictions))
def test_get_errors_param(self):
    """
    Test known models we can get the cv errors for alpha selection
    """
    # Test original CV models
    for model_class in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
        X, y = make_regression()
        try:
            visualizer = AlphaSelection(model_class())
            visualizer.fit(X, y)
            errors = visualizer._find_errors_param()
        except YellowbrickValueError:
            # Report the estimator class by name. The loop variable is kept
            # separate from the visualizer, so this works even when the
            # failure happens inside the AlphaSelection constructor.
            self.fail(
                "could not find errors on {}".format(model_class.__name__))
        else:
            self.assertTrue(len(errors) > 0)
def test_cross_val_score_with_score_func_regression():
    """Compare cross_val_score output for several scorers against fixed values."""
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    expected_r2 = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(default_scores, expected_r2, 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, expected_r2, 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cross_val_score(reg, X, y, cv=5,
                                 scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    ev_scorer = make_scorer(explained_variance_score)
    ev_scores = cross_val_score(reg, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_scores, expected_r2, 2)
def test_cross_val_score_with_score_func_regression():
    """Compare cval.cross_val_score output for several scorers against fixed values."""
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    expected_r2 = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(default_scores, expected_r2, 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, expected_r2, 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    ev_scorer = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_scores, expected_r2, 2)
def test_multi_target_regression():
    """MultiOutputRegressor must match one GradientBoostingRegressor per target."""
    X, y = datasets.make_regression(n_targets=3)
    split = 50
    X_train, y_train = X[:split], y[:split]
    X_test, y_test = X[split:], y[split:]

    # Reference predictions: fit an independent regressor on each column.
    references = np.zeros_like(y_test)
    for target_idx in range(3):
        single = GradientBoostingRegressor(random_state=0)
        single.fit(X_train, y_train[:, target_idx])
        references[:, target_idx] = single.predict(X_test)

    wrapped = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    wrapped.fit(X_train, y_train)
    y_pred = wrapped.predict(X_test)

    assert_almost_equal(references, y_pred)
def test_ridge_fit_intercept_sparse():
    """Ridge with an intercept must agree on dense and CSR inputs."""
    X, y = make_regression(n_samples=1000, n_features=2, n_informative=2,
                           bias=10., random_state=42)
    X_csr = sp.csr_matrix(X)

    shared = dict(alpha=1., tol=1.e-15, fit_intercept=True)
    dense = Ridge(solver='sag', **shared)
    sparse = Ridge(solver='sag', **shared)
    dense.fit(X, y)
    sparse.fit(X_csr, y)
    assert_almost_equal(dense.intercept_, sparse.intercept_)
    assert_array_almost_equal(dense.coef_, sparse.coef_)

    # test the solver switch and the corresponding warning
    sparse = Ridge(solver='lsqr', **shared)
    assert_warns(UserWarning, sparse.fit, X_csr, y)
    assert_almost_equal(dense.intercept_, sparse.intercept_)
    assert_array_almost_equal(dense.coef_, sparse.coef_)
def test_make_regression():
    """Check shapes, non-zero coefficient count and noise level of make_regression."""
    X, y, c = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=3,
        effective_rank=5,
        coef=True,
        bias=0.0,
        noise=1.0,
        random_state=0,
    )

    assert_equal(X.shape, (100, 10), "X shape mismatch")
    assert_equal(y.shape, (100,), "y shape mismatch")
    assert_equal(c.shape, (10,), "coef shape mismatch")

    nonzero_count = sum(c != 0.0)
    assert_equal(nonzero_count, 3, "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    residual = y - np.dot(X, c)
    assert_almost_equal(np.std(residual), 1.0, decimal=1)

    # Test with small number of features.
    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
    assert_equal(X.shape, (100, 1))
def lession_5():
    """Scatter-plot a noisy single-feature regression dataset."""
    # Earlier variant of this lesson, kept for reference:
    # db = datasets.load_boston()
    # print db.data.shape
    # data_X=db.data
    # data_y=db.target
    # model = LinearRegression()
    # model.fit(data_X,data_y)
    # print model.predict(data_X[:8])
    # print data_y[:8]
    features, target = datasets.make_regression(
        n_samples=100, n_features=1, n_targets=1, noise=10)
    plt.scatter(features, target)
    plt.show()
def make_regression_example(axis, random_state):
    """Draw a one-feature regression example with a LinearSVR fit on ``axis``.

    :param axis: matplotlib axes object that receives the scatter plot, the
        fitted line, the axis labels and the legend.
    :param random_state: forwarded to ``make_regression`` to control the
        generated data.
    """
    X, y = make_regression(n_samples=100, n_features=1, noise=30.0,
                           random_state=random_state)

    axis.scatter(X[:, 0], y, color="blue", s=10, label="Patients")
    clf = LinearSVR().fit(X, y)
    axis.plot(X[:, 0], clf.predict(X), color="black", label="Model")

    # Decorate the axes that were passed in. The previous code addressed a
    # global ``ax2`` here, which ignored the ``axis`` argument.
    # Booleans are used instead of the string 'off', which newer matplotlib
    # releases no longer interpret as False.
    axis.tick_params(labelbottom=False, labelleft=False)
    axis.set_xlabel("Gene 1")
    axis.set_ylabel("Survived (years)")
    axis.legend()
def main():
    """Fit the in-scope LinearRegression on noisy 1-D data and plot the outcome.

    Two figures are shown in sequence: the per-iteration training error
    recorded by the model, then a scatter plot of the train/test points with
    the fitted line, titled with the test-set mean squared error.
    """
    X, y = make_regression(n_samples=100, n_features=1, noise=20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    model = LinearRegression(n_iterations=100)
    model.fit(X_train, y_train)

    # Training error plot
    n = len(model.training_errors)
    training, = plt.plot(range(n), model.training_errors,
                         label="Training Error")
    plt.legend(handles=[training])
    plt.title("Error Plot")
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Iterations')
    plt.show()

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Mean squared error: %s" % (mse))

    # Predictions over the full dataset, used to draw the fitted line.
    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    # Plot the results; the feature is multiplied by 366 for the 'Day' axis.
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    plt.plot(366 * X, y_pred_line, color='black', linewidth=2,
             label="Prediction")
    plt.suptitle("Linear Regression")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2), ("Training data", "Test data"), loc='lower right')
    plt.show()
def test_tau():
    """
    Test time of split for the root.
    """
    X, y = make_regression(random_state=0, n_features=10)
    y = np.round(y)

    # Sum over features of (max - min).
    feature_ranges = np.max(X, axis=0) - np.min(X, axis=0)
    rate = np.sum(feature_ranges)
    expected_mean_tau = 1.0 / rate

    for est in estimators:
        est = est.set_params(max_depth=1)
        root_taus = []
        # Refit with 100 different seeds and collect tau of node 0 each time.
        for seed in np.arange(100):
            est.set_params(random_state=seed).fit(X, y)
            root_taus.append(est.tree_.tau[0])
        assert_almost_equal(np.mean(root_taus), expected_mean_tau, 2)
def test_mondrian_tree_n_node_samples():
    """A tree fitted on two samples must report n_node_samples of [1, 1, 2]."""
    expected_counts = [1, 1, 2]
    for data_seed in range(1000):
        X, y = make_regression(n_samples=2, random_state=data_seed)
        tree = MondrianTreeRegressor(random_state=0)
        tree.partial_fit(X, y)
        assert_array_equal(tree.tree_.n_node_samples, expected_counts)
def test_partial_fit_equivalence():
    """Run check_partial_fit_equivalence over several batch sizes for both tree types."""
    batch_sizes = [10, 20, 25, 50, 90]

    # Regressor: fitted once on the full data, then handed to the checker.
    X, y = make_regression(random_state=0, n_samples=100)
    regressor = MondrianTreeRegressor(random_state=0)
    regressor.partial_fit(X, y)
    for batch_size in batch_sizes:
        check_partial_fit_equivalence(batch_size, regressor, 0, X, y)

    # Classifier: same procedure on classification data.
    X, y = make_classification(random_state=0, n_samples=100)
    classifier = MondrianTreeClassifier(random_state=0)
    classifier.partial_fit(X, y)
    for batch_size in batch_sizes:
        check_partial_fit_equivalence(batch_size, classifier, 0, X, y,
                                      is_clf=True)
def test_partial_fit_n_samples_1000():
    """Run check_online_fit with fresh Mondrian trees on digits and regression data."""
    # Classifier on the digits data, with 20 then 100 as the fourth argument.
    X, y = load_digits(return_X_y=True)
    for size in (20, 100):
        classifier = MondrianTreeClassifier(random_state=0)
        check_online_fit(classifier, X, y, size)

    # Regressor on synthetic data, with 100 then 20 as the fourth argument.
    X, y = make_regression(random_state=0, n_samples=10000)
    for size in (100, 20):
        regressor = MondrianTreeRegressor(random_state=0)
        check_online_fit(regressor, X, y, size, is_clf=False)
def test_multioutput_regression():
    """Test whether multi-output regression works as expected."""
    X, y = make_regression(n_samples=200, n_targets=5,
                           random_state=random_state)
    min_score = 0.95
    for activation in ACTIVATION_TYPES:
        regressor = ELMRegressor(n_hidden=300, activation=activation,
                                 random_state=random_state)
        regressor.fit(X, y)
        # Scored on the training data itself.
        train_score = regressor.score(X, y)
        assert_greater(train_score, min_score)
def test_known_values(self):
    """relevant_features must equal the non-zero mask of the true coefficients."""
    from sklearn.datasets import make_regression

    X, y, coef = make_regression(200, 15, 15, coef=True)
    selected = relevant_features(X, y)
    nonzero_mask = coef != 0.0
    np.testing.assert_equal(selected, nonzero_mask)
def regression():
    """Fit a 5-nearest-neighbour regressor on synthetic data and print the test MSE."""
    # Generate a random regression problem
    X, y = make_regression(
        n_samples=500,
        n_features=5,
        n_informative=5,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=1111)

    regressor = knn.KNNRegressor(k=5, distance_func=distance.euclidean)
    regressor.fit(X_train, y_train)

    test_predictions = regressor.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions)
    print('regression mse', test_mse)
def regression():
    """Fit a gradient boosting regressor on synthetic data and print the test MSE."""
    # Generate a random regression problem
    X, y = make_regression(
        n_samples=500,
        n_features=5,
        n_informative=5,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1111)

    booster = GradientBoostingRegressor(
        n_estimators=25, max_depth=5, max_features=3)
    booster.fit(X_train, y_train)

    test_predictions = booster.predict(X_test)
    # Both arrays are flattened before the error is computed.
    test_mse = mean_squared_error(y_test.flatten(), test_predictions.flatten())
    print('regression, mse: %s' % test_mse)
def regression():
    """Fit an L2-penalised linear regression on synthetic data and print the test MSE."""
    # Generate a random regression problem
    X, y = make_regression(
        n_samples=10000,
        n_features=100,
        n_informative=75,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=1111)

    linear_model = LinearRegression(lr=0.01, max_iters=2000, penalty='l2',
                                    C=0.03)
    linear_model.fit(X_train, y_train)

    test_predictions = linear_model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions)
    print('regression mse', test_mse)
def regression():
    """Fit a random forest regressor on synthetic data and print the test MSE."""
    # Generate a random regression problem
    X, y = make_regression(
        n_samples=500,
        n_features=5,
        n_informative=5,
        n_targets=1,
        noise=0.05,
        random_state=1111,
        bias=0.5,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1111)

    forest = RandomForestRegressor(
        n_estimators=50, max_depth=10, max_features=3)
    forest.fit(X_train, y_train)

    test_predictions = forest.predict(X_test)
    # Both arrays are flattened before the error is computed.
    test_mse = mean_squared_error(y_test.flatten(), test_predictions.flatten())
    print('regression, mse: %s' % test_mse)
def test_get_alphas_param_lassolars(self):
    """
    Assert that we can get alphas from lasso lars.
    """
    X, y = make_regression()
    visualizer = AlphaSelection(LassoLarsCV())
    visualizer.fit(X, y)

    try:
        found_alphas = visualizer._find_alphas_param()
    except YellowbrickValueError:
        self.fail("could not find alphas on {}".format(visualizer.name))
    else:
        # At least one alpha must have been found.
        self.assertTrue(len(found_alphas) > 0)
def create_regression_dataset(n_samples, n_features, n_informative,
                              effective_rank, tail_strength, noise,
                              random_state=None):
    """
    Creates a single-target regression dataset as a data frame

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param effective_rank: approximate number of singular vectors required
        to explain data
    :param tail_strength: relative importance of the fat noisy tail of the
        singular values profile
    :param noise: standard deviation of the gaussian noise applied to the
        output
    :param random_state: passed through get_random_state before being handed
        to make_regression
    :return: data frame holding the renamed feature columns plus a 'y'
        column with the target
    """
    rng = get_random_state(random_state)
    features, target = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_targets=1,
        effective_rank=effective_rank,
        tail_strength=tail_strength,
        noise=noise,
        random_state=rng,
    )

    # Wrap the features in a frame, rename its columns, then attach the target.
    frame = rename_columns(pd.DataFrame(features))
    frame['y'] = target
    return frame
def test_Symbolic_fit(n_out):
    """Predictions of a fitted Symbolic model must have the same shape as y."""
    x, y = make_regression(n_features=2, n_informative=1, n_targets=n_out)
    model = Symbolic(max_nfev=1, lambda_=1)
    fitted = model.fit(x, y)
    predicted = fitted.predict(x)
    assert predicted.shape == y.shape
def test_Symbolic_joblib():
    """Symbolic with n_jobs=-1 must still predict an array shaped like y."""
    x, y = make_regression(n_features=2, n_informative=1, n_targets=1)
    model = Symbolic(n_jobs=-1, max_nfev=1, lambda_=1)
    fitted = model.fit(x, y)
    yhat = fitted.predict(x)
    assert yhat.shape == y.shape
def data(self):
    """Return standardised float32 features and a standardised column-vector target."""
    X, y = make_regression(
        1000, 20, n_informative=10, bias=0, random_state=0)
    X = X.astype(np.float32)
    # The target is reshaped to a single column before scaling.
    y = y.astype(np.float32).reshape(-1, 1)

    # Features and target are each scaled by their own StandardScaler.
    scaled_features = StandardScaler().fit_transform(X)
    scaled_target = StandardScaler().fit_transform(y)
    return scaled_features, scaled_target
def test_multi_target_regression_one_target():
    """Fitting MultiOutputRegressor on a single-target y must raise ValueError."""
    # Test multi target regression raises
    X, y = datasets.make_regression(n_targets=1)
    # Only the training slice is needed: the test stops at fit().
    X_train, y_train = X[:50], y[:50]

    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    assert_raises(ValueError, rgr.fit, X_train, y_train)
def make_regression_with_outliers(n_samples=50, n_features=20):
    """Generate regression data and overwrite some feature rows with noise.

    ``int(0.1 * n_samples)`` row indices are drawn with ``randint`` (so
    indices may repeat) and those rows of X are replaced by scaled Gaussian
    noise. The targets y are left untouched.
    """
    rng = np.random.RandomState(0)
    # Generate data with outliers by replacing 10% of the samples with noise.
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        random_state=0,
        noise=0.05,
    )

    # Replace 10% of the sample with noise.
    num_noise = int(0.1 * n_samples)
    outlier_rows = rng.randint(0, n_samples, num_noise)
    noise_block = 2.0 * rng.normal(0, 1, (num_noise, X.shape[1]))
    X[outlier_rows, :] = noise_block
    return X, y
def test_make_regression_multitarget():
    """Check shapes, non-zero coefficient counts and noise level with three targets."""
    X, y, c = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_targets=3,
        coef=True,
        noise=1.,
        random_state=0,
    )

    assert_equal(X.shape, (100, 10), "X shape mismatch")
    assert_equal(y.shape, (100, 3), "y shape mismatch")
    assert_equal(c.shape, (10, 3), "coef shape mismatch")

    nonzero_counts = sum(c != 0.0)
    assert_array_equal(nonzero_counts, 3,
                       "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    residual = y - np.dot(X, c)
    assert_almost_equal(np.std(residual), 1.0, decimal=1)
def test_mse_solving():
    # test the MSE estimate to be sane.
    # non-regression test for ignoring off-diagonals of feature covariance,
    # testing with nugget that renders covariance useless, only
    # using the mean function, with low effective rank of data
    X, y = make_regression(n_informative=3, n_features=60, noise=50,
                           random_state=0, effective_rank=1)

    gp = GaussianProcess(
        corr='absolute_exponential',
        theta0=1e-4,
        thetaL=1e-12,
        thetaU=1e-2,
        nugget=1e-2,
        optimizer='Welch',
        regr="linear",
        random_state=0,
    )
    gp.fit(X, y)

    # Second element of the predict() result when eval_MSE=True.
    mse_estimate = gp.predict(X, eval_MSE=True)[1]
    assert_greater(1000, mse_estimate.mean())
def test_insight_regression(self):
    """ModelSelectionInsight must report a positive score on regression data."""
    candidates = 4
    X, y = make_regression(
        n_samples=1000, n_features=15, n_informative=candidates,
        n_targets=1)

    # Append the target as the last column of the frame.
    table = np.hstack((X, y.reshape([-1, 1])))
    column_names = ["c_{}".format(i) for i in range(X.shape[1])]
    column_names = column_names + ["target"]
    df = pd.DataFrame(table, columns=column_names)

    dfe = DataFrameExtension(df, numericals=["target"], target="target")
    insight = ModelSelectionInsight()
    insight.adopt(dfe)

    self.assertTrue(insight.score > 0)
    print(insight.score)
def test_insight_regression(self):
    """FeatureSelectionInsight must keep between ``candidates`` and 2x that many features."""
    candidates = 4
    X, y = make_regression(
        n_samples=1000, n_features=15, n_informative=candidates,
        n_targets=1)

    # Append the target as the last column of the frame.
    table = np.hstack((X, y.reshape([-1, 1])))
    column_names = ["c_{}".format(i) for i in range(X.shape[1])]
    column_names = column_names + ["target"]
    df = pd.DataFrame(table, columns=column_names)

    dfe = DataFrameExtension(df, numericals=["target"], target="target")
    insight = FeatureSelectionInsight()
    insight.adopt(dfe)

    print("selected regressor features {}".format(dfe.ftypes.keys()))
    # -1 is target ftype
    n_selected = len(dfe.ftypes) - 1
    self.assertTrue(candidates <= n_selected < candidates * 2)
def genRegressionData(n_samples: int = 100, n_features: int = 2, n_redundant: int = 0,
                      strRel: int = 1, n_repeated: int = 0, noise: float = 1,
                      random_state: object = None, partition = None) -> object:
    """Generate synthetic regression data

    Parameters
    ----------
    n_samples : int, optional
        Number of samples
    n_features : int, optional
        Number of features
    n_redundant : int, optional
        Number of features which are part of redundant subsets (weakly relevant)
    strRel : int, optional
        Number of features which are mandatory for the underlying model (strongly relevant)
    n_repeated : int, optional
        Number of features which are clones of existing ones.
    noise : float, optional
        Noise of the created samples around ground truth.
    random_state : object, optional
        Randomstate object used for generation.
    partition : optional
        Sizes of the weakly relevant subsets (presumably a list of ints -
        confirm against _fillVariableSpace). When None, a list of
        ``int(n_redundant / 2)`` entries, each equal to 2, is used.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.
    y : array of shape [n_samples]
        The output values (target).

    Raises
    ------
    ValueError
        Wrong parameters for specified amount of features/samples.
    """
    # NOTE: locals() is forwarded as keyword arguments below, so the names of
    # the parameters and local variables in this function are part of the
    # contract with _checkParam and _fillVariableSpace. Do not rename them.
    # At this point locals() contains only the function parameters.
    _checkParam(**locals())
    random_state = check_random_state(random_state)

    # Placeholder array; it is replaced by the result of _fillVariableSpace
    # below, which also receives it through locals().
    X = np.zeros((int(n_samples), int(n_features)))

    # Find partitions which define the weakly relevant subsets
    if partition is None:
        # Legacy behaviour yielding subsets of size 2
        partition = int(n_redundant / 2) * [2]
    part_size = len(partition)

    # One generated column per strongly relevant feature plus one per
    # partition entry; only the first strRel columns are informative.
    X_informative, Y = make_regression(n_features=int(strRel + part_size),
                                       n_samples=int(n_samples),
                                       noise=noise,
                                       n_informative=int(strRel),
                                       random_state=random_state,
                                       shuffle=False)

    # Every local defined above (X, X_informative, Y, partition, part_size,
    # random_state and the parameters) is passed on by name.
    X = _fillVariableSpace(**locals())

    return X, Y
def test_sparse_regression():
    """AdaBoostRegressor must give identical predictions on sparse and dense input.

    For each sparse container type the regressor is trained once on the
    sparse data and once on the dense data. ``predict`` and ``staged_predict``
    results are compared, and every base estimator trained on sparse input
    must have seen a CSC or CSR matrix.
    """
    # Check regression with sparse input.

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Exact type comparison is intentional: the recorded type must be
        # one of these two classes.
        types = [i.data_type_ for i in sparse_classifier.estimators_]
        assert all(t == csc_matrix or t == csr_matrix for t in types)