def test_w_prep_fit():
    """[Model Selection] Test run with preprocessing, single step."""
    evaluator = Evaluator(
        mape_scorer, cv=5, shuffle=False, random_state=100, verbose=True)

    # Verbose output is discarded so the test run stays quiet.
    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        evaluator.fit(
            X, y,
            estimators=[OLS()],
            param_dicts={'ols': {'offset': randint(1, 10)}},
            preprocessing={'pr': [Scale()], 'no': []},
            n_iter=3)

    # Results are keyed as '<preprocessing case>.<estimator>'.
    mean_scores = evaluator.results['test_score-m']
    np.testing.assert_approx_equal(mean_scores['no.ols'], -24.903229451043195)
    # The scaled case is only checked to one significant digit.
    np.testing.assert_approx_equal(mean_scores['pr.ols'], -26.510708862278072, 1)

    best_params = evaluator.results['params']
    assert best_params['no.ols']['offset'] == 4
    assert best_params['pr.ols']['offset'] == 4
def get_uniform_paramgrid(hyperparameters, fixed_parameters):
    """Translate hyperparameter definitions into a randomized-search space.

    Parameters
    ----------
    hyperparameters : dict
        Maps a parameter name to a hyperparameter object. Supported types
        are ``CategoricalHyperparameter``, ``UniformFloatHyperparameter``
        and ``UniformIntegerHyperparameter``.
    fixed_parameters : dict or None
        Parameters named here are skipped and do not appear in the result.

    Returns
    -------
    dict
        Maps each remaining parameter name to either a list of candidate
        values (categorical) or a distribution object (numeric).

    Raises
    ------
    ValueError
        If a hyperparameter is of none of the supported types.
    """
    param_grid = {}
    for param_name, hyperparameter in hyperparameters.items():
        if fixed_parameters is not None and param_name in fixed_parameters:
            continue

        if isinstance(hyperparameter, CategoricalHyperparameter):
            all_values = hyperparameter.choices
            if all(item in ('True', 'False') for item in all_values):
                # Compare against the literal: bool('False') is True because
                # any non-empty string is truthy, which would turn every
                # choice into True.
                all_values = [item == 'True' for item in all_values]
            param_grid[param_name] = all_values
        elif isinstance(hyperparameter, UniformFloatHyperparameter):
            if hyperparameter.log:
                param_grid[param_name] = loguniform(
                    base=2,
                    low=hyperparameter.lower,
                    high=hyperparameter.upper)
            else:
                # uniform() is parameterised by (loc, scale), i.e. the lower
                # bound and the width of the interval.
                param_grid[param_name] = uniform(
                    loc=hyperparameter.lower,
                    scale=hyperparameter.upper - hyperparameter.lower)
        elif isinstance(hyperparameter, UniformIntegerHyperparameter):
            if hyperparameter.log:
                param_grid[param_name] = loguniform_int(
                    base=2,
                    low=hyperparameter.lower,
                    high=hyperparameter.upper)
            else:
                # randint's upper bound is exclusive, hence the +1 so that
                # ``upper`` itself can be drawn.
                param_grid[param_name] = randint(
                    low=hyperparameter.lower,
                    high=hyperparameter.upper + 1)
        else:
            raise ValueError(
                'Unsupported hyperparameter type for %r: %s'
                % (param_name, type(hyperparameter).__name__))
    return param_grid
def test_large_grid():
    """Run a randomized search over a RandomForest pipeline on random data.

    The forest is purposely overfit to completely random data, so the test
    error is expected to far exceed the train error. Both accuracy scores
    are computed below but neither is asserted on.
    """
    # KFold's constructor differs between scikit-learn versions.
    if SK18:
        folds = KFold(n_splits=3, shuffle=True, random_state=42)
    else:
        folds = KFold(n=y_train.shape[0], n_folds=3, shuffle=True,
                      random_state=42)

    # pipeline under search
    steps = [
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ]
    pipeline = Pipeline(steps)

    # search space
    search_space = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    search = RandomizedSearchCV(pipeline, search_space,
                                n_iter=2,
                                scoring='accuracy',
                                n_jobs=1,
                                cv=folds,
                                random_state=42)

    # scoring before fitting must fail
    assert_fails(search.score, (ValueError, AttributeError), X_train, y_train)

    search.fit(X_train, y_train)

    # score for coverage -- this might warn, so warnings are silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        search.score(X_train, y_train)

    # coverage:
    assert search._estimator_type == 'classifier'

    # predictions on both splits
    train_pred = search.predict(X_train)
    test_pred = search.predict(X_test)

    # evaluate score (SHOULD be better than random...); results are unused
    accuracy_score(y_train, train_pred)
    accuracy_score(y_test, test_pred)

    # grid score reports: out-of-range percentiles must be rejected
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=search, percentile=0.0)
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=search, percentile=1.0)

    # an unknown y_axis must be rejected as well
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=search, y_axis='bad_axis')

    # valid call: just ensure percentile works
    report_grid_score_detail(search, charts=True, percentile=0.95)
def fit(self, X, Y):
    """
    Train classifier.

    The data is shuffled, then a randomized hyper-parameter search is run
    over a forest classifier, scored with weighted F1. The refit best
    estimator is stored on ``self.clf``.

    Parameters
    ----------
    X : np.array [n_samples, n_features]
        Training features.
    Y : np.array [n_samples]
        Training labels
    """
    x_shuffle, y_shuffle = shuffle(X, Y, random_state=self.random_state)

    clf_cv = RFC(n_estimators=self.n_estimators,
                 n_jobs=self.n_jobs,
                 class_weight=self.class_weight,
                 random_state=self.random_state)

    param_dist = {
        "max_depth": sp_randint(1, 101),
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
        # forest classifiers it was an alias of 'sqrt' (assuming RFC is
        # scikit-learn's RandomForestClassifier). It is replaced in place,
        # keeping four entries, so each draw picks the same slot as before.
        "max_features": [None, 'sqrt', 'sqrt', 'log2'],
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    random_search = RandomizedSearchCV(
        clf_cv,
        param_distributions=param_dist,
        refit=True,
        n_iter=self.n_iter_search,
        scoring='f1_weighted',
        random_state=self.random_state
    )
    random_search.fit(x_shuffle, y_shuffle)

    # refit=True means best_estimator_ is already trained on the full data.
    self.clf = random_search.best_estimator_
def test_params():
    """[Model Selection] Test raises on bad params."""
    evaluator = Evaluator(mape_scorer, verbose=2)

    # The param_dicts key 'bad.ols' is the deliberately bad input here;
    # the only preprocessing case supplied is named 'prep'.
    fit_kwargs = {
        'estimators': [OLS()],
        'param_dicts': {'bad.ols': {'offset': randint(1, 10)}},
        'preprocessing': {'prep': [Scale()]},
    }
    np.testing.assert_raises(ValueError, evaluator.fit, X, y, **fit_kwargs)
def test_raises():
    """[Model Selection] Test raises on error."""
    evaluator = Evaluator(bad_scorer, verbose=1)

    # Verbose output is sent to the null device while fit is exercised.
    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        fit_kwargs = {
            'estimators': [OLS()],
            'param_dicts': {'ols': {'offset': randint(1, 10)}},
            'n_iter': 1,
        }
        np.testing.assert_raises(
            ValueError, evaluator.fit, X, y, **fit_kwargs)
def test_passes():
    """[Model Selection] Test sets error score on failed scoring."""
    evaluator = Evaluator(bad_scorer, error_score=0, n_jobs=1, verbose=5)

    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        # assert_warns hands back whatever the wrapped call returns, so the
        # result of fit is what gets inspected afterwards.
        fitted = np.testing.assert_warns(
            FitFailedWarning,
            evaluator.fit, X, y,
            estimators=[OLS()],
            param_dicts={'ols': {'offset': randint(1, 10)}},
            n_iter=1)

    # The configured error_score must be reported as the mean test score.
    assert fitted.results['test_score-m']['ols'] == 0
def test_no_prep():
    """[Model Selection] Test run without preprocessing."""
    evaluator = Evaluator(
        mape_scorer, cv=5, shuffle=False, random_state=100, verbose=12)

    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        evaluator.fit(
            X, y,
            estimators=[OLS()],
            param_dicts={'ols': {'offset': randint(1, 10)}},
            n_iter=3)

    # Without preprocessing the result key is just the estimator name.
    mean_score = evaluator.results['test_score-m']['ols']
    np.testing.assert_approx_equal(mean_score, -24.903229451043195)

    chosen = evaluator.results['params']['ols']
    assert chosen['offset'] == 4
def test_w_prep_set_params():
    """[Model Selection] Test run with preprocessing, sep param dists."""
    evaluator = Evaluator(
        mape_scorer, cv=5, shuffle=False, random_state=100, verbose=2)

    # Each '<case>.<estimator>' key gets its own distribution.
    search_spaces = {
        'no.ols': {'offset': randint(3, 6)},
        'pr.ols': {'offset': randint(1, 3)},
    }

    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        evaluator.fit(
            X, y,
            estimators={'pr': [OLS()], 'no': [OLS()]},
            param_dicts=search_spaces,
            preprocessing={'pr': [Scale()], 'no': []},
            n_iter=10)

    expected_scores = (
        ('no.ols', -18.684229451043198),
        ('pr.ols', -7.2594502123869491),
    )
    for case, expected in expected_scores:
        np.testing.assert_approx_equal(
            evaluator.results['test_score-m'][case], expected)

    expected_offsets = (('no.ols', 3), ('pr.ols', 1))
    for case, offset in expected_offsets:
        assert evaluator.results['params'][case]['offset'] == offset
def test_random_grid():
    """Smoke-test a randomized search over a full preprocessing pipeline."""
    # pipeline steps, in execution order
    steps = [
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        # no object dtypes, so will pass through
        ('encoder', OneHotCategoricalEncoder()),
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ]
    pipeline = Pipeline(steps)

    # hyper-parameters to sample from, addressed as '<step>__<param>'
    search_space = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # n_iter is kept at 2: this only checks that the search runs at all
    randomized = RandomizedSearchCV(pipeline, search_space,
                                    n_iter=2,
                                    scoring='accuracy',
                                    cv=2,
                                    random_state=42)
    randomized.fit(X_train, y_train)

    # the report must accept the fitted search
    report_grid_score_detail(randomized, charts=False)
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    Tune XGB parameters with a cross-validated randomized search.

    :param estimator_cls: The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn: The evaluation metric to be passed to scikit-learn's RandomizedSearchCV as ``scoring`` - see
    http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take - e.g.
    'neg_mean_squared_error'.
    :param n_jobs: The number of jobs to run simultaneously.
    :param params: A dictionary of XGB parameters; it is passed through ``clean_params_for_sk`` before being handed to
    the estimator constructor.
    :param strat_folds: A StratifiedKFold object to cross validate the parameters.
    :param train: An array-like containing the training input samples.
    :param n_iter: An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level: An optional parameter to control the verbosity of the search - defaults to 10.
    :param kwargs: Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly
    between 0.5 and 0.7 for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    Recognised keys are ``<name>_loc`` / ``<name>_scale`` for colsample_bytree, gamma, reg_alpha, reg_lambda and
    subsample, and ``<name>_low`` / ``<name>_high`` for max_depth and min_child_weight.
    :return: A tuple of the best parameters found and a single-element list holding
    ``(best parameters, best score)``.
    """
    params_copy = clean_params_for_sk(params)

    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2),
                                    kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0),
                         kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2),
                                kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1),
                                       kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0),
                              kwargs.get('reg_alpha_scale', 5)),
        # reg_lambda reads its own keys; it previously reused the reg_alpha
        # keys, so reg_lambda_loc / reg_lambda_scale were silently ignored.
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0),
                               kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2),
                             kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)

    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]