The following 12 code examples, extracted from open-source Python projects, illustrate how to use sklearn.model_selection.RandomizedSearchCV().
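Before the project examples, here is a minimal, self-contained sketch of the typical RandomizedSearchCV workflow. The estimator, dataset, parameter distributions, and n_iter below are illustrative assumptions chosen for this sketch; they are not taken from the examples that follow.

# Minimal sketch: sample 20 random hyperparameter settings for a random forest
# and keep the best one found by 5-fold cross-validation.
from scipy.stats import randint
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = load_digits(return_X_y=True)

param_distributions = {
    'n_estimators': randint(50, 200),   # scipy distributions are sampled via rvs()
    'max_depth': [None, 5, 10, 20],     # lists are sampled uniformly
    'max_features': ['sqrt', 'log2'],
}

search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            param_distributions,
                            n_iter=20, cv=5, scoring='accuracy', random_state=0)
search.fit(X, y)
print(search.best_params_, search.best_score_)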
def fit(self, X, y=None, groups=None):
    """Run fit on the estimator with randomly drawn parameters.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape=(n_samples,) or (n_samples, n_output), optional (default=None)
        Target relative to X for classification or regression;
        None for unsupervised learning.

    groups : array-like, shape=(n_samples,), optional (default=None)
        Group labels for the samples used while splitting the dataset
        into train/test set.
    """
    return super(RandomizedSearchCV, self).fit(X, _as_numpy(y), groups)
def fit(self, X, *args, **kwargs):
    if self._grid_search:
        model = GridSearchCV(self._model, **self._grid_search)
    elif self._random_search:
        model = RandomizedSearchCV(self._model, **self._random_search)
    else:
        model = self._model

    if self._grid_search is not None:
        self._grid = model
    elif self._random_search is not None:
        self._rnd = model

    assert (self.target in X.columns.values), 'X must contain the target column'
    self._xcols = list(X.columns.values)
    self._xcols.remove(self.target)

    if len(self._columns_exclude) == 0 and len(self._columns_include) > 0:
        self._columns_exclude = list(set(self._xcols) - set(self._columns_include))
    [self._xcols.remove(t) for t in self._columns_exclude]

    x = X[self._xcols]
    y = X[self.target]

    model.fit(x, y, **kwargs)
    return self
def test_RandomizedSearchCV():
    '''
    Use RandomizedSearchCV with LogisticRegression to tune C and multi_class.

    :return: None
    '''
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                        test_size=0.25, random_state=0,
                                                        stratify=digits.target)
    tuned_parameters = {'C': scipy.stats.expon(scale=100),
                        'multi_class': ['ovr', 'multinomial']}
    clf = RandomizedSearchCV(LogisticRegression(penalty='l2', solver='lbfgs', tol=1e-6),
                             tuned_parameters, cv=10, scoring="accuracy", n_iter=100)
    clf.fit(X_train, y_train)
    print("Best parameters set found:", clf.best_params_)
    print("Randomized Grid scores:")
    for params, mean_score, scores in clf.grid_scores_:
        print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params))
    print("Optimized Score:", clf.score(X_test, y_test))
    print("Detailed classification report:")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random
    data in order to assert that the test error will far exceed the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1,
                              cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def fit(self, X, Y):
    """
    Train classifier.

    Parameters
    ----------
    X : np.array [n_samples, n_features]
        Training features.
    Y : np.array [n_samples]
        Training labels.
    """
    x_shuffle, y_shuffle = shuffle(X, Y, random_state=self.random_state)

    clf_cv = RFC(n_estimators=self.n_estimators,
                 n_jobs=self.n_jobs,
                 class_weight=self.class_weight,
                 random_state=self.random_state)

    param_dist = {
        "max_depth": sp_randint(1, 101),
        "max_features": [None, 'auto', 'sqrt', 'log2'],
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    random_search = RandomizedSearchCV(
        clf_cv,
        param_distributions=param_dist,
        refit=True,
        n_iter=self.n_iter_search,
        scoring='f1_weighted',
        random_state=self.random_state
    )
    random_search.fit(x_shuffle, y_shuffle)
    self.clf = random_search.best_estimator_
def svc_model(self, X, y):
    X, y = shuffle(X, y, random_state=1337)
    svc = SVC(kernel='rbf', cache_size=self.cache_size, verbose=True)
    clf = RandomizedSearchCV(svc, param_distributions=self.params,
                             n_iter=self.iters, n_jobs=-1, verbose=self.verbose)
    model = clf.fit(X[0:self.sample_size], y[0:self.sample_size])
    logging.info('Best Params ' + str(model.best_params_))
    logging.info('Best Score ' + str(model.best_score_))
    return model.best_estimator_
def get_algorithm(estimator,
                  scoring_metric,
                  hyperparameter_grid,
                  randomized_search,
                  number_iteration_samples=10,
                  **non_randomized_estimator_kwargs):
    """
    Given an estimator and various params, initialize an algorithm with optional randomized search.

    Args:
        estimator (sklearn.base.BaseEstimator): a scikit-learn estimator (for example: KNeighborsClassifier)
        scoring_metric (str): The scoring metric to optimize for if using random search. See
            http://scikit-learn.org/stable/modules/model_evaluation.html
        hyperparameter_grid (dict): An object containing key/value pairs of the specific hyperparameter space to
            search through.
        randomized_search (bool): Whether the method should return a randomized search estimator (as opposed to a
            simple algorithm).
        number_iteration_samples (int): If performing randomized search, this is the number of samples that are run
            in the hyperparameter space. Higher numbers will be slower, but end up with better results, since it is
            more likely that the true optimal hyperparameters are found.
        **non_randomized_estimator_kwargs: Keyword arguments that you can pass directly to the algorithm. Only used
            when randomized_search is False.

    Returns:
        sklearn.base.BaseEstimator: a scikit-learn algorithm ready to `.fit()`
    """
    if randomized_search:
        algorithm = RandomizedSearchCV(estimator=estimator(),
                                       scoring=scoring_metric,
                                       param_distributions=hyperparameter_grid,
                                       n_iter=number_iteration_samples,
                                       verbose=0,
                                       n_jobs=1)
    else:
        algorithm = estimator(**non_randomized_estimator_kwargs)

    return algorithm
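As a hedged usage sketch of the helper above: the KNeighborsClassifier, scoring metric, and grid used here are illustrative assumptions, not part of the original project.

from sklearn.neighbors import KNeighborsClassifier

# Hypothetical call: wrap KNeighborsClassifier in a randomized search over n_neighbors.
knn_search = get_algorithm(estimator=KNeighborsClassifier,
                           scoring_metric='accuracy',
                           hyperparameter_grid={'n_neighbors': [1, 3, 5, 7, 9]},
                           randomized_search=True,
                           number_iteration_samples=5)
# knn_search.fit(X_train, y_train) would then run the search on your own data.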
def test_trivial_grid_scores():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1]})
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "grid_scores_"))

    random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
    random_search.fit(X, y)
    assert_true(hasattr(random_search, "grid_scores_"))
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
                               random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=expon(scale=10),
                  gamma=expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
                                param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores),
                            cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())),
                     list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_,
                                     key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores
                        if s.mean_validation_score == best_score]
    assert_true(search.best_params_ in tied_best_params,
                "best_params_={0} is not part of the"
                " tied best models: {1}".format(
                    search.best_params_, tied_best_params))
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator
    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)
    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: StratifiedKFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls: The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn: The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take - e.g.
        'neg_mean_squared_error' for RMSE.
    :param n_jobs: The number of jobs to run simultaneously.
    :param params: A dictionary of XGB parameters.
    :param strat_folds: A StratifiedKFold object to cross validate the parameters.
    :param train: An array-like containing the training input samples.
    :param n_iter: An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level: An optional parameter to control the verbosity of the grid searching - defaults to the
        most verbose option.
    :param kwargs: Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly
        between 0.5 and 0.7 for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return: A dictionary of tuned parameters and a list of the parameters found at each step with their respective
        scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2),
                                    kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1),
                                       kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)

    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    pickle.dumps(grid_search)  # smoke test

    random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
                                       refit=True, n_iter=3)
    random_search.fit(X, y)
    pickle.dumps(random_search)  # smoke test