The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.model_selection.GridSearchCV().
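Before the project-specific examples, here is a minimal self-contained sketch of the typical GridSearchCV workflow. The estimator (SVC), the iris toy dataset, and the parameter grid are illustrative choices only and are not taken from any of the projects below.

# Minimal GridSearchCV sketch: exhaustive search over a small, hypothetical SVC parameter grid,
# using the bundled iris data purely for illustration.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}  # illustrative grid
gs = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)          # 5-fold CV over all 6 combinations
gs.fit(X_train, y_train)

print(gs.best_params_)            # best parameter combination found by CV
print(gs.best_score_)             # mean cross-validated score of the best estimator
print(gs.score(X_test, y_test))   # refit best estimator evaluated on held-out data

The examples that follow show the same pattern (build an estimator or pipeline, define a grid, fit, read best_params_ / best_score_ / best_estimator_) in a variety of real projects.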
def train_model_with_cv(model, params, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # Use train data for parameter selection in a grid search
    gs_clf = GridSearchCV(model, params, n_jobs=1, cv=5)
    gs_clf = gs_clf.fit(X_train, y_train)
    model = gs_clf.best_estimator_

    # Use best model and test data for final evaluation
    y_pred = model.predict(X_test)

    _f1 = f1_score(y_test, y_pred, average='micro')
    _confusion = confusion_matrix(y_test, y_pred)
    __precision = precision_score(y_test, y_pred)
    _recall = recall_score(y_test, y_pred)

    _statistics = {'f1_score': _f1,
                   'confusion_matrix': _confusion,
                   'precision': __precision,
                   'recall': _recall
                   }

    return model, _statistics
def fit(self, X, y=None):
    """Fitting function on the data."""
    if self.data_normalizer is not None:
        X = self.normalize_data(X)
    if self.label_normalizer is not None:
        y = self.normalize_label(y)

    if self.force_classifier:
        clf = make_classifier(self.learner, params=self.learner_options)
    elif callable(self.learner):
        # self.learner = type(self.learner)
        clf = self.learner(**self.learner_options)
    else:
        clf = self.learner

    self.gs_ = GridSearchCV(estimator=clf, **self.cv_options)
    self.gs_.fit(X, y)
def train(self, train_size=0.8, k_folds=5):
    # retrieve data from DB and pre-process
    self._get_data()

    # perform train/test split
    self._get_train_test_split(train_size=train_size)

    # define text pre-processing pipeline
    text_pipeline = Pipeline([
        ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
        ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
    ])

    # define pipeline for pre-processing of numeric features
    numeric_pipeline = Pipeline([
        ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
        ('scaler', MinMaxScaler())
    ])

    # combine both steps into a single pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_processing', text_pipeline),
            ('num_processing', numeric_pipeline)
        ])),
        ('clf', self._estimator)
    ])

    self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
    gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

    X = self.data.iloc[self.train_inds_, :]
    y = self.data[LABEL].values[self.train_inds_]

    gs.fit(X, y)
    self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

    self.gs_ = gs
    self.model_ = gs.best_estimator_
def fit(self, X, y=None, groups=None):
    """Run fit with all sets of parameters.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape=(n_samples,) or (n_samples, n_output), optional (default=None)
        Target relative to X for classification or regression;
        None for unsupervised learning.

    groups : array-like, shape=(n_samples,), optional (default=None)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    """
    return super(GridSearchCV, self).fit(X, _as_numpy(y), groups)
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=4, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
        if len(params_untuned) == 1:
            return v
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=1, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
def tune_classifier(estimator, params, X_train, Y_train, scoring='roc_auc', n_jobs=3, cv=5):
    results = []
    for k, values in params.items():
        params_single = {k: values}  # key the grid by the actual parameter name
        print '========== ', params_single, ' =============='
        grid_search = GridSearchCV(estimator, param_grid=params_single, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=5)
        grid_search.fit(X_train, Y_train)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        results.append(grid_search.best_params_)
    return results
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # if(param_untuned.keys()[0] == 'n_estimators'):
        #     cv = 1
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator, params_sklearn
def test_pipeline(get_models, get_transform, get_kernel):
    alg, model = get_models
    trans = get_transform()
    kernel = get_kernel() + WhiteKernel()

    pipe = Pipeline(steps=[(alg, model())])
    param_dict = {}
    if hasattr(model(), 'n_estimators'):
        param_dict[alg + '__n_estimators'] = [5]
    if hasattr(model(), 'kernel'):
        param_dict[alg + '__kernel'] = [kernel]
    param_dict[alg + '__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )

    np.random.seed(10)
    estimator.fit(X=1 + np.random.rand(10, 3), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -15.0
def test_svr_pipeline(get_transform, get_svr_kernel):
    trans = get_transform()
    pipe = Pipeline(steps=[('svr', svr())])
    param_dict = {'svr__kernel': [get_svr_kernel]}
    param_dict['svr__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
    np.random.seed(1)
    estimator.fit(X=1 + np.random.rand(10, 5), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -10.0
def test_krige_pipeline(get_krige_method, get_variogram_model):
    pipe = Pipeline(steps=[('krige', Krige(method=get_krige_method))])
    param_dict = {'krige__variogram_model': [get_variogram_model]}

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True
                             )
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 2)).astype(float)
    y = 5 * np.random.rand(20)
    estimator.fit(X=X, y=y)
    assert estimator.cv_results_['mean_train_score'][0] > -1.0
def test_cv():
    """Simple CV check."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    cv = KFold(X.shape[0], 5)

    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
    # check that it returns 5 scores
    scores = cross_val_score(glm_normal, X, y, cv=cv)
    assert_equal(len(scores), 5)

    param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
                  {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), 10,
                                             base=np.exp(1))}]
    glmcv = GridSearchCV(glm_normal, param_grid, cv=cv)
    glmcv.fit(X, y)
def setBestParameters(self):
    cv = StratifiedKFold(n_splits=self.conf.num_folds)
    param_grid = self.conf.getParamGrid()
    if param_grid is None:
        # No parameter value to select
        return
    if self.conf.families_supervision:
        scoring = 'f1_macro'
    else:
        scoring = 'roc_auc'
    grid_search = GridSearchCV(self.pipeline, param_grid=param_grid,
                               scoring=scoring, cv=cv, n_jobs=-1,
                               fit_params={'model__sample_weight': self.datasets.sample_weight})
    grid_search.fit(self.datasets.train_instances.getFeatures(),
                    self.getSupervision(self.datasets.train_instances))
    self.conf.setBestValues(grid_search)
    self.pipeline.set_params(**self.conf.getBestValues())
    return cv
def xgb_model_select(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def gbdt_select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [100, 120], 'max_depth': [4, 5, 6]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [10000, 12000], 'max_depth': [16, 15, 14]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def xgb_model_select(train_file_name):
    train_df = merge_features_to_use(train_file_name)
    train_df.drop(['conversionTime'], axis=1, inplace=True)

    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()

    train_np = train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140],
                  'max_depth': [3, 5, 7, 9],
                  'gamma': [0.1, 0.3, 0.5, 0.7],
                  'min_child_weight': [1, 3, 5, 7],
                  }
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def grid(X, y):
    '''
    Adapted from:
    http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py

    Perform a grid search.
    '''
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=8)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def fit(self, df, y, param_grid=None):
    from sklearn.model_selection import GridSearchCV

    X = df.drop(y, axis=1).values
    y = df[y].values
    meta_X = self.get_meta(X)

    if param_grid is not None:
        model = self.stacked_model_class()
        gridsearch = GridSearchCV(model, param_grid)
        gridsearch.fit(meta_X, y)
        self.stacked_model = self.stacked_model_class(**gridsearch.best_params_)
    else:
        self.stacked_model = self.stacked_model_class()

    self.stacked_model.fit(meta_X, y)
def grid_search_cv(clf, x, y, params, cv=5):
    """
    :param clf: The classifier over which we want to perform gridsearch.
    :param x: Features
    :param y: Target
    :param params: Hyperparameters to perform gs on
    :cv: kfold cv parameter
    """
    gs = GridSearchCV(clf, param_grid=params, cv=cv)
    gs.fit(x, y)
    print
    print 'BEST PARAMS:', gs.best_params_
    print 'BEST SCORE:', gs.best_score_
    print
    best_estimator = gs.best_estimator_
    return best_estimator

######################
# PREPARING THE DATA #
######################

# get the last 4 images from each file
def LogisticRegression(X_train, y_train):
    from sklearn.linear_model import LogisticRegression
    parameters = {
        'C': [0.6, 0.8, 1.0, 1.2],
        'class_weight': [None, 'balanced'],
    }
    LR = LogisticRegression()
    grid_search = GridSearchCV(estimator=LR, param_grid=parameters, cv=5,
                               scoring='neg_log_loss', n_jobs=4)

    now = datetime.datetime.now()
    print("logistic regression grid_search start in " + now.strftime('%Y-%m-%d %H:%M:%S'))
    grid_search.fit(X_train, y_train)
    print("logistic regression grid_search done in " + now.strftime('%Y-%m-%d %H:%M:%S'))

    results = grid_search.grid_scores_
    for result in results:
        print(result)

    print("\nBest score: %0.3f\n" % grid_search.best_score_)
    print("---------best parameters---------")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
def build_grid_search(X, y):
    parameters = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [10, 15, 20, 25, None],
        "estimator__max_features": ['auto', 'sqrt', 'log2', None]
    }
    ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                                     oob_score=True,
                                                     n_jobs=-1,
                                                     verbose=1))
    model_tunning = GridSearchCV(ovr, param_grid=parameters, verbose=1,
                                 n_jobs=-1, cv=10,
                                 scoring=make_scorer(f1_score))
    model_tunning.fit(X, y)

    test_score = model_tunning.best_score_
    print 'The best test score: ', test_score
    y_score = model_tunning.predict_proba(X_test)
    multiclass_roc(y_score, 'grid_search_02')
    return model_tunning
def clean_params_for_sk(params: dict) -> dict:
    """
    Given a dictionary of XGB parameters, return a copy without parameters that will cause
    issues with scikit-learn's grid or randomized search estimators.

    :param params: A dictionary of XGB parameters.
    :return: A copy of the same dictionary without the aforementioned problematic parameters.
    """
    # In the xgb.cv call, nthread should be equal to the CPU count, but this causes a hang when
    # called through GridSearchCV - parallelism should be achieved through its n_jobs parameter.
    # See https://github.com/scikit-learn/scikit-learn/issues/6627 for more details.
    params_copy = params.copy()
    params_copy['nthread'] = 1

    # In multiclass problems, this parameter is required for XGBoost, but is not a parameter
    # of interest to be tuned.
    if 'num_class' in params_copy.keys():
        del params_copy['num_class']

    return params_copy
def fit(self, X, *args, **kwargs):
    if self._grid_search:
        model = GridSearchCV(self._model, **self._grid_search)
    elif self._random_search:
        model = RandomizedSearchCV(self._model, **self._random_search)
    else:
        model = self._model

    if self._grid_search is not None:
        self._grid = model
    elif self._random_search is not None:
        self._rnd = model

    assert (self.target in X.columns.values), 'X must contain the target column'
    self._xcols = list(X.columns.values)
    self._xcols.remove(self.target)
    if len(self._columns_exclude) == 0 and len(self._columns_include) > 0:
        self._columns_exclude = list(set(self._xcols) - set(self._columns_include))
    [self._xcols.remove(t) for t in self._columns_exclude]

    x = X[self._xcols]
    y = X[self.target]
    model.fit(x, y, **kwargs)
    return self
def gs_numpy(method, X, Y, alphas_log=(-1, 1, 9), n_splits=5, n_jobs=-1, disp=True):
    """
    Grid search method with numpy array of X and Y
    Previously, np.mat are used for compatible with Matlab notation.
    """
    if disp:
        print(X.shape, Y.shape)

    clf = getattr(linear_model, method)()
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf5 = kf5_c.split(X)

    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, Y)
    return gs
def gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1):
    """
    gs = gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1)

    Inputs
    ======
    classifier = svm.SVC(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    # print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(classifier, params, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(xM, yVc)
    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_splits=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.

    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf_n = kf5_ext_c.split(A_list[0])
    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_BIKE_Ridge(A_list, yV, alphas_log=(1, -1, 9), X_concat=None, n_splits=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge(A_list, X_concat)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.

    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf_n = kf5_ext_c.split(A_list[0])
    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model = svm.SVC(), or linear_model.LinearRegression(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    # print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    gs = model_selection.GridSearchCV(model, param_grid, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, y)

    if graph:
        plt.plot(gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot(gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()

    return gs
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_folds=5, n_jobs=-1):
    print(xM.shape, yV.shape)

    clf = linear_model.Lasso()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    # model_selection.KFold takes n_splits (not n_folds)
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf5, n_jobs=n_jobs)
    gs.fit(xM, yV)
    return gs
def _gs_SVC_r0(xM, yVc, params):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5_c = model_selection.KFold(n_splits=5, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)
    gs.fit(xM, yVc)
    return gs
def gs_SVC(xM, yVc, params, n_folds=5):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)
    gs.fit(xM, yVc)
    return gs
def gs_Ridge(xM, yV, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1, scoring='r2'):
    """
    Parameters
    -------------
    scoring: mean_absolute_error, mean_squared_error, median_absolute_error, r2
    """
    print('If scoring is not r2 but an error metric, the output score sign is reversed for scoring!')
    print(xM.shape, yV.shape)

    clf = linear_model.Ridge()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring=scoring, cv=kf_n, n_jobs=n_jobs)
    gs.fit(xM, yV)
    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.

    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(A_list)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf_n, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def train_logistic():
    df = pd.read_csv(config.activations_path)
    df, y, classes = encode(df)
    X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.2, random_state=17)

    params = {'C': [10, 2, .9, .4, .1],
              'tol': [0.0001, 0.001, 0.0005]}
    log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
    clf = GridSearchCV(log_reg, params, scoring='neg_log_loss', refit=True, cv=3, n_jobs=-1)
    clf.fit(X_train, y_train)

    print("best params: " + str(clf.best_params_))
    print("Accuracy: ", accuracy_score(y_test, clf.predict(X_test)))

    setattr(clf, '__classes', classes)
    # save results for further using
    joblib.dump(clf, config.get_novelty_detection_model_path())
def perform():
    # Create a new grid search classifier from a sci-kit pipeline
    model = GridSearchCV(pipeline(), gs_clf_params(), n_jobs=-1)

    # Get your training and testing sets of data with 50/50 split
    (train_data, train_targets), (test_data, test_targets) = dp.get_data()

    # Train your model
    model = model.fit(train_data, train_targets)

    # Test its accuracy
    predictions = model.predict(test_data)

    # Display the model's accuracy
    print "\nModel Accuracy: {}\n".format(np.mean(predictions == test_targets))

    # Save the trained model to disk
    save_model(model)
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
def test_grid_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    grid_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(grid_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(grid_search.grid_scores_[i][0] == {'foo_param': foo_i})

    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)

    # Test exception handling on scoring
    grid_search.scoring = 'sklearn'
    assert_raises(ValueError, grid_search.fit, X, y)
def test_grid_search_labels():
    # Check if ValueError (when labels is None) propagates to GridSearchCV
    # And also check if labels is correctly passed to the cv object
    rng = np.random.RandomState(0)
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for cv in label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             gs.fit, X, y)
        gs.fit(X, y, labels)

    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_true(np.mean(y_pred == y_pred2) >= .9)
    assert_equal(C, C2)
def test_pandas_input():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((DataFrame, Series))
    except ImportError:
        pass

    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    for InputFeatureType, TargetType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)

        grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
        grid_search.fit(X_df, y_ser).score(X_df, y_ser)
        grid_search.predict(X_df)
        assert_true(hasattr(grid_search, "grid_scores_"))
def test_ridgecv_sample_weight():
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so test them both.
    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        cv = KFold(5)
        ridgecv = RidgeCV(alphas=alphas, cv=cv)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Check using GridSearchCV directly
        parameters = {'alpha': alphas}
        fit_params = {'sample_weight': sample_weight}
        gs = GridSearchCV(Ridge(), parameters, fit_params=fit_params, cv=cv)
        gs.fit(X, y)

        assert_equal(ridgecv.alpha_, gs.best_estimator_.alpha)
        assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_)
def print_training_summary(self, gs):
    print('The best CV score from GridSearchCV (by default averaging across k-fold CV) for ' + self.output_column + ' is:')
    if self.took_log_of_y:
        print('    Note that this score is calculated using the natural logs of the y values.')
    print(gs.best_score_)
    print('The best params were')

    # Remove 'final_model__model' from what we print - it's redundant with model name,
    # and is difficult to read quickly in a list since it's a python object.
    if 'model' in gs.best_params_:
        printing_copy = {}
        for k, v in gs.best_params_.items():
            if k != 'model':
                printing_copy[k] = v
            else:
                printing_copy[k] = utils_models.get_name_from_model(v)
    else:
        printing_copy = gs.best_params_

    print(printing_copy)

    if self.verbose:
        print('Here are all the hyperparameters that were tried:')
        raw_scores = gs.grid_scores_
        sorted_scores = sorted(raw_scores, key=lambda x: x[1], reverse=True)
        for score in sorted_scores:
            for k, v in score[0].items():
                if k == 'model':
                    score[0][k] = utils_models.get_name_from_model(v)
            print(score)
def test_model_assessment():
    X, y = make_classification(n_samples=40, n_features=100, n_informative=2,
                               n_classes=2, n_redundant=0)
    pipe = Pipeline([('enet', ElasticNetFeatureSelection()),
                     ('ridge', RidgeClassifier())])
    ma = ModelAssessment(GridSearchCV(pipe, {'enet__l1_ratio': [2]})).fit(X, y)
    assert len(ma.cv_results_) == 0
def _get_best_params(obj):
    # if obj is a ModelAssessment, then get the first GridSearch
    if isinstance(obj, ModelAssessment):
        obj = pd.DataFrame(obj.cv_results_).sort_values(
            'test_score', ascending=False).iloc[0].estimator
    elif not isinstance(obj, GridSearchCV):
        raise NotImplementedError("This can only work with a ModelAssessment "
                                  "or GridSearchCV object. You passed "
                                  "a %s object" % obj.__class__.__name__)
    return obj.best_params_
def cv_results_(self):
    """Get GridSearchCV results."""
    check_is_fitted(self, 'gs_')
    return self.gs_.cv_results_
def best_params_(self):
    """Get GridSearchCV best_params."""
    check_is_fitted(self, 'gs_')
    return self.gs_.best_params_
def tune_n_estimators_cv(estimator, params, X_train, Y_train):
    grid_search = GridSearchCV(estimator, param_grid=params, scoring='roc_auc',
                               n_jobs=-1, cv=10, verbose=10)
    grid_search.fit(X_train, Y_train)
    return grid_search.best_params_