The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.model_selection.GridSearchCV().
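Before the project-specific examples, here is a minimal self-contained sketch of the typical GridSearchCV workflow. The estimator (SVC), the iris toy dataset, and the parameter grid are illustrative choices only and are not taken from any of the projects below.

# Minimal GridSearchCV sketch: exhaustive search over a small, hypothetical SVC parameter grid,
# using the bundled iris data purely for illustration.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}  # illustrative grid
gs = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)          # 5-fold CV over all 6 combinations
gs.fit(X_train, y_train)

print(gs.best_params_)            # best parameter combination found by CV
print(gs.best_score_)             # mean cross-validated score of the best estimator
print(gs.score(X_test, y_test))   # refit best estimator evaluated on held-out data

The examples that follow show the same pattern (build an estimator or pipeline, define a grid, fit, read best_params_ / best_score_ / best_estimator_) in a variety of real projects.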
def train_model_with_cv(model, params, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # Use train data for parameter selection in a grid search
    gs_clf = GridSearchCV(model, params, n_jobs=1, cv=5)
    gs_clf = gs_clf.fit(X_train, y_train)
    model = gs_clf.best_estimator_

    # Use best model and test data for final evaluation
    y_pred = model.predict(X_test)

    _f1 = f1_score(y_test, y_pred, average='micro')
    _confusion = confusion_matrix(y_test, y_pred)
    __precision = precision_score(y_test, y_pred)
    _recall = recall_score(y_test, y_pred)

    _statistics = {'f1_score': _f1,
                   'confusion_matrix': _confusion,
                   'precision': __precision,
                   'recall': _recall
                   }

    return model, _statistics
def fit(self, X, y=None):
    """Fitting function on the data."""
    if self.data_normalizer is not None:
        X = self.normalize_data(X)
    if self.label_normalizer is not None:
        y = self.normalize_label(y)

    if self.force_classifier:
        clf = make_classifier(self.learner, params=self.learner_options)
    elif callable(self.learner):
        # self.learner = type(self.learner)
        clf = self.learner(**self.learner_options)
    else:
        clf = self.learner

    self.gs_ = GridSearchCV(estimator=clf, **self.cv_options)
    self.gs_.fit(X, y)
def train(self, train_size=0.8, k_folds=5):
    # retrieve data from DB and pre-process
    self._get_data()

    # perform train/test split
    self._get_train_test_split(train_size=train_size)

    # define text pre-processing pipeline
    text_pipeline = Pipeline([
        ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
        ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
    ])

    # define pipeline for pre-processing of numeric features
    numeric_pipeline = Pipeline([
        ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
        ('scaler', MinMaxScaler())
    ])

    # combine both steps into a single pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_processing', text_pipeline),
            ('num_processing', numeric_pipeline)
        ])),
        ('clf', self._estimator)
    ])

    self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
    gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

    X = self.data.iloc[self.train_inds_, :]
    y = self.data[LABEL].values[self.train_inds_]

    gs.fit(X, y)
    self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

    self.gs_ = gs
    self.model_ = gs.best_estimator_
def fit(self, X, y=None, groups=None):
    """Run fit with all sets of parameters.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape=(n_samples,) or (n_samples, n_output), optional (default=None)
        Target relative to X for classification or regression;
        None for unsupervised learning.

    groups : array-like, shape=(n_samples,), optional (default=None)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    """
    return super(GridSearchCV, self).fit(X, _as_numpy(y), groups)
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=4, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
        if len(params_untuned) == 1:
            return v
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=1, cv=5):
    # global dtrain_whole
    global num_boost_round
    global params_sklearn
    # global x
    # global y
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
def tune_classifier(estimator, params, X_train, Y_train, scoring='roc_auc', n_jobs=3, cv=5):
    results = []
    for k, values in params.items():
        params_single = {k: values}  # key the grid by the actual parameter name
        print '========== ', params_single, ' =============='
        grid_search = GridSearchCV(estimator, param_grid=params_single, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=5)
        grid_search.fit(X_train, Y_train)
        df0 = pd.DataFrame(grid_search.cv_results_)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        # print df0
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        # print grid_search.cv_results_
        results.append(grid_search.best_params_)
    return results
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):
    for param_untuned in params_untuned:
        print '========== ', param_untuned, ' =============='
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        # if(param_untuned.keys()[0] == 'n_estimators'):
        #     cv = 1
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring,
                                   n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print df
        print 'the best_params : ', grid_search.best_params_
        print 'the best_score : ', grid_search.best_score_
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator, params_sklearn
def test_pipeline(get_models, get_transform, get_kernel):
    alg, model = get_models
    trans = get_transform()
    kernel = get_kernel() + WhiteKernel()

    pipe = Pipeline(steps=[(alg, model())])
    param_dict = {}
    if hasattr(model(), 'n_estimators'):
        param_dict[alg + '__n_estimators'] = [5]
    if hasattr(model(), 'kernel'):
        param_dict[alg + '__kernel'] = [kernel]
    param_dict[alg + '__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )

    np.random.seed(10)
    estimator.fit(X=1 + np.random.rand(10, 3), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -15.0
def test_svr_pipeline(get_transform, get_svr_kernel):
    trans = get_transform()
    pipe = Pipeline(steps=[('svr', svr())])
    param_dict = {'svr__kernel': [get_svr_kernel]}
    param_dict['svr__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
    np.random.seed(1)
    estimator.fit(X=1 + np.random.rand(10, 5), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -10.0
def test_krige_pipeline(get_krige_method, get_variogram_model):
    pipe = Pipeline(steps=[('krige', Krige(method=get_krige_method))])
    param_dict = {'krige__variogram_model': [get_variogram_model]}

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True
                             )
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 2)).astype(float)
    y = 5 * np.random.rand(20)
    estimator.fit(X=X, y=y)
    assert estimator.cv_results_['mean_train_score'][0] > -1.0
def test_cv():
    """Simple CV check."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    cv = KFold(X.shape[0], 5)

    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
    # check that it returns 5 scores
    scores = cross_val_score(glm_normal, X, y, cv=cv)
    assert_equal(len(scores), 5)

    param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
                  {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), 10,
                                             base=np.exp(1))}]
    glmcv = GridSearchCV(glm_normal, param_grid, cv=cv)
    glmcv.fit(X, y)
def setBestParameters(self):
    cv = StratifiedKFold(n_splits=self.conf.num_folds)
    param_grid = self.conf.getParamGrid()
    if param_grid is None:
        # No parameter value to select
        return
    if self.conf.families_supervision:
        scoring = 'f1_macro'
    else:
        scoring = 'roc_auc'
    grid_search = GridSearchCV(self.pipeline, param_grid=param_grid,
                               scoring=scoring, cv=cv, n_jobs=-1,
                               fit_params={'model__sample_weight': self.datasets.sample_weight})
    grid_search.fit(self.datasets.train_instances.getFeatures(),
                    self.getSupervision(self.datasets.train_instances))
    self.conf.setBestValues(grid_search)
    self.pipeline.set_params(**self.conf.getBestValues())
    return cv
def xgb_model_select(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def gbdt_select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [100, 120], 'max_depth': [4, 5, 6]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [10000, 12000], 'max_depth': [16, 15, 14]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def xgb_model_select(train_file_name):
    train_df = merge_features_to_use(train_file_name)
    train_df.drop(['conversionTime'], axis=1, inplace=True)

    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()

    train_np = train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140],
                  'max_depth': [3, 5, 7, 9],
                  'gamma': [0.1, 0.3, 0.5, 0.7],
                  'min_child_weight': [1, 3, 5, 7],
                  }
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def grid(X, y):
    '''
    Adapted from:
    http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py

    Perform a grid search.
    '''
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=8)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def fit(self, df, y, param_grid=None):
    from sklearn.model_selection import GridSearchCV

    X = df.drop(y, axis=1).values
    y = df[y].values
    meta_X = self.get_meta(X)

    if param_grid is not None:
        model = self.stacked_model_class()
        gridsearch = GridSearchCV(model, param_grid)
        gridsearch.fit(meta_X, y)
        self.stacked_model = self.stacked_model_class(**gridsearch.best_params_)
    else:
        self.stacked_model = self.stacked_model_class()

    self.stacked_model.fit(meta_X, y)
def grid_search_cv(clf, x, y, params, cv=5):
    """
    :param clf: The classifier over which we want to perform gridsearch.
    :param x: Features
    :param y: Target
    :param params: Hyperparameters to perform gs on
    :cv: kfold cv parameter
    """
    gs = GridSearchCV(clf, param_grid=params, cv=cv)
    gs.fit(x, y)
    print
    print 'BEST PARAMS:', gs.best_params_
    print 'BEST SCORE:', gs.best_score_
    print
    best_estimator = gs.best_estimator_
    return best_estimator

######################
# PREPARING THE DATA #
######################

# get the last 4 images from each file
def LogisticRegression(X_train, y_train):
    from sklearn.linear_model import LogisticRegression
    parameters = {
        'C': [0.6, 0.8, 1.0, 1.2],
        'class_weight': [None, 'balanced'],
    }
    LR = LogisticRegression()
    grid_search = GridSearchCV(estimator=LR, param_grid=parameters, cv=5,
                               scoring='neg_log_loss', n_jobs=4)

    now = datetime.datetime.now()
    print("logistic regression grid_search start in " + now.strftime('%Y-%m-%d %H:%M:%S'))
    grid_search.fit(X_train, y_train)
    print("logistic regression grid_search done in " + now.strftime('%Y-%m-%d %H:%M:%S'))

    results = grid_search.grid_scores_
    for result in results:
        print(result)

    print("\nBest score: %0.3f\n" % grid_search.best_score_)
    print("---------best parameters---------")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
def build_grid_search(X, y):
    parameters = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [10, 15, 20, 25, None],
        "estimator__max_features": ['auto', 'sqrt', 'log2', None]
    }
    ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                                     oob_score=True,
                                                     n_jobs=-1,
                                                     verbose=1))
    model_tunning = GridSearchCV(ovr, param_grid=parameters, verbose=1,
                                 n_jobs=-1, cv=10,
                                 scoring=make_scorer(f1_score))
    model_tunning.fit(X, y)

    test_score = model_tunning.best_score_
    print 'The best test score: ', test_score
    y_score = model_tunning.predict_proba(X_test)
    multiclass_roc(y_score, 'grid_search_02')
    return model_tunning
def clean_params_for_sk(params: dict) -> dict:
    """
    Given a dictionary of XGB parameters, return a copy without parameters that will cause
    issues with scikit-learn's grid or randomized search estimators.

    :param params: A dictionary of XGB parameters.
    :return: A copy of the same dictionary without the aforementioned problematic parameters.
    """
    # In the xgb.cv call, nthread should be equal to the CPU count, but this causes a hang when
    # called through GridSearchCV - parallelism should be achieved through its n_jobs parameter.
    # See https://github.com/scikit-learn/scikit-learn/issues/6627 for more details.
    params_copy = params.copy()
    params_copy['nthread'] = 1

    # In multiclass problems, this parameter is required for XGBoost, but is not a parameter
    # of interest to be tuned.
    if 'num_class' in params_copy.keys():
        del params_copy['num_class']

    return params_copy
def fit(self, X, *args, **kwargs):
    if self._grid_search:
        model = GridSearchCV(self._model, **self._grid_search)
    elif self._random_search:
        model = RandomizedSearchCV(self._model, **self._random_search)
    else:
        model = self._model

    if self._grid_search is not None:
        self._grid = model
    elif self._random_search is not None:
        self._rnd = model

    assert (self.target in X.columns.values), 'X must contain the target column'
    self._xcols = list(X.columns.values)
    self._xcols.remove(self.target)
    if len(self._columns_exclude) == 0 and len(self._columns_include) > 0:
        self._columns_exclude = list(set(self._xcols) - set(self._columns_include))
    [self._xcols.remove(t) for t in self._columns_exclude]

    x = X[self._xcols]
    y = X[self.target]
    model.fit(x, y, **kwargs)
    return self
def gs_numpy(method, X, Y, alphas_log=(-1, 1, 9), n_splits=5, n_jobs=-1, disp=True):
    """
    Grid search method with numpy array of X and Y
    Previously, np.mat are used for compatible with Matlab notation.
    """
    if disp:
        print(X.shape, Y.shape)

    clf = getattr(linear_model, method)()
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf5 = kf5_c.split(X)

    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, Y)
    return gs
def gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1):
    """
    gs = gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1)

    Inputs
    ======
    classifier = svm.SVC(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    # print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(classifier, params, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(xM, yVc)
    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_splits=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.

    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf_n = kf5_ext_c.split(A_list[0])
    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_BIKE_Ridge(A_list, yV, alphas_log=(1, -1, 9), X_concat=None, n_splits=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge(A_list, X_concat)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.

    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf_n = kf5_ext_c.split(A_list[0])
    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model = svm.SVC(), or linear_model.LinearRegression(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    # print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    gs = model_selection.GridSearchCV(model, param_grid, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, y)

    if graph:
        plt.plot(gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot(gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()

    return gs
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_folds=5, n_jobs=-1):
    print(xM.shape, yV.shape)

    clf = linear_model.Lasso()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    # model_selection.KFold takes n_splits (not n_folds)
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf5, n_jobs=n_jobs)
    gs.fit(xM, yV)
    return gs
def _gs_SVC_r0(xM, yVc, params):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5_c = model_selection.KFold(n_splits=5, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)
    gs.fit(xM, yVc)
    return gs
def gs_SVC(xM, yVc, params, n_folds=5):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)
    gs.fit(xM, yVc)
    return gs
def gs_Ridge(xM, yV, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1, scoring='r2'):
    """
    Parameters
    -------------
    scoring: mean_absolute_error, mean_squared_error, median_absolute_error, r2
    """
    print('If scoring is not r2 but an error metric, the output score sign is reversed for scoring!')
    print(xM.shape, yV.shape)

    clf = linear_model.Ridge()
    # parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring=scoring, cv=kf_n, n_jobs=n_jobs)
    gs.fit(xM, yV)
    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.

    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(A_list)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf_n, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def train_logistic():
    df = pd.read_csv(config.activations_path)
    df, y, classes = encode(df)
    X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.2, random_state=17)

    params = {'C': [10, 2, .9, .4, .1],
              'tol': [0.0001, 0.001, 0.0005]}
    log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
    clf = GridSearchCV(log_reg, params, scoring='neg_log_loss', refit=True, cv=3, n_jobs=-1)
    clf.fit(X_train, y_train)

    print("best params: " + str(clf.best_params_))
    print("Accuracy: ", accuracy_score(y_test, clf.predict(X_test)))

    setattr(clf, '__classes', classes)
    # save results for further using
    joblib.dump(clf, config.get_novelty_detection_model_path())
def perform():
    # Create a new grid search classifier from a sci-kit pipeline
    model = GridSearchCV(pipeline(), gs_clf_params(), n_jobs=-1)

    # Get your training and testing sets of data with 50/50 split
    (train_data, train_targets), (test_data, test_targets) = dp.get_data()

    # Train your model
    model = model.fit(train_data, train_targets)

    # Test its accuracy
    predictions = model.predict(test_data)

    # Display the model's accuracy
    print "\nModel Accuracy: {}\n".format(np.mean(predictions == test_targets))

    # Save the trained model to disk
    save_model(model)
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
def test_grid_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    grid_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(grid_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(grid_search.grid_scores_[i][0] == {'foo_param': foo_i})

    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)

    # Test exception handling on scoring
    grid_search.scoring = 'sklearn'
    assert_raises(ValueError, grid_search.fit, X, y)
def test_grid_search_labels():
    # Check if ValueError (when labels is None) propagates to GridSearchCV
    # And also check if labels is correctly passed to the cv object
    rng = np.random.RandomState(0)
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for cv in label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             gs.fit, X, y)
        gs.fit(X, y, labels)

    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_true(np.mean(y_pred == y_pred2) >= .9)
    assert_equal(C, C2)
def test_pandas_input():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((DataFrame, Series))
    except ImportError:
        pass

    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    for InputFeatureType, TargetType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)

        grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
        grid_search.fit(X_df, y_ser).score(X_df, y_ser)
        grid_search.predict(X_df)
        assert_true(hasattr(grid_search, "grid_scores_"))
def test_ridgecv_sample_weight():
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so test them both.
    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        cv = KFold(5)
        ridgecv = RidgeCV(alphas=alphas, cv=cv)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Check using GridSearchCV directly
        parameters = {'alpha': alphas}
        fit_params = {'sample_weight': sample_weight}
        gs = GridSearchCV(Ridge(), parameters, fit_params=fit_params, cv=cv)
        gs.fit(X, y)

        assert_equal(ridgecv.alpha_, gs.best_estimator_.alpha)
        assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_)
def print_training_summary(self, gs):
    print('The best CV score from GridSearchCV (by default averaging across k-fold CV) for ' + self.output_column + ' is:')
    if self.took_log_of_y:
        print('    Note that this score is calculated using the natural logs of the y values.')
    print(gs.best_score_)
    print('The best params were')

    # Remove 'final_model__model' from what we print - it's redundant with model name,
    # and is difficult to read quickly in a list since it's a python object.
    if 'model' in gs.best_params_:
        printing_copy = {}
        for k, v in gs.best_params_.items():
            if k != 'model':
                printing_copy[k] = v
            else:
                printing_copy[k] = utils_models.get_name_from_model(v)
    else:
        printing_copy = gs.best_params_

    print(printing_copy)

    if self.verbose:
        print('Here are all the hyperparameters that were tried:')
        raw_scores = gs.grid_scores_
        sorted_scores = sorted(raw_scores, key=lambda x: x[1], reverse=True)
        for score in sorted_scores:
            for k, v in score[0].items():
                if k == 'model':
                    score[0][k] = utils_models.get_name_from_model(v)
            print(score)
def test_model_assessment():
    X, y = make_classification(n_samples=40, n_features=100, n_informative=2,
                               n_classes=2, n_redundant=0)
    pipe = Pipeline([('enet', ElasticNetFeatureSelection()),
                     ('ridge', RidgeClassifier())])
    ma = ModelAssessment(GridSearchCV(pipe, {'enet__l1_ratio': [2]})).fit(X, y)
    assert len(ma.cv_results_) == 0
def _get_best_params(obj):
    # if obj is a ModelAssessment, then get the first GridSearch
    if isinstance(obj, ModelAssessment):
        obj = pd.DataFrame(obj.cv_results_).sort_values(
            'test_score', ascending=False).iloc[0].estimator
    elif not isinstance(obj, GridSearchCV):
        raise NotImplementedError("This can only work with a ModelAssessment "
                                  "or GridSearchCV object. You passed "
                                  "a %s object" % obj.__class__.__name__)
    return obj.best_params_
def cv_results_(self):
    """Get GridSearchCV results."""
    check_is_fitted(self, 'gs_')
    return self.gs_.cv_results_
def best_params_(self):
    """Get GridSearchCV best_params."""
    check_is_fitted(self, 'gs_')
    return self.gs_.best_params_
def tune_n_estimators_cv(estimator, params, X_train, Y_train):
    grid_search = GridSearchCV(estimator, param_grid=params, scoring='roc_auc',
                               n_jobs=-1, cv=10, verbose=10)
    grid_search.fit(X_train, Y_train)
    return grid_search.best_params_