Python sklearn.model_selection module: GridSearchCV() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.model_selection.GridSearchCV().
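Before the project excerpts, here is a minimal, self-contained sketch of the typical GridSearchCV workflow; the estimator and parameter grid are illustrative and not taken from any of the projects below:

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Exhaustively search the grid with 5-fold cross-validation; refit=True (the
# default) retrains the best parameter combination on all of X_train afterwards.
param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
gs = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

print(gs.best_params_)           # best parameter combination found
print(gs.best_score_)            # mean CV score of that combination
print(gs.score(X_test, y_test))  # held-out score of the refit best model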

Project: texta    Author: texta-tk
def train_model_with_cv(model, params, X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # Use Train data to parameter selection in a Grid Search
    gs_clf = GridSearchCV(model, params, n_jobs=1, cv=5)
    gs_clf = gs_clf.fit(X_train, y_train)
    model = gs_clf.best_estimator_

    # Use best model and test data for final evaluation
    y_pred = model.predict(X_test)

    _f1 = f1_score(y_test, y_pred, average='micro')
    _confusion = confusion_matrix(y_test, y_pred)
    _precision = precision_score(y_test, y_pred)
    _recall = recall_score(y_test, y_pred)
    _statistics = {'f1_score': _f1,
                   'confusion_matrix': _confusion,
                   'precision': _precision,
                   'recall': _recall
                   }

    return model, _statistics
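A hypothetical call, assuming a feature matrix X and a binary label vector y are already loaded (precision_score and recall_score use their default average='binary' here, so the labels must be binary):

from sklearn.linear_model import LogisticRegression

params = {'C': [0.1, 1.0, 10.0]}
model, stats = train_model_with_cv(LogisticRegression(max_iter=1000), params, X, y)
print(stats['f1_score'], stats['precision'], stats['recall'])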
Project: palladio    Author: slipguru
def fit(self, X, y=None):
        """Fitting function on the data."""
        if self.data_normalizer is not None:
            X = self.normalize_data(X)

        if self.label_normalizer is not None:
            y = self.normalize_label(y)

        if self.force_classifier:
            clf = make_classifier(self.learner, params=self.learner_options)
        elif callable(self.learner):
            # self.learner = type(self.learner)
            clf = self.learner(**self.learner_options)
        else:
            clf = self.learner

        self.gs_ = GridSearchCV(estimator=clf, **self.cv_options)
        self.gs_.fit(X, y)
Project: probablyPOTUS    Author: jjardel
def train(self, train_size=0.8, k_folds=5):

        # retrieve data from DB and pre-process
        self._get_data()

        # perform train/test split
        self._get_train_test_split(train_size=train_size)

        # define text pre-processing pipeline
        text_pipeline = Pipeline([
            ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
            ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
        ])

        # define pipeline for pre-processing of numeric features
        numeric_pipeline = Pipeline([
            ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
            ('scaler', MinMaxScaler())
        ])

        # combine both steps into a single pipeline
        pipeline = Pipeline([
            ('features', FeatureUnion([
                ('text_processing', text_pipeline),
                ('num_processing', numeric_pipeline)
            ])),
            ('clf', self._estimator)
        ])

        self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
        gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

        X = self.data.iloc[self.train_inds_, :]
        y = self.data[LABEL].values[self.train_inds_]

        gs.fit(X, y)

        self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

        self.gs_ = gs
        self.model_ = gs.best_estimator_
Project: skutil    Author: tgsmith61591
def fit(self, X, y=None, groups=None):
            """Run fit with all sets of parameters.

            Parameters
            ----------

            X : array-like, shape=(n_samples, n_features)
                Training vector, where n_samples is the number of samples and
                n_features is the number of features.

            y : array-like, shape=(n_samples,) or (n_samples, n_output), optional (default=None)
                Target relative to X for classification or regression;
                None for unsupervised learning.

            groups : array-like, shape=(n_samples,), optional (default=None)
                Group labels for the samples used while splitting the dataset into
                train/test set.
            """
            return super(GridSearchCV, self).fit(X, _as_numpy(y), groups)
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=4, cv=5):
    global num_boost_round
    global params_sklearn

    for param_untuned in params_untuned:
        print('==========  ', param_untuned, '  ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
            if len(params_untuned) == 1:
                return v
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_xgb_cv(params_untuned, scoring='roc_auc', n_jobs=1, cv=5):
    global num_boost_round
    global params_sklearn

    for param_untuned in params_untuned:
        print('==========  ', param_untuned, '  ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=10)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_classifier(estimator, params, X_train, Y_train, scoring='roc_auc', n_jobs=3, cv=5):
    results = []
    for k, values in params.items():
        params_single = {k: values}  # one-parameter grid for this key (dict(k=values) would literally use the key 'k')
        print('==========  ', params_single, '  ==============')
        grid_search = GridSearchCV(estimator, param_grid=params_single, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=5)
        grid_search.fit(X_train, Y_train)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        results.append(grid_search.best_params_)
    return results
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_xgb_cv(params_untuned, params_sklearn, scoring='roc_auc', n_jobs=4, cv=5, verbose=10):

    for param_untuned in params_untuned:
        print('==========  ', param_untuned, '  ==============')
        print_params(params_sklearn)
        estimator = xgb.XGBClassifier(**params_sklearn)
        grid_search = GridSearchCV(estimator, param_grid=param_untuned, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=verbose)
        grid_search.fit(x, y)
        df = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_train_score', 'mean_test_score']]
        print(df)
        print('the best_params : ', grid_search.best_params_)
        print('the best_score  : ', grid_search.best_score_)
        for k, v in grid_search.best_params_.items():
            params_sklearn[k] = v
    return estimator, params_sklearn
Project: uncover-ml    Author: GeoscienceAustralia
def test_pipeline(get_models, get_transform, get_kernel):

    alg, model = get_models
    trans = get_transform()
    kernel = get_kernel() + WhiteKernel()

    pipe = Pipeline(steps=[(alg, model())])
    param_dict = {}
    if hasattr(model(), 'n_estimators'):
        param_dict[alg + '__n_estimators'] = [5]
    if hasattr(model(), 'kernel'):
        param_dict[alg + '__kernel'] = [kernel]
    param_dict[alg + '__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
    np.random.seed(10)
    estimator.fit(X=1 + np.random.rand(10, 3), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -15.0
Project: uncover-ml    Author: GeoscienceAustralia
def test_svr_pipeline(get_transform, get_svr_kernel):
    trans = get_transform()
    pipe = Pipeline(steps=[('svr', svr())])
    param_dict = {'svr__kernel': [get_svr_kernel]}
    param_dict['svr__target_transform'] = [trans]

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True,
                             )
    np.random.seed(1)
    estimator.fit(X=1 + np.random.rand(10, 5), y=1. + np.random.rand(10))
    assert estimator.cv_results_['mean_train_score'][0] > -10.0
Project: uncover-ml    Author: GeoscienceAustralia
def test_krige_pipeline(get_krige_method, get_variogram_model):
    pipe = Pipeline(steps=[('krige', Krige(method=get_krige_method))])
    param_dict = {'krige__variogram_model': [get_variogram_model]}

    estimator = GridSearchCV(pipe,
                             param_dict,
                             n_jobs=1,
                             iid=False,
                             pre_dispatch=2,
                             verbose=True
                            )
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 2)).astype(float)
    y = 5*np.random.rand(20)
    estimator.fit(X=X, y=y)
    assert estimator.cv_results_['mean_train_score'][0] > -1.0
Project: pyglmnet    Author: glm-tools
def test_cv():
    """Simple CV check."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    cv = KFold(n_splits=5)

    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
    # check that it returns 5 scores
    scores = cross_val_score(glm_normal, X, y, cv=cv)
    assert_equal(len(scores), 5)

    param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
                  {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
                                             10, base=np.exp(1))}]
    glmcv = GridSearchCV(glm_normal, param_grid, cv=cv)
    glmcv.fit(X, y)
Project: SecuML    Author: ANSSI-FR
def setBestParameters(self):
        cv = StratifiedKFold(n_splits = self.conf.num_folds)
        param_grid = self.conf.getParamGrid()
        if param_grid is None:
            # No parameter value to select
            return
        if self.conf.families_supervision:
            scoring = 'f1_macro'
        else:
            scoring = 'roc_auc'
        grid_search = GridSearchCV(self.pipeline, param_grid = param_grid,
                scoring = scoring,
                cv = cv,
                n_jobs = -1,
                fit_params = {'model__sample_weight': self.datasets.sample_weight})
        grid_search.fit(self.datasets.train_instances.getFeatures(),
                self.getSupervision(self.datasets.train_instances))
        self.conf.setBestValues(grid_search)
        self.pipeline.set_params(**self.conf.getBestValues())
        return cv
Project: tpai_comp    Author: luuuyi
def xgb_model_select(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print('Select Model...')
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
Project: tpai_comp    Author: luuuyi
def gbdt_select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print('Select Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [100, 120], 'max_depth': [4, 5, 6]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
Project: tpai_comp    Author: luuuyi
def select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print('Select Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [10000, 12000], 'max_depth': [16, 15, 14]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
Project: tpai_comp    Author: luuuyi
def xgb_model_select(train_file_name):
    train_df = merge_features_to_use(train_file_name)
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print('Train And Fix Missing App Count Value...')
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    print(train_df.info())
    print(train_df.describe())
    print(train_df.isnull().sum())
    train_np = train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print('Select Model...')
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9], 'gamma': [0.1, 0.3, 0.5, 0.7], 'min_child_weight': [1, 3, 5, 7]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
Project: LSAT    Author: BillVanderLugt
def grid(X, y):
    '''
    Adapted from: http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py
    Perform a grid search.
    '''

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=8)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Project: eezzy    Author: 3Blades
def fit(self, df, y, param_grid=None):
        from sklearn.model_selection import GridSearchCV
        X = df.drop(y, axis=1).values
        y = df[y].values

        meta_X = self.get_meta(X)

        if param_grid is not None:
            model = self.stacked_model_class()
            gridsearch = GridSearchCV(model, param_grid)
            gridsearch.fit(meta_X, y)
            self.stacked_model = self.stacked_model_class(**gridsearch.best_params_)
        else:
            self.stacked_model = self.stacked_model_class()

        self.stacked_model.fit(meta_X, y)
Project: Emotion-Recognition    Author: HashCode55
def grid_search_cv(clf, x, y, params, cv = 5):
    """
    :param clf: The classifier over which we want to perform 
    gridsearch.
    :param x: Features 
    :param y: Target
    :param params: Hyperparameters to perform gs on
    :cv: kfold cv parameter
    """
    gs = GridSearchCV(clf, param_grid = params, cv = cv)
    gs.fit(x, y)
    print()
    print('BEST PARAMS:', gs.best_params_)
    print('BEST SCORE:', gs.best_score_)
    print()
    best_estimator = gs.best_estimator_
    return best_estimator

######################
# PREPARING THE DATA #
######################

#get the last 4 images from each file
Project: Quora-Kaggle    Author: PPshrimpGo
def LogisticRegression(X_train, y_train):
    from sklearn.linear_model import LogisticRegression
    parameters = {
        'C':[0.6, 0.8, 1.0, 1.2],
        'class_weight':[None, 'balanced'],
    }

    LR = LogisticRegression()
    grid_search = GridSearchCV(estimator=LR, param_grid=parameters, cv=5, scoring='neg_log_loss',n_jobs=4)

    now = datetime.datetime.now()
    print("logistic regression grid_search start in " + now.strftime('%Y-%m-%d %H:%M:%S'))

    grid_search.fit(X_train, y_train)
    now = datetime.datetime.now()
    print("logistic regression grid_search done in " + now.strftime('%Y-%m-%d %H:%M:%S'))

    results = grid_search.grid_scores_
    for result in results:
        print(result)
    print("\nBest score: %0.3f\n" % grid_search.best_score_)
    print ("---------best parameters---------")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print ("%s: %r" % (param_name, best_parameters[param_name]))
Project: crime_prediction    Author: livenb
def build_grid_search(X, y):
    parameters = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [10, 15, 20, 25, None],
        "estimator__max_features": ['auto', 'sqrt', 'log2', None]
    }
    ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                    oob_score=True, n_jobs=-1, verbose=1))
    model_tunning = GridSearchCV(ovr, param_grid=parameters, verbose=1,
                                 n_jobs=-1, cv=10,
                                 scoring=make_scorer(f1_score))
    model_tunning.fit(X, y)
    test_score = model_tunning.best_score_
    print('The best test score: ', test_score)
    y_score = model_tunning.predict_proba(X_test)
    multiclass_roc(y_score, 'grid_search_02')
    return model_tunning
Project: xgboost-tuner    Author: cwerner87
def clean_params_for_sk(params: dict) -> dict:
    """
    Given a dictionary of XGB parameters, return a copy without parameters that will cause issues with scikit-learn's grid or
    randomized search estimators.

    :param params:
        A dictionary of XGB parameters.
    :return: 
        A copy of the same dictionary without the aforementioned problematic parameters.
    """
    # In the xgb.cv call, nthread should be equal to the CPU count, but this causes a hang when
    # called through GridSearchCV - parallelism should be achieved through its n_jobs parameter.
    # See https://github.com/scikit-learn/scikit-learn/issues/6627 for more details.
    params_copy = params.copy()
    params_copy['nthread'] = 1

    # In multiclass problems, this parameter is required for XGBoost, but is not a parameter of interest to be tuned.
    if 'num_class' in params_copy.keys():
        del params_copy['num_class']

    return params_copy
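A quick usage sketch of clean_params_for_sk; the parameter values below are invented for illustration:

params = {'nthread': 8, 'max_depth': 6, 'num_class': 3}
sk_params = clean_params_for_sk(params)
# sk_params == {'nthread': 1, 'max_depth': 6}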
Project: Bacchus    Author: surfstudio
def fit(self, X, *args, **kwargs):
        if self._grid_search:
            model = GridSearchCV(self._model, **self._grid_search)
        elif self._random_search:
            model = RandomizedSearchCV(self._model, **self._random_search)
        else:
            model = self._model

        if self._grid_search is not None:
            self._grid = model
        elif self._random_search is not None:
            self._rnd = model

        assert (self.target in X.columns.values), 'X must contain the target column'
        self._xcols = list(X.columns.values)
        self._xcols.remove(self.target)
        if len(self._columns_exclude) == 0 and len(self._columns_include) > 0:
            self._columns_exclude = list(set(self._xcols) - set(self._columns_include))
        [self._xcols.remove(t) for t in self._columns_exclude]
        x = X[self._xcols]
        y = X[self.target]
        model.fit(x, y, **kwargs)
        return self
Project: jamespy_py3    Author: jskDr
def gs_numpy( method, X, Y, alphas_log = (-1, 1, 9), n_splits=5, n_jobs = -1, disp = True):
    """
    Grid search method with numpy array of X and Y
    Previously, np.mat are used for compatible with Matlab notation.    
    """
    if disp:
        print( X.shape, Y.shape)

    clf = getattr( linear_model, method)()
    parmas = {'alpha': np.logspace( *alphas_log)}
    kf5_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf5 = kf5_c.split( X)
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf5_c, n_jobs = n_jobs)

    gs.fit( X, Y)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_classfier( classifier, xM, yVc, params, n_splits=5, n_jobs=-1):
    """
    gs = gs_classfier( classifier, xM, yVc, params, n_splits=5, n_jobs=-1)

    Inputs
    ======
    classifier = svm.SVC(), for example

    param = {"C": np.logspace(-2,2,5)}
    """
    #print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold( n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV( classifier, params, cv=kf5_c, n_jobs=n_jobs)
    gs.fit( xM, yVc)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_splits = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge( A_list, XX)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0] # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf_n = kf5_ext_c.split( A_list[0])
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n_c, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_BIKE_Ridge( A_list, yV, alphas_log = (1, -1, 9), X_concat = None, n_splits = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge( A_list, X_concat)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0] # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf_n = kf5_ext_c.split( A_list[0])
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n_c, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_numpy( method, X, Y, alphas_log = (-1, 1, 9), n_splits=5, n_jobs = -1, disp = True):
    """
    Grid search method with numpy array of X and Y
    Previously, np.mat are used for compatible with Matlab notation.    
    """
    if disp:
        print( X.shape, Y.shape)

    clf = getattr( linear_model, method)()
    parmas = {'alpha': np.logspace( *alphas_log)}
    kf5_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf5 = kf5_c.split( X)
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf5_c, n_jobs = n_jobs)

    gs.fit( X, Y)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_param( model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param( model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model = svm.SVC(), or linear_model.LinearRegression(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    #print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
    gs = model_selection.GridSearchCV( model, param_grid, cv=kf5_c, n_jobs=n_jobs)
    gs.fit( X, y)

    if graph:
        plt.plot( gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot( gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()

    return gs
Project: jamespy_py3    Author: jskDr
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_splits = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge( A_list, XX)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0] # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf_n = kf5_ext_c.split( A_list[0])
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n_c, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_folds=5, n_jobs=-1):

    print(xM.shape, yV.shape)

    clf = linear_model.Lasso()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf5, n_jobs=n_jobs)

    gs.fit(xM, yV)

    return gs
Project: jamespy_py3    Author: jskDr
def _gs_SVC_r0(xM, yVc, params):
    """
    Since classification is considered, we use yVc which includes digital values 
    whereas yV can include float point values.
    """

    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5_c = model_selection.KFold(n_splits=5, shuffle=True)
    kf5 = kf5_c.split(xM)
    gs = model_selection.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)

    gs.fit(xM, yVc)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_SVC(xM, yVc, params, n_folds=5):
    """
    Since classification is considered, we use yVc which includes digital values 
    whereas yV can include float point values.
    """

    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)
    gs = model_selection.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)

    gs.fit(xM, yVc)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_Ridge(xM, yV, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1, scoring='r2'):
    """
    Parameters
    -------------
    scoring: mean_absolute_error, mean_squared_error, median_absolute_error, r2
    """
    print('If scoring is not r2 but an error metric, the output score is sign-reversed!')
    print(xM.shape, yV.shape)

    clf = linear_model.Ridge()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)
    gs = model_selection.GridSearchCV(
        clf, parmas, scoring=scoring, cv=kf_n, n_jobs=n_jobs)

    gs.fit(xM, yV)

    return gs
Project: jamespy_py3    Author: jskDr
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge(A_list, XX)
    parmas = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules.

    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(A_list)
    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf_n, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)

    return gs
Project: keras-transfer-learning-for-oxford102    Author: Arsey
def train_logistic():
    df = pd.read_csv(config.activations_path)
    df, y, classes = encode(df)

    X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.2, random_state=17)

    params = {'C': [10, 2, .9, .4, .1], 'tol': [0.0001, 0.001, 0.0005]}
    log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
    clf = GridSearchCV(log_reg, params, scoring='neg_log_loss', refit=True, cv=3, n_jobs=-1)
    clf.fit(X_train, y_train)

    print("best params: " + str(clf.best_params_))
    print("Accuracy: ", accuracy_score(y_test, clf.predict(X_test)))

    setattr(clf, '__classes', classes)
    # save results for further using
    joblib.dump(clf, config.get_novelty_detection_model_path())
Project: jarvis    Author: whittlbc
def perform():
    # Create a new grid search classifier from a sci-kit pipeline
    model = GridSearchCV(pipeline(), gs_clf_params(), n_jobs=-1)

    # Get your training and testing sets of data with 50/50 split
    (train_data, train_targets), (test_data, test_targets) = dp.get_data()

    # Train your model
    model = model.fit(train_data, train_targets)

    # Test its accuracy
    predictions = model.predict(test_data)

    # Display the model's accuracy
    print("\nModel Accuracy: {}\n".format(np.mean(predictions == test_targets)))

    # Save the trained model to disk
    save_model(model)
Project: Parallel-SGD    Author: angadgill
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
Project: Parallel-SGD    Author: angadgill
def test_grid_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    grid_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(grid_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(grid_search.grid_scores_[i][0]
                    == {'foo_param': foo_i})
    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)

    # Test exception handling on scoring
    grid_search.scoring = 'sklearn'
    assert_raises(ValueError, grid_search.fit, X, y)
Project: Parallel-SGD    Author: angadgill
def test_grid_search_labels():
    # Check if ValueError (when labels is None) propagates to GridSearchCV
    # And also check if labels is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for cv in label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             gs.fit, X, y)
        gs.fit(X, y, labels)

    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Project: Parallel-SGD    Author: angadgill
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_true(np.mean(y_pred == y_pred2) >= .9)
    assert_equal(C, C2)
Project: Parallel-SGD    Author: angadgill
def test_pandas_input():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((DataFrame, Series))
    except ImportError:
        pass

    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    for InputFeatureType, TargetType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)

        grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]})
        grid_search.fit(X_df, y_ser).score(X_df, y_ser)
        grid_search.predict(X_df)
        assert_true(hasattr(grid_search, "grid_scores_"))
Project: Parallel-SGD    Author: angadgill
def test_ridgecv_sample_weight():
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so test them both.
    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        cv = KFold(5)
        ridgecv = RidgeCV(alphas=alphas, cv=cv)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Check using GridSearchCV directly
        parameters = {'alpha': alphas}
        fit_params = {'sample_weight': sample_weight}
        gs = GridSearchCV(Ridge(), parameters, fit_params=fit_params,
                          cv=cv)
        gs.fit(X, y)

        assert_equal(ridgecv.alpha_, gs.best_estimator_.alpha)
        assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_)
Project: auto_ml    Author: doordash
def print_training_summary(self, gs):
        print('The best CV score from GridSearchCV (by default averaging across k-fold CV) for ' + self.output_column + ' is:')
        if self.took_log_of_y:
            print('    Note that this score is calculated using the natural logs of the y values.')
        print(gs.best_score_)
        print('The best params were')

        # Remove 'final_model__model' from what we print- it's redundant with model name, and is difficult to read quickly in a list since it's a python object.
        if 'model' in gs.best_params_:
            printing_copy = {}
            for k, v in gs.best_params_.items():
                if k != 'model':
                    printing_copy[k] = v
                else:
                    printing_copy[k] = utils_models.get_name_from_model(v)
        else:
            printing_copy = gs.best_params_

        print(printing_copy)

        if self.verbose:
            print('Here are all the hyperparameters that were tried:')
            raw_scores = gs.grid_scores_
            sorted_scores = sorted(raw_scores, key=lambda x: x[1], reverse=True)
            for score in sorted_scores:
                for k, v in score[0].items():
                    if k == 'model':
                        score[0][k] = utils_models.get_name_from_model(v)
                print(score)
Project: palladio    Author: slipguru
def test_model_assessment():
    X, y = make_classification(n_samples=40, n_features=100, n_informative=2,
                               n_classes=2, n_redundant=0)
    pipe = Pipeline([('enet', ElasticNetFeatureSelection()),
                     ('ridge', RidgeClassifier())])

    ma = ModelAssessment(GridSearchCV(pipe, {'enet__l1_ratio': [2]})).fit(X, y)
    assert len(ma.cv_results_) == 0
Project: palladio    Author: slipguru
def _get_best_params(obj):
    # if obj is a ModelAssessment, then get the first GridSearch
    if isinstance(obj, ModelAssessment):
        obj = pd.DataFrame(obj.cv_results_).sort_values(
            'test_score', ascending=False).iloc[0].estimator
    elif not isinstance(obj, GridSearchCV):
        raise NotImplementedError("This can only work with a ModelAssessment "
                                  "or GridSearchCV object. You passed "
                                  "a %s object" % obj.__class__.__name__)

    return obj.best_params_
Project: palladio    Author: slipguru
def cv_results_(self):
        """Get GridSearchCV results."""
        check_is_fitted(self, 'gs_')
        return self.gs_.cv_results_
Project: palladio    Author: slipguru
def best_params_(self):
        """Get GridSearchCV best_params."""
        check_is_fitted(self, 'gs_')
        return self.gs_.best_params_
Project: trend_ml_toolkit_xgboost    Author: raymon-tian
def tune_n_estimators_cv(estimator,params,X_train,Y_train):
    grid_search = GridSearchCV(estimator, param_grid=params, scoring='roc_auc', n_jobs=-1, cv=10, verbose=10)
    grid_search.fit(X_train, Y_train)
    return grid_search.best_params_