def learn_decision_tree(data):
    """Cross-validate a depth-7 decision tree and return the estimator.

    Runs 10-fold cross-validation on ``data.X_train`` / ``data.y_train`` five
    times, scored with the Matthews correlation coefficient. The mean of each
    round is printed, followed by the individual fold scores of the last round.

    Args:
        data: Object exposing ``X_train`` and ``y_train`` attributes.

    Returns:
        The ``DecisionTreeClassifier`` that was evaluated. This function never
        calls ``fit`` on it directly.
    """
    classifier = tree.DecisionTreeClassifier(max_depth=7)
    mcc_scorer = make_scorer(matthews_corrcoef)

    for round_no in range(5):
        fold_scores = cross_val_score(
            classifier, data.X_train, data.y_train, cv=10, scoring=mcc_scorer
        )
        print("iteration", round_no, "dt mean:", fold_scores.mean())

    print("Decision Tree train scores:\n", list(fold_scores))
    return classifier
def _sfn(l, mask, myrad, bcast_var):
    """Score a classifier on searchlight data using cross-validation.

    Args:
        l: Sequence of data arrays; only ``l[0]`` is used. It is indexed as
            ``l[0][mask, :]`` and transposed before being scored.
        mask: Index applied to the first axis of ``l[0]``.
        myrad: Unused by this function.
        bcast_var: Indexable holding the labels at ``bcast_var[0]``, the number
            of cross-validation folds at ``bcast_var[1]`` and the classifier
            at ``bcast_var[2]``.

    Returns:
        Mean cross-validation score over the (unshuffled) stratified folds.
    """
    labels = bcast_var[0]
    n_folds = bcast_var[1]
    classifier = bcast_var[2]

    samples = l[0][mask, :].T
    folds = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=False)
    fold_scores = model_selection.cross_val_score(
        classifier, samples, y=labels, cv=folds, n_jobs=1
    )
    return np.mean(fold_scores)
def example_of_cross_validation_using_model_selection(raw_data, labels,
                                                      num_subjects,
                                                      num_epochs_per_subj):
    """Run leave-one-subject-out style cross-validation with a precomputed-kernel SVM.

    NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    when the kernel matrix is computed in portions; also, this method only
    works for self-correlation, i.e. correlation between the same data matrix.

    Args:
        raw_data: Sequence of data items; each item is paired with itself
            before being handed to the classifier.
        labels: Labels passed as ``y`` to ``cross_val_score``.
        num_subjects: Number of stratified folds.
        num_epochs_per_subj: Forwarded to ``Classifier`` as ``epochs_per_subj``.

    The per-fold scores are printed and their mean is logged at INFO level.
    """
    # No shrinking, C=1.
    kernel_svm = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(kernel_svm, epochs_per_subj=num_epochs_per_subj)

    # One fold per subject, without shuffling.
    folds = model_selection.StratifiedKFold(n_splits=num_subjects, shuffle=False)
    paired_data = list(zip(raw_data, raw_data))
    scores = model_selection.cross_val_score(clf, paired_data, y=labels, cv=folds)

    print(scores)
    message = 'the overall cross validation accuracy is %.2f' % np.mean(scores)
    logger.info(message)
def test_cv():
    """Check that GLM works with cross_val_score and GridSearchCV."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    n_samples = X.shape[0]
    cv = KFold(n_samples, 5)

    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)

    # cross_val_score must return one score per fold.
    scores = cross_val_score(glm_normal, X, y, cv=cv)
    assert_equal(len(scores), 5)

    # Two independent grids: one over alpha, one over reg_lambda.
    alpha_grid = {'alpha': np.linspace(0.01, 0.99, 2)}
    lambda_grid = {
        'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), 10,
                                  base=np.exp(1))
    }
    glmcv = GridSearchCV(glm_normal, [alpha_grid, lambda_grid], cv=cv)
    glmcv.fit(X, y)
def model_cross_valid(X, Y):
    """Print the mean 10-fold negated MSE of several default-configured regressors.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        Y: Regression targets passed to ``cross_val_score``.

    For every model class the class object and the mean
    ``neg_mean_squared_error`` over the folds are printed. Nothing is returned.
    """
    # Contiguous, unshuffled folds. The previous code also passed
    # ``random_state`` here; without ``shuffle=True`` the seed was never used,
    # and current scikit-learn rejects that combination with a ValueError.
    kfold = model_selection.KFold(n_splits=10)
    scoring = 'neg_mean_squared_error'

    # Further candidates that were tried at some point: Ridge, Lasso,
    # KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor,
    # AdaBoostRegressor, GradientBoostingRegressor.
    for model_class in (LinearRegression, ElasticNet):
        model = model_class()
        results = model_selection.cross_val_score(
            model, X, Y, cv=kfold, scoring=scoring
        )
        print(model_class, results.mean())
def fit_regression(X, y, regression_class=LinearRegression, regularization_const=.001):
    """Fit a regressor on (X, y) and estimate its cross-validated RMSE.

    Args:
        X (pandas DataFrame): The data.
        y (pandas DataFrame or Series): The answers.
        regression_class (class): One of
            sklearn.linear_model.[LinearRegression, Ridge, Lasso].
        regularization_const: Regularization strength, passed as ``alpha``
            (together with ``normalize=True``) to every class other than
            ``LinearRegression``.

    Returns:
        tuple: ``(fitted_regressor, mean_rmse)`` where ``mean_rmse`` is the mean
        over folds of the square root of the negated
        ``neg_mean_squared_error`` scores.
    """
    if regression_class is not LinearRegression:
        predictor = regression_class(alpha=regularization_const, normalize=True)
    else:
        predictor = regression_class()

    predictor.fit(X, y)

    neg_mse_per_fold = cross_val_score(
        predictor, X, y=y, scoring='neg_mean_squared_error'
    )
    # The scorer reports negated MSE; flip the sign before taking the root.
    rmse_per_fold = np.sqrt(-1 * neg_mse_per_fold)
    return (predictor, np.mean(rmse_per_fold))
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    pipeline = make_pipeline(MDR(), LogisticRegression())
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=splitter)

    assert np.mean(cv_scores) > 0.
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    pipeline = make_pipeline(MDR(), LogisticRegression())
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    # n_jobs=-1 is the point of this test: the folds run in parallel.
    cv_scores = cross_val_score(pipeline, features, classes, cv=splitter,
                                n_jobs=-1)

    assert np.mean(cv_scores) > 0.
def eval_models(eda_objs, clfs):
    """Cross-validate every classifier against every pipeline object.

    Parameters
    ----------
    eda_objs : iterable of (pipe_name, obj) pairs
        Each ``obj`` must expose ``df``, ``y`` (the target column key) and
        ``_get_input_features()`` (the feature column keys).
    clfs : list of (clf_name, clf) pairs
        A value that is not a list is wrapped into a one-element list first.

    Returns
    -------
    list of (clf_name, pipe_name, score) tuples
        One tuple per fold of a 5-fold, R^2-scored cross-validation.
    """
    if not isinstance(clfs, list):
        clfs = [clfs]

    results = []
    for clf_name, clf in clfs:
        for pipe_name, obj in eda_objs:
            X = obj.df[obj._get_input_features()]
            y = obj.df[obj.y]
            fold_scores = cross_val_score(estimator=clf, X=X, y=y, cv=5,
                                          scoring='r2')
            results.extend((clf_name, pipe_name, v) for v in fold_scores)
    return results
def __init__(self, model, ax=None, alphas=None, cv=None, scoring=None, **kwargs):
    """Set up manual alpha selection for a non-CV regularization model.

    Args:
        model: Estimator to wrap. Its class name must not end in ``"CV"``.
        ax: Forwarded to the parent initializer.
        alphas: Alpha values to evaluate. ``None`` or an empty sequence
            selects the default ``np.logspace(-10, -2, 200)``.
        cv: Forwarded to ``cross_val_score`` as ``cv``.
        scoring: Forwarded to ``cross_val_score`` as ``scoring``.
        **kwargs: Forwarded to the parent initializer.

    Raises:
        YellowbrickTypeError: If ``model`` is a "...CV" estimator.
    """
    # Check to make sure this is not a "RegressorCV"
    name = model.__class__.__name__
    if name.endswith("CV"):
        raise YellowbrickTypeError((
            "'{}' is a CV regularization model;"
            " try AlphaSelection instead."
        ).format(name))

    # Call super to initialize the class
    super(ManualAlphaSelection, self).__init__(model, ax=ax, **kwargs)

    # ``alphas or default`` raised "truth value of an array is ambiguous" for
    # numpy arrays with more than one element, so test for None/empty instead.
    if alphas is None or np.size(alphas) == 0:
        alphas = np.logspace(-10, -2, 200)

    # Set manual alpha selection parameters
    self.alphas = alphas
    self.errors = None
    self.score_method = partial(cross_val_score, cv=cv, scoring=scoring)
def train_model(team_stats, result_data, test_data):
    """Train a logistic-regression model and predict every game in ``test_data``.

    Args:
        team_stats: Team statistics, used to build the training set and passed
            through to ``predict_winner``.
        result_data: Historical results, used to build the training set.
        test_data: DataFrame with ``'Vteam'`` and ``'Hteam'`` columns; one
            prediction is made per row.

    Returns:
        list: ``pred[0][0]`` of every ``predict_winner`` result, in row order.
    """
    # Build the training set.
    X, y = build_dataSet(team_stats, result_data)

    # Fit the model on all samples.
    print("Fitting on %d game samples.." % len(X))
    model = LogisticRegression()
    model.fit(X, y)

    # Report the mean accuracy of a 10-fold cross-validation.
    print("Doing cross-validation..")
    cv_accuracy = cross_val_score(model, X, y, cv = 10, scoring='accuracy', n_jobs=-1)
    print(cv_accuracy.mean())

    # Use the fitted model on the test games.
    print('Predicting on test data..')
    result = []
    for _, game in test_data.iterrows():
        visitor = game['Vteam']
        home = game['Hteam']
        prediction = predict_winner(visitor, home, model, team_stats)
        result.append(prediction[0][0])
    return result
def compute_cross_fold(data):
    """Cross-validate a linear SVM that predicts the 'Profitable' column.

    Args:
        data: Feature matrix; it is standardized with ``StandardScaler``
            before training.

    The target is read from the ``'Profitable'`` column of ``total_set.csv``
    in the current working directory.

    Returns:
        tuple: ``(mean, 2 * std)`` of the 10-fold cross-validation scores.
    """
    data_table = pd.read_csv("total_set.csv",index_col=0)

    # Standardize the features.
    data_scaled = preprocessing.StandardScaler().fit(data).transform(data)

    profitability_target = data_table['Profitable']

    # An RBF alternative that was tried: svm.SVC(kernel='rbf', C=0.8, gamma=5)
    clf_profit = svm.LinearSVC(C=0.001,verbose=True,tol=.1)
    clf_profit.fit(data_scaled,profitability_target)

    scores = cross_val_score(clf_profit, data_scaled, profitability_target, cv=10)
    spread = scores.std() * 2
    return (scores.mean(), spread)
def cross_validation():
    """Pick k for k-NN by 10-fold cross-validation and plot accuracy against k.

    Training data comes from ``load_data()``. For k = 1..29 the mean
    cross-validated accuracy is computed; the k with the lowest
    misclassification rate (first one on ties) is printed and the accuracy
    curve is shown with matplotlib. Nothing is returned.
    """
    x_train, x_test, y_train, y_test = load_data()

    k_lst = list(range(1, 30))
    lst_scores = []
    for k in k_lst:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
        lst_scores.append(scores.mean())

    # Convert accuracy to misclassification rate and take its first minimum.
    error_rates = [1 - x for x in lst_scores]
    optimal_k = k_lst[error_rates.index(min(error_rates))]
    # Was a Python 2 print statement (SyntaxError on Python 3); the call form
    # prints the same text on both.
    print("The optimal number of neighbors is %d" % optimal_k)

    plt.plot(k_lst, lst_scores)
    plt.xlabel('Number of Neighbors K')
    plt.ylabel('correct classification rate')
    plt.show()
def test_cross_val_score_predict_labels():
    """Label-based CV iterators must raise ValueError when labels is None.

    Checks that the error propagates through both ``cross_val_score`` and
    ``cross_val_predict`` for every label-aware splitter.
    """
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
    clf = SVC(kernel="linear")

    expected_message = "The labels parameter should not be None"
    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]

    for cv in label_cvs:
        for cv_function in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError, expected_message,
                                 cv_function, estimator=clf, X=X, y=y, cv=cv)
def test_cross_val_score_pandas():
    """cross_val_score must hand dataframe/series inputs to the estimator unchanged."""
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
    except ImportError:
        pass
    else:
        types.append((Series, DataFrame))

    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        # 3 fold cross val is used so we need at least 3 samples per class
        X_df = InputFeatureType(X)
        y_ser = TargetType(y2)

        def check_df(x):
            return isinstance(x, InputFeatureType)

        def check_series(x):
            return isinstance(x, TargetType)

        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cross_val_score(clf, X_df, y_ser)
def test_cross_val_score_precomputed():
    """cross_val_score must support SVC with a precomputed kernel."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # A precomputed linear Gram matrix must score exactly like kernel="linear".
    gram = np.dot(X, X.T)
    score_precomputed = cross_val_score(SVC(kernel="precomputed"), gram, y)
    score_linear = cross_val_score(SVC(kernel="linear"), X, y)
    assert_array_equal(score_precomputed, score_linear)

    precomputed_svc = SVC(kernel="precomputed")

    # Error raised for non-square X
    assert_raises(ValueError, cross_val_score, precomputed_svc, X, y)

    # Error raised when the precomputed kernel is neither array-like
    # nor sparse
    assert_raises(ValueError, cross_val_score, precomputed_svc,
                  gram.tolist(), y)
def test_cross_val_score_with_score_func_classification():
    """Default, accuracy and weighted-F1 scoring must agree on iris."""
    iris = load_iris()
    clf = SVC(kernel='linear')
    expected = [0.97, 1., 0.97, 0.97, 1.]

    scoring_variants = (
        # Default score (should be the accuracy score)
        {},
        # Correct classification score (aka. zero / one score) - should be
        # the same as the default estimator score
        {'scoring': "accuracy"},
        # F1 score (classes are balanced so f1_score should be equal to the
        # zero/one score)
        {'scoring': "f1_weighted"},
    )
    for scoring_kwargs in scoring_variants:
        scores = cross_val_score(clf, iris.data, iris.target, cv=5,
                                 **scoring_kwargs)
        assert_array_almost_equal(scores, expected, 2)
def rmse_cv(model, x_train, y_train):
    """Return the per-fold RMSE of ``model`` from 5-fold cross-validation.

    The scorer yields negated MSE, so the sign is flipped before the root.
    """
    neg_mse = cross_val_score(model, x_train, y_train,
                              scoring='neg_mean_squared_error', cv=5)
    return np.sqrt(-neg_mse)
def KFold_CrossValidation(self, scoring_metric):
    """Cross-validate ``self.alg`` on the training data.

    Args:
        scoring_metric: Passed to ``cross_val_score`` as ``scoring``.

    Returns:
        dict: ``'mean_error'`` and ``'std_error'`` of the fold scores and the
        raw fold scores under ``'all_error'``.
    """
    train = self.datablock.train
    features = train[self.predictors].values
    target = train[self.datablock.target].values

    # Generate cross validation folds for the training dataset.
    fold_scores = model_selection.cross_val_score(
        estimator=self.alg, X=features, y=target,
        cv=self.cv_folds, scoring=scoring_metric, n_jobs=-1,
    )

    summary = {'mean_error': np.mean(fold_scores)}
    summary['std_error'] = np.std(fold_scores)
    summary['all_error'] = fold_scores
    return summary
def feval(d):
    """Objective: mean minus standard deviation of 5-fold random-forest accuracy.

    Args:
        d: Mapping with ``'max_depth'`` and ``'n_estimators'`` entries.

    The data is read from the module-level ``data_X`` and ``data_y``.
    """
    depth = d['max_depth']
    trees = d['n_estimators']
    forest = RandomForestClassifier(n_jobs=-1, max_depth=depth,
                                    n_estimators=trees)
    fold_accuracy = cross_val_score(forest, data_X, data_y, cv=5,
                                    scoring='accuracy')
    # Penalize unstable configurations by subtracting the spread.
    return np.mean(fold_accuracy) - np.std(fold_accuracy)
def _cross_validation_for_one_voxel(clf, vid, num_folds, subject_data, labels):
    """Score classifier on data using cross validation.

    Args:
        clf: Classifier to evaluate.
        vid: Voxel id; logged and returned alongside the score.
        num_folds: Number of stratified folds.
        subject_data: Samples passed to ``cross_val_score``.
        labels: Labels passed as ``y``.

    Returns:
        tuple: ``(vid, mean cross-validation score)``.
    """
    # Stratified folds without shuffling.
    folds = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=False)
    fold_scores = model_selection.cross_val_score(
        clf, subject_data, y=labels, cv=folds, n_jobs=1
    )
    message = 'cross validation for voxel %d is done' % vid
    logger.debug(message)
    return (vid, fold_scores.mean())
def adaBoost(self, settings, data=None, dropna=True):
    """Cross-validate and fit a 500-tree AdaBoost classifier.

    Args:
        settings: Unused by this method.
        data: Forwarded to ``self.__loadData``.
        dropna: Forwarded to ``self.__loadData``.

    The last column of the loaded frame is the target, all others are
    features. Prints the splitter, the mean 10-fold CV score and the score on
    the full training data.

    Returns:
        bool: Always ``True``.
    """
    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values

    seed = 7
    num_trees = 500

    # Contiguous, unshuffled folds. ``random_state`` was dropped here: without
    # ``shuffle=True`` it never affected the split, and current scikit-learn
    # raises ValueError for that combination.
    kfold = model_selection.KFold(n_splits=10)
    # The prints below were Python 2 statements (SyntaxError on Python 3).
    print(kfold)

    model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    results = model_selection.cross_val_score(model, X, y, cv=kfold)
    model.fit(X, y)

    print(results.mean())
    print(model.score(X, y))
    return True
def compute_accuracies(lr, dt, svc, vc, X, Y):
    """Return and print the mean 10-fold accuracy of four estimators.

    Args:
        lr, dt, svc, vc: Estimators, evaluated in this order.
        X: Feature matrix.
        Y: Labels.

    Returns:
        list: Mean accuracy per estimator, in argument order.
    """
    accuracies = [
        cross_val_score(estimator, X, Y, scoring='accuracy', cv=10).mean()
        for estimator in (lr, dt, svc, vc)
    ]
    print('Accuracies:')
    print(np.array(accuracies))
    return accuracies
def multiprocessing_grid_search(queue, shared_list, persistent_object):
    """Explore cross validation grid using multiprocessing.

    Consumes work items from ``queue`` until a ``None`` sentinel arrives.

    Args:
        queue: Object whose ``get()`` yields either ``None`` or a
            ``(grid, (estimator, x), cross_val_score_kwargs)`` tuple.
        shared_list: List-like collector; one result mapping is appended per
            work item.
        persistent_object: Score cache exposing ``retrieve(estimator, grid)``
            and ``update(estimator, grid, scores)``.

    Each appended result is a copy of ``grid`` with an extra ``'scores'`` key.
    Scores are computed with ``cross_val_score`` only when the cache returns
    ``None``; freshly computed scores are written back to the cache.
    """
    while True:
        work_item = queue.get()
        if work_item is None:
            # Sentinel: no more work.
            return

        grid, cvs_args, cvs_kwargs = work_item
        estimator, x = cvs_args
        estimator.set_params(**grid)

        # Reuse a previously stored result when there is one.
        scores = persistent_object.retrieve(estimator, grid)
        if scores is None:
            scores = cross_val_score(estimator, x, **cvs_kwargs)
            persistent_object.update(estimator, grid, scores)

        entry = grid.copy()
        entry['scores'] = scores
        shared_list.append(entry)
def score(self, params):
    """Objective function: 5-fold cross-validated score of ``self.level0``.

    Args:
        params: Hyper-parameter mapping. It is first passed to
            ``self.change_to_int`` together with ``self.to_int_params`` and
            then applied to ``self.level0`` via ``set_params``.

    Returns:
        dict: ``{'loss': -mean_score, 'status': STATUS_OK}``.

    The optimizer consuming this dict minimizes ``'loss'``, while the
    cross-validation score is assumed to be greater-is-better (the
    scikit-learn ``score`` convention), so the mean is negated. Previously the
    raw mean was returned, which steers the search towards the worst-scoring
    parameters.
    NOTE(review): if ``self.level0.score`` is actually an error measure, drop
    the negation.
    """
    self.change_to_int(params, self.to_int_params)
    self.level0.set_params(**params)

    score = model_selection.cross_val_score(
        self.level0, self.trainX, self.trainY, cv=5, n_jobs=-1
    )
    print('%s ------ Score Mean:%f, Std:%f' % (params, score.mean(), score.std()))
    return {'loss': -score.mean(), 'status': STATUS_OK}
def evaluateModel(C, gamma):
    """Return the average cross-validation score of an SVC.

    Args:
        C: Base-10 exponent of the SVC ``C`` parameter.
        gamma: Base-10 exponent of the SVC ``gamma`` parameter.

    The data is read from the module-level ``X`` and ``y``.
    """
    classifier = SVC(C=10**C, gamma=10**gamma)
    fold_scores = cross_val_score(classifier, X, y)
    return np.average(fold_scores)
def rmsle_cv(model):
    """Return the per-fold RMSE of ``model`` using shuffled K-fold cross-validation.

    Uses the module-level ``n_folds``, ``train`` and ``y_train``.

    Args:
        model: Estimator to evaluate.

    Returns:
        Array with one RMSE per fold (square root of the negated
        ``neg_mean_squared_error`` scores).
    """
    # Pass the splitter itself. The previous code called
    # ``.get_n_splits(train.values)`` on it, which returns only the integer
    # number of folds, so ``shuffle=True`` and the seed were silently
    # discarded and plain unshuffled folds were used.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    rmse = np.sqrt(-neg_mse)
    return rmse
def cross_validate(estimator, training_data, training_targets):
    """Cross-validate ``estimator`` with two scorers.

    Args:
        estimator: Estimator to evaluate.
        training_data: Feature matrix.
        training_targets: Targets.

    Returns:
        tuple: ``(-mean(score_a), mean(r2))`` where ``score_a`` comes from the
        module-level ``root_mean_log_squared_error`` scorer and ``r2`` from
        the built-in ``'r2'`` scorer.
    """
    def mean_score(scoring):
        fold_scores = cross_val_score(estimator, X=training_data,
                                      y=training_targets, scoring=scoring)
        return np.mean(fold_scores)

    error_mean = mean_score(root_mean_log_squared_error)
    r2_mean = mean_score('r2')
    return (-1 * error_mean, r2_mean)
def perform_adaboost(self,X_train_std,y_train,X_test_std, y_test):
    """Fit AdaBoost, print its accuracies and plot its decision regions.

    Args:
        X_train_std: Training features.
        y_train: Training labels.
        X_test_std: Test features; only the first two columns are plotted,
            and the fitted model is asked to predict on a two-column grid.
        y_test: Test labels (at most five distinct classes can be drawn,
            one marker/colour each).

    Prints the cross-validated accuracy on the training data and the accuracy
    of the fitted model on the test data, then shows the decision surface with
    the test points on top.
    """
    ada = AdaBoostClassifier(n_estimators=10)
    ada.fit(X_train_std, y_train)

    train_score=cross_val_score(ada,X_train_std, y_train)
    print('The training accuracy is {:.2f}%'.format(train_score.mean()*100))

    # Evaluate the model fitted above on the held-out data. The previous code
    # ran cross_val_score on the test set, which refits copies of the model on
    # test folds and therefore does not measure this model's test accuracy.
    test_score = ada.score(X_test_std, y_test)
    print('The test accuracy is {:.2f}%'.format(test_score*100))

    X = X_test_std
    y = y_test
    resolution = 0.01
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'green', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface over a grid padded by 1 on every side
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = ada.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # overlay the test samples, one marker/colour per class
    for idx, cl in enumerate(classes):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.5, c=cmap(idx),
                    marker=markers[idx], label=cl)
    plt.show()
def perform_random_forest(self,X_train_std,y_train,X_test_std, y_test):
    """Fit a random forest, print its accuracies and plot its decision regions.

    Args:
        X_train_std: Training features.
        y_train: Training labels.
        X_test_std: Test features; only the first two columns are plotted,
            and the fitted model is asked to predict on a two-column grid.
        y_test: Test labels (at most five distinct classes can be drawn,
            one marker/colour each).

    Prints the cross-validated accuracy on the training data and the accuracy
    of the fitted model on the test data, then shows the decision surface with
    the test points on top.
    """
    rfc = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)
    # Create the random forest instance above and fit it to the training data.
    rfc.fit(X_train_std, y_train)

    train_score=cross_val_score(rfc,X_train_std, y_train)
    print('The training accuracy is {:.2f}%'.format(train_score.mean()*100))

    # Evaluate the model fitted above on the held-out data. The previous code
    # ran cross_val_score on the test set, which refits copies of the model on
    # test folds and therefore does not measure this model's test accuracy.
    test_score = rfc.score(X_test_std, y_test)
    print('The test accuracy is {:.2f}%'.format(test_score*100))

    X = X_test_std
    y = y_test
    resolution = 0.01
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'green', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface over a grid padded by 1 on every side
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = rfc.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # overlay the test samples, one marker/colour per class
    for idx, cl in enumerate(classes):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.5, c=cmap(idx),
                    marker=markers[idx], label=cl)
    plt.show()
def CV_eval(model, X, y):
    """Perform 8-fold cross-validation and report the scores.

    Args:
        model: Estimator to evaluate.
        X: Feature data.
        y: Target data.

    Returns:
        Mean of the cross-validation scores. The fold scores and a
        "mean (+/- 2 std)" summary line are printed as a side effect.
    """
    fold_scores = cross_val_score(model, X, y, cv=8)
    pprint(fold_scores)

    mean_score = fold_scores.mean()
    print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, fold_scores.std() * 2))
    return fold_scores.mean()
def knn(data, predict=False, best_n=None):
    """Build a k-NN classifier, or search for a good ``n_neighbors``.

    Args:
        data: Object exposing ``X_train`` and ``y_train``.
        predict: Unused.
        best_n: When truthy, an unfitted ``KNeighborsClassifier`` with that
            many neighbours is returned immediately.

    Returns:
        The classifier when ``best_n`` is given. Otherwise ``None``: the
        ``(n_neighbors, mean 5-fold score)`` pairs for n = 4..50 are printed,
        best score first.
    """
    if best_n:
        return KNeighborsClassifier(n_neighbors=best_n)

    candidates = []
    for n_neighbors in range(4, 51):
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        fold_scores = cross_val_score(model, data.X_train, data.y_train, cv=5)
        candidates.append((n_neighbors, fold_scores.mean()))

    # Stable sort, highest mean score first.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    print(candidates)
def svm_clf(data):
    """Cross-validate a linear SVM (C=1) and return the estimator.

    Runs 10-fold cross-validation on ``data.X_train`` / ``data.y_train`` five
    times, printing the mean of each round and then the fold scores of the
    last round.

    Args:
        data: Object exposing ``X_train`` and ``y_train``.

    Returns:
        The ``LinearSVC`` instance that was evaluated. This function never
        calls ``fit`` on it directly.
    """
    classifier = svm.LinearSVC(C=1)
    for round_no in range(5):
        fold_scores = cross_val_score(classifier, data.X_train, data.y_train,
                                      cv=10)
        print("iteration",round_no, "svm mean:", fold_scores.mean())
    print("svm train scores:\n", list(fold_scores))
    return classifier


# use knn for impute missing values
def knn(data, predict=False):
    """Cross-validate a 3-nearest-neighbours classifier and return it.

    Runs 10-fold cross-validation on ``data.X_train`` / ``data.y_train`` five
    times, printing the mean of each round and then the fold scores of the
    last round.

    Args:
        data: Object exposing ``X_train`` and ``y_train``.
        predict: Unused.

    Returns:
        The ``KNeighborsClassifier`` (``n_neighbors=3``) that was evaluated.
        This function never calls ``fit`` on it directly.
    """
    n_neighbors = 3
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)

    for _ in range(5):
        scores = cross_val_score(clf, data.X_train, data.y_train, cv=10)
        # The labels used to say "svm" (copy/paste from the SVM helper).
        print("knn mean:", scores.mean())
    print("knn train scores:\n", list(scores))

    # The original built a second, identically configured classifier here;
    # returning the evaluated instance is equivalent.
    return clf
def regression(filename):
    """Fit a linear regression on one data file and return its coefficients.

    Args:
        filename: Passed to ``loadDataSet`` and used as the row label of the
            returned frame.

    Prints MAE, MSE and RMSE on a held-out split (``random_state=1``) and a
    "mean (+/- 2 std)" summary of the 5-fold cross-validation score on the
    full data.

    Returns:
        pandas.DataFrame: One row holding ``linreg.coef_``, with the feature
        names as columns.
    """
    # ``sklearn.cross_validation`` was removed from scikit-learn; prefer
    # ``model_selection`` and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn import metrics

    print(filename)
    X,y = loadDataSet(filename)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    # Feature names that label the coefficient columns.
    feature_cols = ['????', '????', '??????','?????','??????','???????','???????','?????????','??????']

    y_pred = linreg.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print("MAE:",metrics.mean_absolute_error(y_test, y_pred))
    print("MSE:",mse)
    print('RMSE:',np.sqrt(mse))

    scores = cross_val_score(linreg, X, y,cv=5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    res = pd.DataFrame(linreg.coef_,columns=feature_cols,index=[filename])
    return (res)

#files = ['?????3?.xlsx','?????4?.xlsx','?????5?.xlsx','?????6?.xlsx']
def regression(filename):
    """Fit a linear regression on one data file and return its coefficients.

    Args:
        filename: Passed to ``loadDataSet``; the part before the first ``'.'``
            becomes the row label of the returned frame.

    Prints the data shape, MAE/MSE/RMSE on a 25% held-out split
    (``random_state=1``), the 3-fold cross-validation scores on the full data
    and their "mean (+/- 2 std)" summary.

    Returns:
        pandas.DataFrame: One row with the leading ``len(feature_cols)``
        coefficients, labelled by feature name.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn import metrics

    X,y = loadDataSet(filename)
    print(filename,X.shape)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, test_size=0.25)

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    # Feature names that label the coefficient columns.
    feature_cols = ['????', '????', '??????','?????','??????','???????','???????','?????????','??????']

    y_pred = linreg.predict(X_test)
    print("MAE:",metrics.mean_absolute_error(y_test, y_pred))
    print("MSE:",metrics.mean_squared_error(y_test, y_pred))
    print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

    scores = cross_val_score(linreg, X, y,cv=3)
    print('scores:',scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Keep only as many coefficients as there are named features.
    named_coefs = linreg.coef_.T[:len(feature_cols)].T
    row_label = filename.split('.')[0]
    res = pd.DataFrame(named_coefs,columns=feature_cols,index=[row_label])
    return (res)

#files = ['201603.xlsx','201604.xlsx','201605.xlsx','?????3?.xlsx','?????4?.xlsx','?????5?.xlsx','?????6?.xlsx']
#files = ['?????3?.xlsx','?????4?.xlsx','?????5?.xlsx','?????6?.xlsx','201703_06.xlsx']
#files = ['201703_06.xlsx']
def cross_validation(self):
    """Print macro-F1 scores of ``self.clf`` over five shuffled 80/20 splits.

    Uses ``self.training_data`` and ``self.training_target`` with a
    ``ShuffleSplit`` seeded with 20. Prints the per-split scores and a
    "mean (+/- 2 std)" summary line; nothing is returned.
    """
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)
    scores = cross_val_score(self.clf, self.training_data,
                             self.training_target, cv=cv, scoring='f1_macro')
    # Was a Python 2 print statement (SyntaxError on Python 3).
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def score(self, clf, X, y, groups, n_jobs=1):
    """Return the mean cross-validated accuracy of ``clf``.

    Args:
        clf: Estimator to evaluate.
        X: Feature matrix.
        y: Labels.
        groups: Group label per sample. With more than one distinct group the
            split is leave-one-group-out; otherwise a shuffled 5-fold split
            (seed 45) is used.
        n_jobs: Forwarded to ``cross_val_score``.
    """
    several_groups = len(np.unique(groups)) > 1
    if several_groups:
        splitter = LeaveOneGroupOut()
    else:
        splitter = KFold(5, shuffle=True, random_state=45)

    fold_scores = cross_val_score(clf, X, y, groups=groups, cv=splitter,
                                  scoring='accuracy', n_jobs=n_jobs)
    return fold_scores.mean()
def score(self, clf, X, y, groups, n_jobs=1):
    """Return the mean cross-validated ROC AUC of ``clf``.

    Args:
        clf: Estimator to evaluate.
        X: Feature matrix.
        y: Labels.
        groups: Group label per sample. With more than one distinct group the
            split is leave-one-group-out; otherwise a shuffled 5-fold split
            (seed 45) is used.
        n_jobs: Forwarded to ``cross_val_score``.
    """
    several_groups = len(np.unique(groups)) > 1
    if several_groups:
        splitter = LeaveOneGroupOut()
    else:
        splitter = KFold(5, shuffle=True, random_state=45)

    fold_auc = cross_val_score(clf, X, y, groups=groups, cv=splitter,
                               scoring='roc_auc', n_jobs=n_jobs)
    return fold_auc.mean()
def rmse_cv(model, X , y):
    """Return the per-fold RMSE of ``model`` from 5-fold cross-validation."""
    neg_mse = cross_val_score(model, X, y,
                              scoring="neg_mean_squared_error", cv = 5)
    # The scorer yields negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse)

#%%
def rmse_cv(model, X, Y):
    """Return the square root of the negated 10-fold scores of ``model``.

    Scoring uses the module-level ``scorer``.
    NOTE(review): presumably a negated-MSE scorer, which makes the result a
    per-fold RMSE; confirm against its definition.
    """
    fold_scores = cross_val_score(model, X, Y, scoring=scorer, cv=10)
    return np.sqrt(-fold_scores)
def check_model(model, splits, X, y):
    """Return the average negated mean absolute error of ``model``.

    Args:
        model: Estimator to evaluate.
        splits: Passed to ``cross_val_score`` as ``cv``.
        X: Feature matrix.
        y: Targets.
    """
    fold_scores = cross_val_score(model, X, y, cv=splits,
                                  scoring='neg_mean_absolute_error')
    total = sum(fold_scores)
    return total / len(fold_scores)
def perform_CV(self, X_train, y_train, number_folds, n, m):
    """Return the mean cross-validation score of a random forest.

    Args:
        X_train: Training features.
        y_train: Training labels.
        number_folds: Passed to ``cross_val_score`` as ``cv``.
        n: Number of trees (``n_estimators``).
        m: Maximum number of features (``max_features``).

    The forest is built with ``n_jobs=8`` and the verbosity taken from
    ``self.paras.verbose``.
    """
    forest = RandomForestClassifier(n_estimators=n, max_features=m, n_jobs=8,
                                    verbose=self.paras.verbose)
    fold_scores = cross_val_score(forest, X_train, y_train, cv=number_folds)
    return np.mean(fold_scores)

# MODEL SELECTION : Find best parameters
######################################
## Inputs : X_train, y_train, number of folds, range of number of trees, range of max of features
## Outputs : optimal number of trees, optimal max of features, accuracy
def predict_trait(X, Y):
    """Return the mean 10-fold accuracy of a default ``svm.SVC`` on (X, Y)."""
    classifier = svm.SVC()
    fold_accuracy = cross_val_score(classifier, X, Y, scoring='accuracy',
                                    cv=10)
    return fold_accuracy.mean()
def fit_and_predict(nome, modelo, treino_dados, treino_marcacoes):
    """Print and return the mean 10-fold cross-validation score of a model.

    Args:
        nome: Display name used in the printed message.
        modelo: Estimator to evaluate.
        treino_dados: Training features.
        treino_marcacoes: Training labels.

    Returns:
        Mean of the fold scores (the "hit rate" shown in the message).
    """
    fold_scores = cross_val_score(modelo, treino_dados, treino_marcacoes,
                                  cv = 10)
    taxa_de_acerto = np.mean(fold_scores)
    print("Taxa de acerto do {0}: {1}".format(nome, taxa_de_acerto))
    return taxa_de_acerto