The following 50 code examples, extracted from open source Python projects, illustrate how to use sklearn.cross_validation.cross_val_score().
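For orientation before the project examples, here is a minimal self-contained sketch of the basic call. The Iris dataset and linear SVC are illustrative choices of ours, not taken from any project below; note that sklearn.cross_validation was deprecated in scikit-learn 0.18 in favor of sklearn.model_selection, so this import only works on older versions.

from sklearn import datasets, svm
from sklearn.cross_validation import cross_val_score

iris = datasets.load_iris()
clf = svm.SVC(kernel='linear', C=1)

# cross_val_score fits a clone of the estimator on each training fold and
# scores it on the held-out fold; cv=5 returns an array of 5 accuracy scores.
scores = cross_val_score(clf, iris.data, iris.target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))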
def print_accuracy_report(classifier, X, y, num_validations=5):
    accuracy = cross_validation.cross_val_score(classifier, X, y,
            scoring='accuracy', cv=num_validations)
    print "Accuracy: " + str(round(100*accuracy.mean(), 2)) + "%"

    f1 = cross_validation.cross_val_score(classifier, X, y,
            scoring='f1_weighted', cv=num_validations)
    print "F1: " + str(round(100*f1.mean(), 2)) + "%"

    precision = cross_validation.cross_val_score(classifier, X, y,
            scoring='precision_weighted', cv=num_validations)
    print "Precision: " + str(round(100*precision.mean(), 2)) + "%"

    recall = cross_validation.cross_val_score(classifier, X, y,
            scoring='recall_weighted', cv=num_validations)
    print "Recall: " + str(round(100*recall.mean(), 2)) + "%"
def rfr_feature_select():
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]
    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))
    print sorted(scores, reverse=True)
def evaluate_model(model, X_train, y_train):
    '''
    INPUT
         - model: this is a classification model from sklearn
         - X_train: 2d array of the features
         - y_train: 1d array of the target
    OUTPUT
         - information about the model's accuracy using
           10 fold cross validation
         - model: the fit model
    Returns the model
    '''
    print(np.mean(cross_val_score(model, X_train, y_train,
                                  cv=10, n_jobs=-1, verbose=10)))
    model.fit(X_train, y_train)
    return model
def experiment(model_class, vectorizer, xval):
    name = model_class.__class__.__name__ + '.' + model_class.penalty
    model = model_class.fit(X, y)
    model_weights = vectorizer.inverse_transform(model.coef_)[0]
    with open('weights.%s.txt' % name, 'w') as f:
        f.write('%s\t%f\n' % ('(intercept)', model.intercept_))
        f.writelines('%s\t%f\n' % k for k in model_weights.items())

    acc_scores = cross_validation.cross_val_score(model, X, y, cv=xval)
    auc_scores = cross_validation.cross_val_score(model, X, y, scoring='roc_auc', cv=xval)
    prec_scores = cross_validation.cross_val_score(model, X, y, scoring='precision', cv=xval)
    recall_scores = cross_validation.cross_val_score(model, X, y, scoring='recall', cv=xval)
    f1_scores = cross_validation.cross_val_score(model, X, y, scoring='f1', cv=xval)

    print '-' * 80
    print 'acc\t%.4f\t%s' % (np.mean(acc_scores), name)
    print 'auc\t%.4f\t%s' % (np.mean(auc_scores), name)
    print 'prec\t%.4f\t%s' % (np.mean(prec_scores), name)
    print 'recall\t%.4f\t%s' % (np.mean(recall_scores), name)
    print 'f1\t%.4f\t%s' % (np.mean(f1_scores), name)
def trainLimited(self, featureFile, n_datapoints):
    (label_vector, input_vector) = loadData(featureFile)
    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=(0))
    n_totalrows = int((len(label_vector) / n_datapoints))
    for n in range(0, n_totalrows):
        limited_label_vector = trainLabels[0: (n + 1) * n_datapoints]
        limited_input_vector = trainData[0: (n + 1) * n_datapoints]

        kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
        kNNClassifier.fit(limited_input_vector, limited_label_vector)

        scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector,
                                                  limited_label_vector, cv=5)
        print '%f on %d datapoints' % ((sum(scores) / len(scores)), len(limited_label_vector))
def run_model(model, dtrain, predictor_var, target, scoring_method='mean_squared_error'):
    cv_method = KFold(len(dtrain), 5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target],
                                cv=cv_method, scoring=scoring_method)
    # print cv_scores, np.mean(cv_scores), np.sqrt((-1)*np.mean(cv_scores))

    dtrain_for_val = dtrain[dtrain['Year'] < 2000]
    dtest_for_val = dtrain[dtrain['Year'] > 1999]
    # cv_method = KFold(len(dtrain_for_val), 5)
    # cv_scores_2 = cross_val_score(model, dtrain_for_val[predictor_var],
    #                               dtrain_for_val[target], cv=cv_method, scoring=scoring_method)
    # print cv_scores_2, np.mean(cv_scores_2)

    dtrain_for_val_ini = dtrain_for_val[predictor_var]
    dtest_for_val_ini = dtest_for_val[predictor_var]
    model.fit(dtrain_for_val_ini, dtrain_for_val[target])
    pred_for_val = model.predict(dtest_for_val_ini)
    # print math.sqrt(mean_squared_error(dtest_for_val['Footfall'], pred_for_val))
def eval_model(name, model, data):
    print '=' * 20
    print name, 'training'
    model.fit(data, train.target, sample_weight=sample_weights)
    print name, 'trained'
    predictions = model.predict(processed_test_data)
    print name, 'accuracy', np.mean(predictions == test.target)
    print(metrics.classification_report(test.target, predictions))
    print metrics.confusion_matrix(test.target, predictions)
    print name, 'f1 cross validation', cross_validation.cross_val_score(
        model, grammar_processed_data, train.target, scoring='f1')
    print name, 'precision cross validation', cross_validation.cross_val_score(
        model, grammar_processed_data, train.target, scoring='precision')
    return model, predictions

# SVMs need balanced input features: similar ranges, variances, and so on
def cross_validation_report(clf, dataset):
    data = count_vectorizer.transform([row[0] for row in dataset])
    target = [row[1] for row in dataset]
    return cross_validation.cross_val_score(clf, data, target)
def evaluate(model, name):
    """ Evaluates model by cross validation. """
    # Get scores through cross validation
    score_f1 = cross_val_score(model, X, y, scoring='f1', cv=splitter_)
    score_pr = cross_val_score(model, X, y, scoring='precision', cv=splitter_)
    score_re = cross_val_score(model, X, y, scoring='recall', cv=splitter_)

    # Save image of score distributions
    save_dist(name, score_f1, score_pr, score_re)

    # Compute mean and std of each score
    result = DataFrame(index=['f1', 'precision', 'recall'], columns=['mean', 'std'])
    result.loc['f1', 'mean'] = np.mean(score_f1)
    result.loc['precision', 'mean'] = np.mean(score_pr)
    result.loc['recall', 'mean'] = np.mean(score_re)
    result.loc['f1', 'std'] = np.std(score_f1)
    result.loc['precision', 'std'] = np.std(score_pr)
    result.loc['recall', 'std'] = np.std(score_re)

    print model
    print result
def rf_from_cfg(cfg, seed):
    """
    Creates a random forest regressor from sklearn and fits the given data on it.
    This is the function-call we try to optimize. Chosen values are stored in
    the configuration (cfg).

    Parameters:
    -----------
    cfg: Configuration
        configuration chosen by smac
    seed: int or RandomState
        used to initialize the rf's random generator

    Returns:
    -----------
    np.mean(rmses): float
        mean of root mean square errors of random-forest test predictions
        per cv-fold
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))

    # Create a root mean square error scorer for sklearn's cross-validation
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11, scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because the scorer sign-flips the score
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"alpha": [1e0, 1e-1, 1e-2], "gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                             param_grid=Grid_Dict, scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)
    KRR_MSE = KernelRidge(kernel='rbf', alpha=krr_Tuned.best_params_['alpha'],
                          gamma=krr_Tuned.best_params_['gamma'])
    KRR_Time = time.time() - T0
    print('The computational time of Kernel Ridge Regression for ', n, ' examples is: ', KRR_Time)
    MSEs_KRR = cross_validation.cross_val_score(KRR_MSE, Scaled_Input_Data, Output_Data,
                                                cv=cross_validation.LeaveOneOut(n),
                                                scoring="mean_absolute_error")
    MeanMSE_KRR = np.mean(list(MSEs_KRR))
    print('The average MSE of Kernel Ridge Regression for ', n, ' examples is: ', (-1*MeanMSE_KRR))
    return(MeanMSE_KRR, krr_Tuned)
def evaluate_model(model, X_train, y_train):
    """
    Args:
        model (sklearn classification model): this model from sklearn that
            will be used to fit the data and to see the 10 fold cross val
            score of
        X_train (2d numpy array): this is the feature matrix
        y_train (1d numpy array): this is the array of targets
    Returns:
        prints information about the model's accuracy using 10 fold
        cross validation
        model (sklearn classification model): the model that has already
            been fit to the data
    """
    print(np.mean(cross_val_score(model, X_train, y_train, cv=10, n_jobs=-1, verbose=10)))
    model.fit(X_train, y_train)
    return model
def clf_scores(clf, x_train, y_train, x_test, y_test):
    info = dict()
    # TODO: extend this to a confusion matrix per fold for more flexibility
    #       downstream (tuning)
    # TODO: calculate a set of ROC curves per fold instead of running it on
    #       test, currently introducing bias
    scores = cross_val_score(clf, x_train, y_train, cv=cv, n_jobs=-1)
    runtime = time()
    clf.fit(x_train, y_train)
    runtime = time() - runtime
    y_test_predicted = clf.predict(x_test)
    info['runtime'] = runtime
    info['accuracy'] = min(scores)
    info['accuracy_test'] = accuracy_score(y_test, y_test_predicted)
    info['accuracy_folds'] = scores
    info['confusion_matrix'] = confusion_matrix(y_test, y_test_predicted)
    clf.fit(x_train, y_train)
    fpr, tpr, _ = roc_curve(y_test, clf_predict_proba(clf, x_test))
    info['fpr'] = fpr
    info['tpr'] = tpr
    info['auc'] = auc(fpr, tpr)
    return info
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        mask_train = np.zeros(len(y), dtype=np.bool)
        mask_test = np.zeros(len(y), dtype=np.bool)
        mask_train[train] = 1
        mask_test[test] = 1
        cv_masks.append((mask_train, mask_test))  # append the masks, not the index arrays
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks)
def test_cross_val_score_precomputed():
    # test for svm with precomputed kernel
    svm = SVC(kernel="precomputed")
    iris = load_iris()
    X, y = iris.data, iris.target
    linear_kernel = np.dot(X, X.T)
    score_precomputed = cval.cross_val_score(svm, linear_kernel, y)
    svm = SVC(kernel="linear")
    score_linear = cval.cross_val_score(svm, X, y)
    assert_array_equal(score_precomputed, score_linear)

    # Error raised for non-square X
    svm = SVC(kernel="precomputed")
    assert_raises(ValueError, cval.cross_val_score, svm, X, y)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cval.cross_val_score, svm,
                  linear_kernel.tolist(), y)
def test_cross_val_score_with_score_func_classification():
    iris = load_iris()
    clf = SVC(kernel='linear')

    # Default score (should be the accuracy score)
    scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="accuracy", cv=5)
    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # F1 score (classes are balanced, so f1_score should equal the zero/one
    # score)
    f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="f1_weighted", cv=5)
    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
def run():
    tr_data = np.loadtxt('../new/TRAIN_LRFORMAT.txt')
    te_data = np.loadtxt('../new/TEST_LRFORMAT.txt')
    tr_x = tr_data[:, 1:]
    tr_y = tr_data[:, 0]
    te_x = te_data[:, 1:]

    lr = LogisticRegression(
        solver='liblinear',
        multi_class='ovr',
        class_weight='balanced',
        penalty='l2',
        n_jobs=-1)
    # te_pred = lr.predict_proba(te_x)

    cv = 10
    scores = cross_val_score(lr, tr_x, tr_y, cv=cv, scoring='accuracy')
    print(str(scores))
    # np.savetxt('result/te_lr.txt', te_pred)
def rmse_cv(model, X, y):
    # `scorer` is assumed to be defined elsewhere at module level
    return (cross_val_score(model, X, y, scoring=scorer)).mean()
def baseline_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    # print train_data.info()
    # print train_data.describe()
    # display_data(train_data)
    # display_with_process(train_data)
    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the raw training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the model feature columns
    train_np = train_data.as_matrix()  # convert to numpy matrix

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction!=y_test)]),columns=list(train_data.columns)[1:])
    #=cv_error.to_csv(r'error.csv',index=True)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict the test data'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(),
                                'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv', index=False)

    # clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    # print cross_validation.cross_val_score(clf, X, y, cv=5)
    # baseline SVM score: 0.78947
def baseline_logisticRegression_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data, 'process_train_data')  # preprocess the raw training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the model feature columns
    train_np = train_data.as_matrix()  # convert to numpy matrix

    '''train the model'''
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    print pd.DataFrame({'columns': list(train_data.columns[1:]), 'coef_': list(model.coef_.T)})

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0])

    '''collect the misclassified validation rows for inspection'''
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(
        process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)
    #=print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict the test data'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data', optimize=True)
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(),
                                'Survived': predict.astype(np.int32)})
    result.to_csv(r'logisticRegression_result/prediction.csv', index=False)'''
    # clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    # print cross_validation.cross_val_score(clf, X, y, cv=5)
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print u"data info:\n", train_data.info()
    print u"data description:\n", train_data.describe()
    # display_data(train_data)
    # display_with_process(train_data)
    process_data = fe_preprocessData(train_data, 'process_train_data')  # preprocess the raw training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the model feature columns
    train_np = train_data.as_matrix()  # convert to numpy matrix

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    '''predict the test data'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(),
                                'Survived': predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv', index=False)
    # clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    # print cross_validation.cross_val_score(clf, X, y, cv=5)
def stump(X, y):
    score = cross_val_score(LinearSVC(), X, y, cv=5, n_jobs=5,
                            scoring='average_precision')
    clf = LinearSVC()
    clf.fit(X, y)
    coef = clf.coef_[0, 0]
    inter = clf.intercept_[0]
    return np.mean(score), np.sign(coef), inter / np.abs(coef)
def run_cross_validation(self):
    features, labels, cv = self.getFeaturesLabel()
    scores = cross_validation.cross_val_score(self.clf, features, labels, cv=cv,
                                              scoring=mean_absolute_percentage_error_scoring,
                                              n_jobs=-1)
    print "cross validation scores: means, {}, std, {}, details, {}".format(
        np.absolute(scores.mean()), scores.std(), np.absolute(scores))
    return -np.absolute(scores.mean())
def build_random_forest_model(x_train, y_train):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(x_train, y_train.ravel())
    print "10-fold cross validation score is:"
    print np.mean(cross_val_score(rf_model, x_train, y_train, cv=10))
    return rf_model
def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross validation iterator of K folds
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by the score method
    # of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print ("Mean score: {0:.3f} (+/-{1:.3f})").format(np.mean(scores), sem(scores))
def hackathon_GBC_model(clf, train, features):
    clf.fit(train[features], train["Class"])
    probab_of_predict = clf.predict_proba(train[features])[:, 1]
    predict_train = clf.predict(train[features])
    cv_score = cross_val_score(clf, train[features], train["Class"], cv=5, scoring="roc_auc")

    print("----------------------Model performance-----------------------")
    print("Accuracy score: ", accuracy_score(train["Class"].values, predict_train))
    print("AUC: ", roc_auc_score(train["Class"], probab_of_predict))
    print("CV score: Mean - {}, Max - {}, Min - {}, Std - {}".format(
        np.mean(cv_score), np.max(cv_score), np.min(cv_score), np.std(cv_score)))

    Relative_Feature_importance = pd.Series(clf.feature_importances_, features).sort_values(ascending=False)
    Relative_Feature_importance.plot(kind='bar', title='Order of Feature Importance')
    plt.ylabel('Feature Importance')
    plt.show()
def print_metrics(clf):
    # scores = cross_validation.cross_val_score(clf, features, labels, cv=5, scoring='accuracy')
    # print 'Accuracy:', scores.mean()
    cv = cross_validation.StratifiedKFold(labels, n_folds=5)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(features[train], labels[train]).predict_proba(features[test])
        fpr, tpr, thresholds = metrics.roc_curve(labels[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('auc_sent.png')
def experiment(model_class, vectorizer, xval):
    name = model_class.__class__.__name__
    model = model_class.fit(X, y)
    model_weights = vectorizer.inverse_transform(model.coef_)[0]
    with open('weights.%s.txt' % name, 'w') as f:
        f.write('%s\t%f\n' % ('(intercept)', model.intercept_))
        f.writelines('%s\t%f\n' % k for k in model_weights.items())

    r2_scores = cross_validation.cross_val_score(model, X, y, scoring='r2', cv=xval)
    mae_scores = cross_validation.cross_val_score(model, X, y,
                                                  scoring='mean_absolute_error', cv=xval)
    print '-' * 80
    print 'r2\t%.4f\t%s' % (np.mean(r2_scores), name)
    print 'mae\t%.4f\t%s' % (np.mean(mae_scores), name)
def calculate(X, y):
    best_p, best_score = 0, -float('inf')
    kf = KFold(len(y), n_folds=5, shuffle=True, random_state=42)
    for p in numpy.linspace(1, 10, num=200):
        knr = KNeighborsRegressor(n_neighbors=5, weights='distance', p=p)
        score = max(cross_val_score(knr, X, y, cv=kf, scoring='mean_squared_error'))
        if score > best_score:
            best_score = score
            best_p = p
    return best_p, best_score
def calculate(X, y):
    kf = KFold(len(y), n_folds=5, shuffle=True, random_state=42)
    best_k, best_score = 0, 0
    for k in xrange(1, 51):
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, X, y, cv=kf, scoring='accuracy').mean()
        if score > best_score:
            best_score = score
            best_k = k
    return best_k, best_score
def calculate(X, y, threshold):
    best_t, best_score = 0, -float('inf')
    kf = KFold(len(y), n_folds=5, random_state=1, shuffle=True)
    for t in xrange(1, 51):
        clf = RandomForestRegressor(n_estimators=t, random_state=1)
        score = np.mean(cross_val_score(clf, X, y, cv=kf, scoring='r2'))
        if score > threshold:
            return t
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation

    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', LogisticRegression())])
    cv = cross_validation.LeaveOneOut(len(features))
    scores = cross_validation.cross_val_score(clf, features, labels, cv=cv)
    return scores.mean()
def regression_with_GBR(X_train, y_train, X_test, y_test, parmsFromNormalization,
                        params={'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
                                'learning_rate': 0.01, 'loss': 'ls'}):
    # GradientBoostingRegressor
    gfr = GradientBoostingRegressor(**params)
    gfr.fit(X_train, y_train)
    y_pred_gbr = gfr.predict(X_test)
    print_regression_model_summary("GBR", y_test, y_pred_gbr, parmsFromNormalization)
    print_feature_importance(X_test, y_test, gfr.feature_importances_)

    # cross validation (not sure this makes sense for regression)
    # http://scikit-learn.org/stable/modules/cross_validation.html
    # gfr = GradientBoostingRegressor(**params)
    # scores = cross_validation.cross_val_score(gfr, X_train, y_train, cv=5)
    # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return y_pred_gbr
def crossValidateModel(self):
    (label_vector, input_vector) = loadData(self.featureFile)

    kFold = 5
    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    scores = cross_validation.cross_val_score(kNNClassifier, input_vector,
                                              label_vector, cv=kFold)
    print("\n----- k-fold Cross Validation -----")
    print(scores)
    print("Average: ", sum(scores) / len(scores))
def cv(self, estimator_params):
    if self.ptypes != None:
        if self.ptypes == 'int':
            for key in estimator_params.keys():
                estimator_params[key] = int(estimator_params[key])
        else:
            for key in self.ptypes.keys():
                estimator_params[key] = self.ptypes[key](estimator_params[key])
    if self.pfixed != None:
        for key in self.pfixed.keys():
            estimator_params[key] = self.pfixed[key]
    if self.plist != None:
        for key in self.plist.keys():
            estimator_params[key] = self.plist[key][int(estimator_params[key]) - 1]
    self.estimator.set_params(**estimator_params)
    v = self.estimator.evaluate(self.cv_params['X'])
    return v
    # self.cv_params['estimator'] = estim
    # cvscore = cross_val_score(**self.cv_params)
    # return numpy.mean(cvscore)

# --------------------------------------------- // ---------------------------------------------
def test_iris(self):
    dataset = load_iris()
    score = np.mean(cross_val_score(
        DecisionTreeClassifier(tree_type=self.tree_type),
        dataset.data, dataset.target, cv=10))
    print('iris: tree_type: {}, score = {}'.format(self.tree_type, score))
    self.assertTrue(score > 0.8)
def test_breast_cancer(self):
    dataset = load_breast_cancer()
    score = np.mean(cross_val_score(
        DecisionTreeClassifier(tree_type=self.tree_type),
        dataset.data, dataset.target, cv=10))
    print('breast_cancer: tree_type: {}, score = {}'.format(self.tree_type, score))
    self.assertTrue(score > 0.8)
def test_iris(self):
    dataset = load_iris()
    score = np.mean(cross_val_score(
        DecisionTreeClassifier(tree_type=self.tree_type),
        dataset.data, dataset.target, cv=10))
    self.assertTrue(score > 0.8)
    print('iris: tree_type: {}, score = {}'.format(self.tree_type, score))
def test_breast_cancer(self):
    dataset = load_breast_cancer()
    score = np.mean(cross_val_score(
        DecisionTreeClassifier(tree_type=self.tree_type),
        dataset.data, dataset.target, cv=10))
    self.assertTrue(score > 0.8)
    print('breast_cancer: tree_type: {}, score = {}'.format(self.tree_type, score))
def cv(model, X, y, n_iter=5, test_size=0.3):
    split = cross_validation.ShuffleSplit(
        len(X), n_iter=n_iter, test_size=test_size,
    )
    return cross_validation.cross_val_score(model, X, y, cv=split,
                                            scoring='accuracy', n_jobs=-1)
def random_forest_classify(my_train_data, my_train_label, my_test_data, estimators):
    clf = RandomForestClassifier(n_estimators=estimators)
    scores = cross_validation.cross_val_score(clf, my_train_data, my_train_label, cv=5)
    print("random forest(%d) accuracy: %0.3f (+/- %0.3f)" % (estimators, scores.mean(), scores.std() * 2))
    clf.fit(my_train_data, my_train_label)
    my_test_label = clf.predict(my_test_data)
    file_name = "random_forest_%d.csv" % estimators
    save_data(my_test_label, file_name)
def gradient_boosting_classify(my_train_data, my_train_label, my_test_data, estimators):
    clf = GradientBoostingClassifier(n_estimators=estimators)
    scores = cross_validation.cross_val_score(clf, my_train_data, my_train_label, cv=5)
    print("gradient boosting(%d) accuracy: %0.3f (+/- %0.3f)" % (estimators, scores.mean(), scores.std() * 2))
    clf.fit(my_train_data, my_train_label)
    my_test_label = clf.predict(my_test_data)
    file_name = "gradient_boosting_%d.csv" % estimators
    save_data(my_test_label, file_name)
def svc_classify(my_train_data, my_train_label, my_test_data, svc_c):
    # clf = svm.SVC(C=svc_c, kernel='poly')
    clf = svm.SVC(C=svc_c)
    scores = cross_validation.cross_val_score(clf, my_train_data, my_train_label, cv=5)
    print("svc(C=%.1f) accuracy: %0.3f (+/- %0.3f)" % (svc_c, scores.mean(), scores.std() * 2))
    clf.fit(my_train_data, my_train_label)
    my_test_label = clf.predict(my_test_data)
    file_name = "svc_%.1f.csv" % svc_c
    save_data(my_test_label, file_name)
def cross_validate(self):
    clf = self._clf[self._learner]
    (X_train, y_train) = self._train_data
    print " + Cross-validating classifier (learner = %s)..." % self._learner,
    stdout.flush()
    scores = cross_val_score(
        self._clf[self._learner], X_train, y_train,
        scoring=make_scorer(roc_auc_score), cv=3)
    print "done.\n * Scores: %r" % scores
def First_Model_SVR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"C": [1e-2, 1e-1, 1e0, 1e1, 1e2], "gamma": np.logspace(-4, 2, 6)}
    svr_Tuned = GridSearchCV(SVR(kernel='rbf', gamma=0.1, tol=0.005), cv=5,
                             param_grid=Grid_Dict, scoring="mean_absolute_error")
    svr_Tuned.fit(Scaled_Input_Data, Output_Data)
    SVR_MSE = SVR(kernel='rbf', C=svr_Tuned.best_params_['C'],
                  gamma=svr_Tuned.best_params_['gamma'], tol=0.01)
    SVR_Time = time.time() - T0
    print('The computational time of Radial based Support Vector Regression for ', n, ' examples is: ', SVR_Time)
    MSEs_SVR = cross_validation.cross_val_score(SVR_MSE, Scaled_Input_Data, Output_Data,
                                                cv=cross_validation.LeaveOneOut(n),
                                                scoring="mean_absolute_error")
    MeanMSE_SVR = np.mean(list(MSEs_SVR))
    print('The average MSE of Radial based Support Vector Regression for ', n, ' examples is: ', (-1*MeanMSE_SVR))
    return(MeanMSE_SVR, svr_Tuned)
def RF_Model(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    RFModel = RandomForestRegressor()
    RFModel.fit(Scaled_Input_Data, Output_Data)
    RF_Time = time.time() - T0
    print('The computational time of Random Forest Regression for ', n, ' examples is: ', RF_Time)
    MSEs_RF = cross_validation.cross_val_score(RFModel, Scaled_Input_Data, Output_Data,
                                               cv=cross_validation.LeaveOneOut(n),
                                               scoring="mean_absolute_error")
    MeanMSE_RF = np.mean(list(MSEs_RF))
    print('The average MSE of Random Forest Regression for ', n, ' examples is: ', (-1*MeanMSE_RF))
    return(MeanMSE_RF, RFModel)
def hyperopt_train_test(params):
    clf = rxn_estimator(np.float32(params[0]), np.float32(params[1]),
                        np.int(params[2]), other_param_dict)
    return cross_val_score(clf, X, y, cv=3).mean()