The following 21 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cross_validation.cross_val_predict().
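Before the project examples, here is a minimal self-contained sketch of the basic call pattern. The dataset and classifier are illustrative choices, not taken from the examples below; note that sklearn.cross_validation was deprecated in scikit-learn 0.18 in favor of sklearn.model_selection.

from sklearn.cross_validation import cross_val_predict
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

# Each sample is predicted by the model trained on the folds that did
# not contain it, so every row gets exactly one out-of-sample prediction.
y_pred = cross_val_predict(LogisticRegression(), X, y, cv=5)
print(accuracy_score(y, y_pred))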
def _cv_r0(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def test_model(self, n_folds=10):
    """
    Test the model with stratified K-fold cross-validation
    (Stratified K-folds cross-validating).
    """
    logging.debug("testing model with {}-folds CV".format(n_folds))

    model = self.init_model()
    X = self.data.data
    y = self.data.target

    cv = cross_validation.StratifiedKFold(y, n_folds=n_folds, random_state=42)

    t0 = time()
    y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=-1, cv=cv)
    t = time() - t0

    print("=" * 52)
    print("time cost: {}".format(t))
    print()
    print("confusion matrix\n", metrics.confusion_matrix(y, y_pred))
    print()
    print("\t\taccuracy: {}".format(metrics.accuracy_score(y, y_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(metrics.classification_report(y, y_pred))
def get_logistic_regression_coefs_l2(self, category, clf=RidgeClassifierCV()):
    '''
    Computes l2-penalized logistic regression score.

    Parameters
    ----------
    category : str
        Category name to score.

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    from sklearn.cross_validation import cross_val_predict
    y = self._get_mask_from_category(category)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y)
    y_hat = cross_val_predict(clf, X, y)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    return clf.coef_[0], acc, baseline
def get_logistic_regression_coefs_l1(self, category,
                                     clf=LassoCV(alphas=[0.1, 0.001], max_iter=10000, n_jobs=-1)):
    '''
    Computes l1-penalized logistic regression score.

    Parameters
    ----------
    category : str
        Category name to score.

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    from sklearn.cross_validation import cross_val_predict
    y = self._get_mask_from_category(category)
    y_continuous = self._get_continuous_version_boolean_y(y)
    # X = TfidfTransformer().fit_transform(self._X)
    X = self._X

    clf.fit(X, y_continuous)
    y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    clf.fit(X, y_continuous)
    return clf.coef_, acc, baseline
def fit(self, xy_file, fname_out):
    """
    Only the best grid result is saved for now; saving all grid results
    is left for later.
    """
    df = read_csv(xy_file)
    X = df['X'].values
    y = df['y'].values

    super().fit(X, y)
    yp = cross_validation.cross_val_predict(self.best_estimator_, X, y)

    m_idx = pd.MultiIndex.from_product([['yp'], df['y'].columns])
    yp_df = pd.DataFrame(yp, index=df.index, columns=m_idx)
    df_out = pd.concat([df, yp_df], axis=1)
    df_out.to_csv(fname_out)

    return self
def cross_val_predict(self, fname_out=None):
    """
    Run cross-validated prediction with the best estimator and save
    the predicted values.
    """
    yp = cross_validation.cross_val_predict(self.best_estimator_, self.X, self.y)

    idx = pd.MultiIndex.from_product([['yp'], self.df['y'].columns])
    yp_df = pd.DataFrame(yp, index=self.df.index, columns=idx)
    df_out_org = self.df.merge(yp_df, left_index=True, right_index=True)
    self.df_out = DataFrame(df_out_org[["X", "y", "yp", "param"]])
    # df_out = pd.concat([self.df, yp_df], axis=1)
    self.df_out.to_csv_excel('_out', self.fname, fname_out)

    return yp
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv_Ridge_BIKE(A_list, yV, XX=None, alpha=0.5, n_folds=5, n_jobs=-1, grid_std=None):
    clf = binary_model.BIKE_Ridge(A_list, XX, alpha=alpha)
    ln = A_list[0].shape[0]  # ln is the number of molecules.
    kf_n = cross_validation.KFold(ln, n_folds=n_folds, shuffle=True)

    AX_idx = np.array([list(range(ln))]).T
    yV_pred = cross_validation.cross_val_predict(clf, AX_idx, yV, cv=kf_n, n_jobs=n_jobs)

    print('The prediction output using cross-validation is given by:')
    jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=shuffle)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def _cv_LOO_r0(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross-validation (n_folds equals the number of samples)
    is performed so as to generate prediction output for all input molecules.
    """
    n_folds = xM.shape[0]

    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cross_predict(feat, f_name, X=X, y=y):
    if os.name == 'nt':
        n_jobs = 1
    else:
        n_jobs = -1

    # Classifiers
    # clf_1 = MultinomialNB(alpha=5)
    clf_2 = LinearSVC(C=0.02)

    # Cross-validation (CV)
    # This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
    # which returns stratified randomized folds. The folds are made by preserving
    # the percentage of samples for each class.
    #
    # Note: like the ShuffleSplit strategy, stratified random splits do not guarantee
    # that all folds will be different, although this is still
    # very likely for sizeable datasets.
    #
    # Passing this cv to cross_val_predict will raise
    # ValueError: cross_val_predict only works for partitions
    #
    # i.e., the cv generator must yield non-overlapping folds that partition the data.
    # cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)

    # This cross-validation object is a variation of KFold that returns stratified folds.
    # The folds are made by preserving the percentage of samples for each class.
    cv = cross_validation.StratifiedKFold(y, n_folds=5, random_state=42)

    model = Pipeline([('feat', feat), ('clf', clf_2)])

    t0 = time()
    y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=n_jobs, cv=cv)
    t = time() - t0

    print("=" * 20, f_name, "=" * 20)
    print("time cost: {}".format(t))
    # print("y_predict: {}".format(y_pred))
    print()
    print('confusion matrix:\n', confusion_matrix(y, y_pred))
    print()
    print('\t\taccuracy: {}'.format(accuracy_score(y, y_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(classification_report(y, y_pred))

# Tests
# Baseline feature (tfidf)
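As a follow-up to the comments in the example above, here is a small self-contained sketch (synthetic data, illustrative only) of the ValueError that the legacy cross_val_predict raises when the cv object's test sets do not partition the data:

import numpy as np
from sklearn import cross_validation
from sklearn.svm import LinearSVC

X = np.random.RandomState(0).rand(20, 3)
y = np.array([0, 1] * 10)

# StratifiedShuffleSplit draws random, possibly overlapping test sets,
# so its folds are not a partition of the samples.
cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
try:
    cross_validation.cross_val_predict(LinearSVC(), X, y, cv=cv)
except ValueError as e:
    print(e)  # cross_val_predict only works for partitions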
def cv_BIKE_Ridge(A_list, yV, alpha=0.5, XX=None, n_folds=5, n_jobs=-1, grid_std=None):
    clf = binary_model.BIKE_Ridge(A_list, XX, alpha=alpha)
    ln = A_list[0].shape[0]  # ln is the number of molecules.
    kf_n = cross_validation.KFold(ln, n_folds=n_folds, shuffle=True)

    AX_idx = np.array([list(range(ln))]).T
    yV_pred = cross_validation.cross_val_predict(clf, AX_idx, yV, cv=kf_n, n_jobs=n_jobs)

    print('The prediction output using cross-validation is given by:')
    jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def multireg(self, Xtrain, ytrain, Xtest, ytest):
    self.normalize(Xtrain)
    '''
    # polynomial try
    poly = PolynomialFeatures(degree=2)
    Xtrain = poly.fit_transform(Xtrain)
    Xtest = poly.fit_transform(Xtest)
    '''
    # normal clf fit
    clf = linear_model.LinearRegression()
    clf.fit(Xtrain, ytrain)
    coefficients = clf.coef_
    print("coefficients:", coefficients)
    print("intercept:", clf.intercept_)
    print("train score", clf.score(Xtrain, ytrain))
    print("test score", clf.score(Xtest, ytest))

    # manually calculate train accuracy
    train_results = clf.predict(Xtrain)
    print("first x:", Xtrain[0])
    print("first result:", train_results[0])
    correct = 0
    for i in range(len(train_results)):
        if round(train_results[i], 1) == round(ytrain[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print("train accuracy: ", accuracy * 100, "%")

    # cross validation
    score = cross_validation.cross_val_score(clf, Xtrain, ytrain, scoring='mean_squared_error', cv=5)
    print("cross validation score: ", score)
    predict = cross_val_predict(clf, Xtrain, ytrain, cv=5)
    correct = 0
    for i in range(len(predict)):
        if round(predict[i], 1) == round(ytrain[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print("cross validation accuracy: ", accuracy * 100, "%")

    # manually calculate test accuracy
    self.normalize(Xtest)
    results = clf.predict(Xtest)
    correct = 0
    for i in range(len(results)):
        if round(results[i], 1) == round(ytest[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytest)
    print("test accuracy: ", accuracy * 100, "%")

    return coefficients
def lasso_multireg(self, Xtrain, ytrain, Xtest, ytest):
    self.normalize(Xtrain)
    clf = linear_model.Lasso(alpha=0.5)
    clf.fit(Xtrain, ytrain)
    coefficients = clf.coef_
    print("coefficients: ", coefficients)
    print("train score", clf.score(Xtrain, ytrain))
    print("test score", clf.score(Xtest, ytest))

    # manually calculate train accuracy
    train_results = clf.predict(Xtrain)
    correct = 0
    for i in range(len(train_results)):
        if round(train_results[i], 1) == round(ytrain[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print("train accuracy: ", accuracy * 100, "%")

    # cross validation
    predict = cross_val_predict(clf, Xtrain, ytrain, cv=5)
    correct = 0
    for i in range(len(predict)):
        if round(predict[i], 1) == round(ytrain[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print("cross validation accuracy: ", accuracy * 100, "%")

    # manually calculate test accuracy
    self.normalize(Xtest)
    results = clf.predict(Xtest)
    correct = 0
    for i in range(len(results)):
        # print(round(results[i], 1), round(ytest[i], 1))
        if round(results[i], 1) == round(ytest[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytest)
    print("test accuracy: ", accuracy * 100, "%")

    return coefficients
def _generate_cross_val_predict_test(X, y, est, pd_est, must_match):
    def test(self):
        self.assertEqual(
            hasattr(est, 'predict'),
            hasattr(pd_est, 'predict'))

        if not hasattr(est, 'predict'):
            return

        pd_y_hat = pd_cross_val_predict(pd_est, X, y)
        self.assertTrue(isinstance(pd_y_hat, pd.Series))
        self.assertTrue(pd_y_hat.index.equals(X.index))

        if must_match:
            y_hat = cross_val_predict(est, X.as_matrix(), y.values)
            np.testing.assert_allclose(pd_y_hat, y_hat)

    return test
def test_cross_val_predict():
    boston = load_boston()
    X, y = boston.data, boston.target
    cv = cval.KFold(len(boston.target))

    est = Ridge()

    # Naive loop (should be same as cross_val_predict):
    preds2 = np.zeros_like(y)
    for train, test in cv:
        est.fit(X[train], y[train])
        preds2[test] = est.predict(X[test])

    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_array_almost_equal(preds, preds2)

    preds = cval.cross_val_predict(est, X, y)
    assert_equal(len(preds), len(y))

    cv = cval.LeaveOneOut(len(y))
    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_equal(len(preds), len(y))

    Xsp = X.copy()
    Xsp *= (Xsp > np.median(Xsp))
    Xsp = coo_matrix(Xsp)
    preds = cval.cross_val_predict(est, Xsp, y)
    assert_array_almost_equal(len(preds), len(y))

    preds = cval.cross_val_predict(KMeans(), X)
    assert_equal(len(preds), len(y))

    def bad_cv():
        for i in range(4):
            yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])
    assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv())
def test_cross_val_predict_input_types():
    clf = Ridge()
    # Smoke test
    predictions = cval.cross_val_predict(clf, X, y)
    assert_equal(predictions.shape, (10,))

    # test with multioutput y
    predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_equal(predictions.shape, (10, 2))

    predictions = cval.cross_val_predict(clf, X_sparse, y)
    assert_array_equal(predictions.shape, (10,))

    # test with multioutput y
    predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_array_equal(predictions.shape, (10, 2))

    # test with X and y as list
    list_check = lambda x: isinstance(x, list)
    clf = CheckingClassifier(check_X=list_check)
    predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    predictions = cval.cross_val_predict(clf, X, y.tolist())

    # test with 3d X
    X_3d = X[:, :, np.newaxis]
    check_3d = lambda x: x.ndim == 3
    clf = CheckingClassifier(check_X=check_3d)
    predictions = cval.cross_val_predict(clf, X_3d, y)
    assert_array_equal(predictions.shape, (10,))
def test_cross_val_predict_pandas():
    # check that cross_val_predict doesn't destroy pandas dataframes
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_predict(clf, X_df, y_ser)
def test_cross_val_predict_sparse_prediction():
    # check that cross_val_predict gives the same result for sparse and dense input
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          return_indicator=True,
                                          random_state=1)
    X_sparse = csr_matrix(X)
    y_sparse = csr_matrix(y)
    classif = OneVsRestClassifier(SVC(kernel='linear'))
    preds = cval.cross_val_predict(classif, X, y, cv=10)
    preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
    preds_sparse = preds_sparse.toarray()
    assert_array_almost_equal(preds_sparse, preds)
def validate(self, features, labels, number_folds):
    """
    Compute a model's performance metrics based on the k-fold cross-validation technique.

    Parameters
    ----------
    features: array-like of shape = [number_samples, number_features]
        The validation input samples.
    labels: array-like of shape = [number_samples] or [number_samples, number_outputs]
        The target values (class labels in classification).
    number_folds: int
        The number of folds for the k-fold cross-validation.
        If 0, compute metrics without folds.
        If > 0, compute metrics with n folds, n = number_folds.

    Returns
    ----------
    accuracy: float
        The accuracy of the model based on its confusion matrix.
    precision: float
        The precision of the model based on its confusion matrix.
    sensitivity: float
        The sensitivity of the model based on its confusion matrix.
    specificity: float
        The specificity of the model based on its confusion matrix.
    kappa: float
        Cohen's kappa of the model based on its confusion matrix.
    """
    if number_folds == 0:
        predictions = self.model.predict(features)
    else:
        predictions = cross_val_predict(self.model, features, labels, cv=number_folds)

    matrix = confusion_matrix(labels, predictions)
    sum_columns = numpy.sum(matrix, 0)
    sum_rows = numpy.sum(matrix, 1)
    diagonal_sum = numpy.trace(matrix)
    total_sum = numpy.sum(sum_rows)

    accuracy = diagonal_sum / total_sum

    temp_precision = []
    temp_sensitivity = []
    temp_specificity = []
    for i in range(len(matrix)):
        temp_precision.append(matrix[i][i] / sum_columns[i])
        temp_sensitivity.append(matrix[i][i] / sum_rows[i])
        temp_reduced_sum = total_sum - sum_rows[i] - sum_columns[i] + matrix[i][i]
        temp_specificity.append(temp_reduced_sum / (temp_reduced_sum + sum_columns[i] - matrix[i][i]))

    precision = sum(temp_precision * sum_rows) / total_sum
    sensitivity = sum(temp_sensitivity * sum_rows) / total_sum
    specificity = sum(temp_specificity * sum_rows) / total_sum

    kappa_sum = sum(sum_rows * sum_columns)
    kappa_numerator = (total_sum * diagonal_sum) - kappa_sum
    kappa_denominator = (total_sum * total_sum) - kappa_sum
    kappa = kappa_numerator / kappa_denominator

    return accuracy, precision, sensitivity, specificity, kappa
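The kappa computed above is Cohen's kappa: with N = total_sum, trace = diagonal_sum, and S = sum(sum_rows * sum_columns), it equals (N * trace - S) / (N^2 - S). A small standalone check with made-up labels, assuming sklearn.metrics.cohen_kappa_score is available (scikit-learn 0.17+):

import numpy
from sklearn.metrics import cohen_kappa_score, confusion_matrix

labels      = [0, 0, 1, 1, 2, 2, 2, 0]
predictions = [0, 1, 1, 1, 2, 0, 2, 0]

matrix = confusion_matrix(labels, predictions)
sum_rows, sum_columns = numpy.sum(matrix, 1), numpy.sum(matrix, 0)
total_sum = numpy.sum(matrix)
kappa_sum = numpy.sum(sum_rows * sum_columns)
# kappa = (N * trace - S) / (N^2 - S)
kappa = (total_sum * numpy.trace(matrix) - kappa_sum) / float(total_sum ** 2 - kappa_sum)

assert abs(kappa - cohen_kappa_score(labels, predictions)) < 1e-12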
def ridge_multireg(self, Xtrain, ytrain, Xtest, ytest):
    self.normalize(Xtrain)
    '''
    # polynomial try
    poly = PolynomialFeatures(degree=2)
    Xtrain = poly.fit_transform(Xtrain)
    Xtest = poly.fit_transform(Xtest)
    '''
    # normal clf try
    clf = linear_model.Ridge(alpha=10000)
    clf.fit(Xtrain, ytrain)
    coefficients = clf.coef_
    print("train score", clf.score(Xtrain, ytrain))
    print("test score", clf.score(Xtest, ytest))

    # manually calculate train accuracy
    train_results = clf.predict(Xtrain)
    correct = 0
    for i in range(len(train_results)):
        if round(train_results[i], 1) == round(ytrain[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print("train accuracy: ", accuracy * 100, "%")

    # cross validation
    score = cross_validation.cross_val_score(clf, Xtrain, ytrain, scoring='mean_squared_error', cv=5)
    print("cross validation score: ", score)
    '''
    predict = cross_val_predict(clf, Xtrain, ytrain, cv=5)
    correct = 0
    for i in range(len(predict)):
        if round(predict[i]) == round(ytrain[i]):
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print("cross validation accuracy: ", accuracy * 100, "%")
    '''

    # manually calculate test accuracy
    self.normalize(Xtest)
    results = clf.predict(Xtest)
    correct = 0
    for i in range(len(results)):
        if round(results[i], 1) == round(ytest[i], 1):
            correct += 1
    accuracy = correct * 1.0 / len(ytest)
    print("test accuracy: ", accuracy * 100, "%")

    return coefficients