The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cross_validation.KFold().
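Before the project examples, here is a minimal, self-contained sketch of the usage pattern most of them share: construct the splitter with the number of samples, then iterate over (train, test) index pairs. The data below is random and only for illustration; note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20.

import numpy as np
from sklearn.cross_validation import KFold  # deprecated API; removed in scikit-learn 0.20

X = np.random.rand(10, 3)
y = np.random.randint(0, 2, size=10)

# The old-style splitter is bound to the dataset size at construction time
# and is iterated over directly.
kf = KFold(len(y), n_folds=5, shuffle=True, random_state=0)
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # fit / evaluate a model on each fold here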
def _cv_r0( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
def getConfidenceScores(features_train, labels_train, C):
    train_confidence = []
    # confidence scores for training data are computed using K-fold cross validation
    kfold = KFold(features_train.shape[0], n_folds=10)
    for train_index, test_index in kfold:
        X_train, X_test = features_train[train_index], features_train[test_index]
        y_train, y_test = labels_train[train_index], labels_train[test_index]

        # train classifier for the subset of train data
        m = SVM.train(X_train, y_train, c=C, k="linear")

        # predict confidence for test data and append it to list
        conf = m.decision_function(X_test)
        for x in conf:
            train_confidence.append(x)

    return np.array(train_confidence)

# save pos scores
def test_cv():
    """Simple CV check."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    cv = KFold(X.shape[0], 5)

    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
    # check that it returns 5 scores
    scores = cross_val_score(glm_normal, X, y, cv=cv)
    assert_equal(len(scores), 5)

    param_grid = [{'alpha': np.linspace(0.01, 0.99, 2)},
                  {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01), 10,
                                             base=np.exp(1))}]
    glmcv = GridSearchCV(glm_normal, param_grid, cv=cv)
    glmcv.fit(X, y)
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.lda
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)

    accuracy = 0.
    try:
        for train, test in kf:
            lda = sklearn.lda.LDA()

            if len(y.shape) == 1 or y.shape[1] == 1:
                lda.fit(X[train], y[train])
            else:
                lda = OneVsRestClassifier(lda)
                lda.fit(X[train], y[train])

            predictions = lda.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
    except LinAlgError as e:
        self.logger.warning("LDA failed: %s Returned 0 instead!" % e)
        return np.NaN
    except ValueError as e:
        self.logger.warning("LDA failed: %s Returned 0 instead!" % e)
        return np.NaN
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.naive_bayes
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)

    accuracy = 0.
    for train, test in kf:
        nb = sklearn.naive_bayes.GaussianNB()

        if len(y.shape) == 1 or y.shape[1] == 1:
            nb.fit(X[train], y[train])
        else:
            nb = OneVsRestClassifier(nb)
            nb.fit(X[train], y[train])

        predictions = nb.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.tree
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)

    accuracy = 0.
    for train, test in kf:
        random_state = check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)

        if len(y.shape) == 1 or y.shape[1] == 1:
            tree.fit(X[train], y[train])
        else:
            tree = OneVsRestClassifier(tree)
            tree.fit(X[train], y[train])

        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.tree
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)

    accuracy = 0.
    for train, test in kf:
        random_state = check_random_state(42)
        node = sklearn.tree.DecisionTreeClassifier(
            criterion="entropy", max_depth=1, random_state=random_state,
            min_samples_split=1, min_samples_leaf=1, max_features=None)

        if len(y.shape) == 1 or y.shape[1] == 1:
            node.fit(X[train], y[train])
        else:
            node = OneVsRestClassifier(node)
            node.fit(X[train], y[train])

        predictions = node.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.tree
    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)

    accuracy = 0.
    for train, test in kf:
        random_state = check_random_state(42)
        node = sklearn.tree.DecisionTreeClassifier(
            criterion="entropy", max_depth=1, random_state=random_state,
            min_samples_split=1, min_samples_leaf=1, max_features=1)

        if len(y.shape) == 1 or y.shape[1] == 1:
            node.fit(X[train], y[train])
        else:
            node = OneVsRestClassifier(node)
            node.fit(X[train], y[train])

        predictions = node.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def rede_neural(X, y):
    print("Iniciando treinamento da Rede Neural")
    X2 = normalize(X)

    clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh',
                        algorithm='adam', alpha=1e-5, learning_rate='constant',
                        tol=1e-8, learning_rate_init=0.0002,
                        early_stopping=True, validation_fraction=0.2)

    kf = KFold(len(y), n_folds=3)
    i = 0
    for train, test in kf:
        start = time.time()
        i = i + 1
        print("Treinamento", i)
        # split the dataset into train and test
        # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1)
        X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]
        # fit
        clf.fit(X_train, y_train)
        print("score:", clf.score(X_test, y_test), "(", (time.time()-start)/60.0, "minutos )")

    return clf
def run_cross_validation_create_models(cnn, nfolds, submission_version):
    from sklearn.cross_validation import KFold
    files = glob.glob(INPUT_PATH + "*/*.jpg")
    additional_files = glob.glob(INPUT_PATH_ADD + "*/*.jpg")
    kf = KFold(len(files), n_folds=nfolds, shuffle=True, random_state=get_random_state(cnn))
    num_fold = 0
    sum_score = 0
    print('Len of additional files: {}'.format(len(additional_files)))
    for train_index, test_index in kf:
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        score = train_single_model(cnn, num_fold, train_index, test_index, files,
                                   additional_files, submission_version)
        sum_score += score

    print('Avg loss: {}'.format(sum_score/nfolds))
def cross_validation_accuracy(clf, X, labels, k):
    """
    Compute the average testing accuracy over k folds of cross-validation.
    You can use sklearn's KFold class here (no random seed, and no shuffling
    needed).

    Params:
        clf......A LogisticRegression classifier.
        X........A csr_matrix of features.
        labels...The true labels for each instance in X
        k........The number of cross-validation folds.

    Returns:
        The average testing accuracy of the classifier
        over each fold of cross-validation.
    """
    ###TODO
    pass
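The example above leaves the body as a TODO stub. A minimal sketch of one possible implementation, following the docstring (old cross_validation.KFold signature, no shuffling, no random seed) and not taken from the original project, could look like this:

def cross_validation_accuracy(clf, X, labels, k):
    # One possible implementation of the stub above (not the original author's code).
    from sklearn.cross_validation import KFold
    from sklearn.metrics import accuracy_score
    import numpy as np

    accuracies = []
    for train_idx, test_idx in KFold(len(labels), n_folds=k):
        # csr_matrix supports row indexing with an index array
        clf.fit(X[train_idx], labels[train_idx])
        predictions = clf.predict(X[test_idx])
        accuracies.append(accuracy_score(labels[test_idx], predictions))
    return np.mean(accuracies)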
def make_kfold(target, feature):
    preds = []
    kf = KFold(len(target), n_folds=folds, shuffle=True)
    test_numbers = []
    for trains, tests in kf:
        test_numbers.append(tests)
        pred_list = []
        feature_list = word_vec.fit_transform([dict(Counter(feature[train])) for train in trains])
        target_list = [target[train] for train in trains]
        logreg.fit(feature_list, target_list)
        for test in tests:
            feature_dict = defaultdict(int)
            for f in word_vec.get_feature_names():
                feature_dict[f] = 0
            for key, value in dict(Counter(feature[test])).items():
                if key in feature_dict:
                    feature_dict[key] = value
            pred_list.append(feature_dict)
        preds.append(logreg.predict(word_vec.fit_transform(pred_list)))
    return preds, test_numbers
def eval_cv5(model, x, y):
    kf = KFold(len(y), n_folds=5)
    acc = np.array([])
    pre = np.array([])
    rec = np.array([])
    f1 = np.array([])
    for train_index, test_index in kf:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train)
        prediction = model.predict(x_test)
        evaluation = get_eval(prediction, y_test)
        acc = np.append(acc, np.array(evaluation[0]))
        pre = np.append(pre, np.array(evaluation[1]))
        rec = np.append(rec, np.array(evaluation[2]))
        f1 = np.append(f1, np.array(evaluation[3]))
    return acc.mean(), pre.mean(), rec.mean(), f1.mean()
def __init__(self, estimator_cls, parameter_grid, score_fns, nfolds=10,
             shuffle=False, seed=None, njobs=1, checkpoint_path=None):
    self.estimator_cls = estimator_cls
    self.parameter_grid = parameter_grid
    self.nfolds = nfolds
    self.seed = seed
    assert njobs == 1, "# jobs > 1 not supported."
    self.njobs = njobs
    assert _is_arraylike(score_fns)
    self.score_fns = score_fns
    self.checkpoint_path = checkpoint_path
    self.grid_scores = None
    self.kf = KFold(n_folds=self.nfolds, shuffle=shuffle, random_state=seed)
def cached_run(steps, X, y):
    step_identifier = ''

    # split data
    n = len(y)
    kf = KFold(n, _n_fold, random_state=_random_state)
    folded_data = [(X[train_index], y[train_index], X[test_index], y[test_index])
                   for train_index, test_index in kf]

    # last step is estimator, handle separately
    for step in steps[:-1]:
        step_identifier += "/%s" % _step_identifier(step)
        logger.info("Processing %s", step_identifier)
        folded_data = run_step_on_demand(step_identifier, step, folded_data)

    scores = []
    estimator = steps[-1]
    step_identifier += "/%s" % _step_identifier(estimator)
    for (X_train, y_train, X_test, y_test) in folded_data:
        estimator.fit(X_train, y_train)
        scores.append(estimator.score(X_test, y_test))

    score = np.mean(scores)
    logger.info("score of %s is %r", step_identifier, score)
    return score
def k_fold_sample_data_set(x, y, folds):
    """
    This function uses a k-fold approach as a re-sampling strategy
    :param x: numpy array - Includes the train data
    :param y: numpy array - Includes the actual value of each data sample
    :param folds: integer - The number of folds that splits the data set
    :return: list of lists - The training and test samples extracted from the training set
    """
    x_train_list, y_train_list, x_test_list, y_test_list = list(), list(), list(), list()
    try:
        kf = KFold(x.shape[0], n_folds=folds, shuffle=True)
        for train_index, test_index in kf:
            x_train_list.append(x[train_index])
            y_train_list.append(y[train_index])
            x_test_list.append(x[test_index])
            y_test_list.append(y[test_index])
        return x_train_list, y_train_list, x_test_list, y_test_list
    except AttributeError as e:
        print(e.args, "- Please, use numpy arrays as inputs")
        exit()
def run_example():
    data, target = _get_data()

    n_folds = 5
    accuracy = 0.0
    for (train_idx, test_idx) in KFold(n=len(data), n_folds=n_folds, shuffle=True):
        train_X = data[train_idx]
        train_y = target[train_idx]
        test_X = data[test_idx]
        test_y = target[test_idx]

        model = SGDClassifier()
        model.fit(train_X, train_y)
        predictions = model.predict(test_X)
        accuracy += accuracy_score(predictions, test_y)

    return accuracy / n_folds
def kfold_train_and_predict(X, Y, classifier, k=5, indices=None, features=None):
    if indices is None:
        indices = np.array(list(range(X.shape[0])))
    if features is None:
        features = np.array(list(range(X.shape[1])))
    kf = cross_validation.KFold(len(indices), n_folds=k)
    accurs = []
    for train, test in kf:
        train_ind = indices[train].astype("int")
        test_ind = indices[test].astype("int")
        classifier.fit(X[train_ind,:][:,features], Y[train_ind])
        accurs += [classifier.score(X[test_ind,:][:,features], Y[test_ind])]
    accurs = np.array(accurs)
    return np.mean(accurs), np.std(accurs)
def run_model(model, dtrain, predictor_var, target, scoring_method='mean_squared_error'):
    cv_method = KFold(len(dtrain), 5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target],
                                cv=cv_method, scoring=scoring_method)
    #print cv_scores, np.mean(cv_scores), np.sqrt((-1)*np.mean(cv_scores))

    dtrain_for_val = dtrain[dtrain['Year'] < 2000]
    dtest_for_val = dtrain[dtrain['Year'] > 1999]
    #cv_method = KFold(len(dtrain_for_val), 5)
    #cv_scores_2 = cross_val_score(model, dtrain_for_val[predictor_var], dtrain_for_val[target], cv=cv_method, scoring=scoring_method)
    #print cv_scores_2, np.mean(cv_scores_2)

    dtrain_for_val_ini = dtrain_for_val[predictor_var]
    dtest_for_val_ini = dtest_for_val[predictor_var]
    model.fit(dtrain_for_val_ini, dtrain_for_val[target])
    pred_for_val = model.predict(dtest_for_val_ini)
    #print math.sqrt(mean_squared_error(dtest_for_val['Footfall'], pred_for_val))
def getTestAndTrainingSet(X, y, K=10):
    N = len(X)
    CV = cross_validation.KFold(N, K, shuffle=True)
    k = 0
    for train_index, test_index in CV:
        # extract training and test set for current CV fold
        X_train = X[train_index,:]
        y_train = y[train_index,:]
        X_test = X[test_index,:]
        y_test = y[test_index,:]
        k += 1
        if (k == K):
            return (X_train, y_train), (X_test, y_test)
def cv(feature_dict, feature, polarity, folds):
    kfold = KFold(len(polarity), n_folds=folds)
    count, f1, recall, precision, accuracy = 0, 0, 0, 0, 0
    for train, test in kfold:
        LR = LogisticRegression()
        count += 1
        x = [(feature[i]) for i in train]
        y = [(polarity[i]) for i in train]
        LR.fit(scipy.sparse.vstack(x), (y))
        test_label = []
        answer_label = [(polarity[j]) for j in test]
        for j in test:
            query = feature[j]
            result = -1 if query.shape[1] != len(feature_dict) else predict(LR, query)
            test_label.append(int(result[0]))
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))
    return accuracy, precision, recall, f1
def cv(feature_dict, feature, polarity, folds):
    kfold = KFold(len(polarity), n_folds=folds)
    count, f1, recall, precision, accuracy = 0, 0, 0, 0, 0
    for train, test in kfold:
        LR = LogisticRegression()
        count += 1
        x = [(feature[i]) for i in train]
        y = [(polarity[i]) for i in train]
        LR.fit(scipy.sparse.vstack(x), (y))
        test_label = []
        answer_label = [(polarity[j]) for j in test]
        for j in test:
            query = feature[j]
            result = -1 if query.shape[1] != len(feature_dict) else predict(LR, query)
            test_label.append(result[1][1])
    pre, rec, thr = precision_recall_curve(answer_label, test_label)
    return pre, rec, thr
    return accuracy, precision, recall, f1
def cross_validate_Softmax(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold

    m = len(np.squeeze(Y))
    CGrid = [0.1, 0.03, 0.01, 0.003, 0.001, 3e-4, 1e-4, 3e-5, 1e-5]

    kf = KFold(m, n_folds=n_folds)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training Softmax: LAMBDA : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_Softmax(C, dataFile, X[train], Y[train], X[test], Y[test], \
                                           pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))

    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
def cross_validate_SoftMaxOnline(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold

    m = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]

    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training SoftMaxOnline: LAMBDA : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_SoftMaxOnline(C, dataFile, X[train], Y[train], X[test], Y[test], \
                                                 pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))

    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
def cross_validate_linearSVM(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold

    m = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]

    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print("[+] training linear SVM: C : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_linearSVM(C, dataFile, X[train], Y[train], X[test], Y[test], \
                                             pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))

    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
def cross_validate_Softmax(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold

    m = len(np.squeeze(Y))
    CGrid = [0.1, 0.03, 0.01, 0.003, 0.001, 3e-4, 1e-4, 3e-5, 1e-5]

    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print "[+] training Softmax: LAMBDA : %e, fold : %d" % (C, fold)
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_Softmax(C, dataFile, X[train], Y[train], X[test], Y[test], \
                                           pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))

    best_FoM_index = np.argmin(mean_FoMs)
    print "[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index]
    return CGrid[best_FoM_index]
def cross_validate_linearSVM(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    from sklearn.cross_validation import KFold

    m = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]

    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        fold = 1
        FoMs = []
        for train, test in kf:
            print "[+] training linear SVM: C : %e, fold : %d" % (C, fold)
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_linearSVM(C, dataFile, X[train], Y[train], X[test], Y[test], \
                                             pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
            fold += 1
        mean_FoMs.append(np.mean(FoMs))

    best_FoM_index = np.argmin(mean_FoMs)
    print "[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index]
    return CGrid[best_FoM_index]
def knn_cv(post_features, post_class, n_folds, n_neighbors, length_dataset=-1):
    if (length_dataset == -1):
        length_dataset = len(post_class)

    cv = KFold(n=length_dataset, n_folds=n_folds, shuffle=True)

    train_accuracy = []
    test_accuracy = []
    for train, test in cv:
        knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(post_features[train], post_class[train])
        train_accuracy.append(knn.score(post_features[train], post_class[train]))
        test_accuracy.append(knn.score(post_features[test], post_class[test]))

    # return (sum(train_accuracy)/n_folds), (sum(test_accuracy)/n_folds)
    return np.mean(train_accuracy), np.mean(test_accuracy)
def __init__( self, pdr, E_QC = "E_QC", Em = "Em", type_name = "Type",
              type_l = [1,2,3,4], disp = False, graph = False):
    # This parameter will be used in the run() function.
    self.type_l = type_l
    self.disp = disp
    self.graph = graph

    self.xMa = {}
    self.yVa = {}
    # self.kfa = {}
    for type_id in type_l:
        pdr_new = pdr[ pdr[ type_name] == type_id]
        self.xMa[type_id] = np.mat( pdr_new[ E_QC].values).T
        self.yVa[type_id] = np.mat( pdr_new[ Em].values).T
        # kfa[type_id] = cross_validation.KFold( np.shape(yVa[type_id])[0], n_folds=5, shuffle=True)
def _gs_SVC_r0( xM, yVc, params):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5 = cross_validation.KFold( xM.shape[0], n_folds=5, shuffle=True)
    gs = grid_search.GridSearchCV( clf, params, cv = kf5, n_jobs = -1)

    gs.fit( xM, yVc)

    return gs
def gs_SVC( xM, yVc, params, n_folds = 5):
    """
    Since classification is considered, we use yVc which includes digital values
    whereas yV can include float point values.
    """
    print(xM.shape, yVc.shape)

    clf = svm.SVC()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    kf5 = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV( clf, params, cv = kf5, n_jobs = -1)

    gs.fit( xM, yVc)

    return gs
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_folds = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge( A_list, XX)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.
    kf_n = cross_validation.KFold( ln, n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
def cv( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
def cv_Ridge_BIKE( A_list, yV, XX = None, alpha = 0.5, n_folds = 5, n_jobs = -1, grid_std = None):

    clf = binary_model.BIKE_Ridge( A_list, XX, alpha = alpha)
    ln = A_list[0].shape[0]  # ls is the number of molecules.
    kf_n = cross_validation.KFold( ln, n_folds=n_folds, shuffle=True)

    AX_idx = np.array([list(range( ln))]).T
    yV_pred = cross_validation.cross_val_predict( clf, AX_idx, yV, cv = kf_n, n_jobs = n_jobs)

    print('The prediction output using cross-validation is given by:')
    jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
def gs_BIKE_Ridge( A_list, yV, alphas_log = (1, -1, 9), X_concat = None, n_folds = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix.
    X is a concatened linear descriptors.
    If no X is used, X can be empty
    """
    clf = binary_model.BIKE_Ridge( A_list, X_concat)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0]  # ls is the number of molecules.
    kf_n = cross_validation.KFold( ln, n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
def cv( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=shuffle)
    yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
def _cv_LOO_r0( method, xM, yV, alpha, n_jobs = -1, grid_std = None, graph = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    n_folds = xM.shape[0]

    print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds)
    yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
def create_cv_id(target, n_folds_=5, cv_id_name=cv_id_name, seed=407):
    try:
        a = StratifiedKFold(target['target'], n_folds=n_folds_, shuffle=True, random_state=seed)
        cv_index = a.test_folds
        print 'Done StratifiedKFold'
    except:
        cv_index = np.empty(len(target))
        a = KFold(len(target), n_folds=n_folds_, shuffle=True, random_state=seed)
        for idx, i in enumerate(a):
            cv_index[i[1]] = idx
        cv_index = cv_index.astype(int)
        print 'Done Kfold'

    np.save(INPUT_PATH + cv_id_name, cv_index)
    return


######### Utils #########
def test_kfold_no_shuffle():
    # Manually check that KFold preserves the data ordering on toy datasets
    splits = iter(cval.KFold(4, 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1])
    assert_array_equal(train, [2, 3])

    train, test = next(splits)
    assert_array_equal(test, [2, 3])
    assert_array_equal(train, [0, 1])

    splits = iter(cval.KFold(5, 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 2])
    assert_array_equal(train, [3, 4])

    train, test = next(splits)
    assert_array_equal(test, [3, 4])
    assert_array_equal(train, [0, 1, 2])
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by Kfold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = cval.PredefinedSplit(folds)
    for train_ind, test_ind in ps:
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
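Note that the two examples above use the newer sklearn.model_selection.KFold interface (an n_splits argument plus an explicit .split() call) rather than the deprecated sklearn.cross_validation.KFold used by most other examples on this page. A minimal sketch contrasting the two call styles, on a toy array X (the old import only works on scikit-learn versions before 0.20):

import numpy as np

X = np.arange(20).reshape(10, 2)

# Deprecated API (pre-0.20): the splitter is tied to the dataset size at
# construction time and iterated over directly.
from sklearn.cross_validation import KFold as OldKFold
for train_idx, test_idx in OldKFold(len(X), n_folds=5, shuffle=True, random_state=0):
    pass  # train_idx / test_idx are index arrays into X

# Current API (0.18+): the splitter stores only n_splits and produces
# indices via .split(X).
from sklearn.model_selection import KFold
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    pass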
def get_kfold_bydate(self, df, n_folds=10):
    df.sort_values(by=['time_date', 'time_id', 'start_district_id'], axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    kf = KFold(df.shape[0], n_folds=n_folds, shuffle=False)
    for train_index, test_index in kf:
        print("TRAIN:", train_index, "TEST:", test_index)
    return kf
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far supercede the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1,
                              cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def crossvalidate(problem, dataset, set_size, uid, w, var, cov, transform, old_alpha, lmbda=0.5):
    """Finds the best hyperparameters using cross-validation.

    Parameters
    ----------
    WRITEME

    Returns
    -------
    alpha : tuple
        The best hyperparameter.
    """
    if len(dataset) % _NUM_FOLDS != 0:
        return old_alpha

    kfold = KFold(len(dataset), n_folds=_NUM_FOLDS)

    f = compute_transform(uid, w, var, cov, transform, lmbda=lmbda)

    avg_accuracy = np.zeros(len(_ALPHAS))
    for i, alpha in enumerate(_ALPHAS):
        accuracies = []
        for tr_indices, ts_indices in kfold:
            w, _ = problem.select_query(dataset[tr_indices], set_size, alpha,
                                        transform=f)
            utilities = np.dot(w, dataset[ts_indices].T)
            accuracies.append((utilities > 0).mean())
        avg_accuracy[i] = sum(accuracies) / len(accuracies)

    alpha = _I_TO_ALPHA[np.argmax(avg_accuracy)]
    _LOG.debug('''\
        alpha accuracies = {avg_accuracy}
        best alpha = {alpha}
        ''', **locals())

    return alpha
def cross_validate(model, X, y, n_folds, batch_size, num_epoch, func_for_evaluation=None):
    # let's shuffle first.
    seed = 5
    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)

    X = np.array(X)
    y = np.array(y)

    scores = np.zeros(n_folds)
    kf = KFold(len(y), n_folds=n_folds)
    for i, (train_index, test_index) in enumerate(kf):
        X_train, y_train = X[train_index, :], y[train_index]
        X_test, y_test = X[test_index, :], y[test_index]

        model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch)
        predictions = model.predict(X_test)
        score = func_for_evaluation(predictions[:, 0].tolist(), y_test)
        try:
            scores[i] = score[0]
        except IndexError:
            scores[i] = score

    print "{}-Fold cross validation score: {}".format(n_folds, scores.mean())