我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.model_selection.ShuffleSplit()。
def validate(model, X, y, nb_epoch=25, batch_size=128, stop_early=True, folds=10, test_size=None, shuffle=True, verbose=True):
    """Cross-validate a compiled Keras model with ShuffleSplit.

    Args:
        model: compiled Keras model; weights are re-randomized per fold.
        X, y: training inputs and targets (indexable by row arrays).
        nb_epoch: maximum epochs per fold (early stopping may end sooner).
        batch_size: minibatch size for fit().
        stop_early: attach a val_loss EarlyStopping callback when True.
        folds: number of ShuffleSplit iterations.
        test_size: held-out fraction; defaults to 0.25 for a single fold,
            otherwise 1/folds (k-fold-like sizing).
        shuffle: shuffle training samples each epoch.
        verbose: Keras fit verbosity.

    Returns:
        Mean of each fold's final validation accuracy.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
    total_score = []
    if test_size is None:
        if folds == 1:
            test_size = 0.25
        else:
            # BUG FIX: was 1 - (1. / folds), which *trained* on only 1/folds
            # of the data; k-fold-sized validation holds out 1/folds instead.
            test_size = 1. / folds
    kf = ShuffleSplit(n_splits=folds, test_size=test_size)
    # BUG FIX: honor stop_early (the original computed `[early_stopping] if
    # True else None` and then always passed the callback regardless).
    callbacks = [early_stopping] if stop_early else []
    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        shuffle_weights(model)  # re-randomize weights so folds are independent
        if fold > 0:
            print("FOLD:", fold)
            print("-" * 40)
        model.reset_states()
        hist = model.fit(X[train_index], y[train_index],
                         batch_size=batch_size,
                         epochs=nb_epoch,  # BUG FIX: nb_epoch was never passed to fit()
                         shuffle=shuffle,
                         validation_data=(X[test_index], y[test_index]),
                         callbacks=callbacks,
                         verbose=verbose)
        total_score.append(hist.history["val_acc"][-1])
    return np.mean(total_score)
def train_test_split(inpath, train, test, split, random_seed):
    """
    RuCor doesn't provide train/test data splitting, it makes random splitting.

    Args:
        inpath: path to data
        train: path to train folder
        test: path to test folder
        split: int, split ratio
        random_seed: seed for random module

    Returns:
    """
    print('Start train-test splitting ...')
    docs = os.listdir(inpath)
    splitter = ShuffleSplit(1, test_size=split, random_state=random_seed)
    # n_splits is 1, so the iterator yields exactly one (train, test) pair
    train_idx, test_idx = next(iter(splitter.split(docs)))
    for i in sorted(list(train_idx)):
        build_data.move(os.path.join(inpath, docs[i]), os.path.join(train, docs[i]))
    for i in sorted(list(test_idx)):
        build_data.move(os.path.join(inpath, docs[i]), os.path.join(test, docs[i]))
    print('End train-test splitts.')
    return None
def TestPerformance(self, df = None):
    """Print regressor scores on full, test, and train partitions.

    df: optional raw dataframe; transformed with self.S when given,
        otherwise the already-learned self.D is used.
    """
    #If no dataframe is provided, use the currently learned one
    if(df is None):
        D = self.D
    else:
        D = self.S.transform(df.copy())
    #Get features from the data frame
    A = self._ExtractFeat(D)
    #Get the target values and their corresponding column names
    y, _ = self._ExtractTarg(D)
    #Begin cross validation (single shuffle split)
    ss = ShuffleSplit(n_splits = 1)
    for trn, tst in ss.split(A):
        # NOTE(review): s1 is labelled 'C-V' but is scored on the FULL set,
        # not a cross-validated estimate — confirm this is intentional.
        s1 = self.R.score(A, y)
        s2 = self.R.score(A[tst], y[tst])
        s3 = self.R.score(A[trn], y[trn])
        print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))
def Train(self, C, A, Y, SF):
    '''
    Train the classifier using the sample matrix A and target matrix Y,
    then print the score of C's predictions over all of A under SF.

    C:  classifier exposing fit(A, Y) and predict(A)
    A:  sample matrix
    Y:  target matrix
    SF: scoring function SF(Y, YH) -> float
    '''
    C.fit(A, Y)
    # BUG FIX: np.object was removed in NumPy >= 1.24; the builtin `object`
    # dtype is the supported spelling and is equivalent.
    YH = np.zeros(Y.shape, dtype = object)
    for i in np.array_split(np.arange(A.shape[0]), 32):
        #Split up verification into chunks to prevent out of memory
        YH[i] = C.predict(A[i])
    s1 = SF(Y, YH)
    print('All:{:8.6f}'.format(s1))
    # (removed a large block of commented-out exploratory CV code that was
    # kept as a dead triple-quoted string)
def test_safe_split_with_precomputed_kernel():
    # _safe_split must slice a precomputed kernel matrix on BOTH axes so the
    # result equals the kernel recomputed from the sliced raw features.
    clf = SVC()
    clfp = SVC(kernel="precomputed")
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)  # linear kernel over the full dataset
    cv = ShuffleSplit(test_size=0.25, random_state=0)
    tr, te = list(cv.split(X))[0]
    # training split: kernel rows and columns are both indexed by tr
    X_tr, y_tr = _safe_split(clf, X, y, tr)
    K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))
    # test split: rows indexed by te, columns by the *training* indices tr
    X_te, y_te = _safe_split(clf, X, y, te, tr)
    K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
def split_testing_data_r(y):
    """Return (train_indices, test_indices) from one random 80/20 shuffle split."""
    splitter = ShuffleSplit(n_splits=1, test_size=0.2)
    # exactly one split is produced, so take it directly
    train_idx, test_idx = next(iter(splitter.split(y)))
    return train_idx, test_idx
def get_cv_method(method, **kwargs):
    """Build a cross-validation splitter from its short name.

    kwargs are forwarded to the splitter's constructor (ignored for 'loo',
    which takes no arguments). Raises AttributeError on an unknown name.
    """
    if method == 'loo':
        return LeaveOneOut()
    factories = {
        'kfold': KFold,
        'skfold': StratifiedKFold,
        'shuffle_split': ShuffleSplit,
        'split': TrainTestSplit,
        's_shuffle_split': StratifiedShuffleSplit,
        'time_series': TimeSeriesSplit,
    }
    factory = factories.get(method)
    if factory is None:
        raise AttributeError('Invalid CV method - %s!' % method)
    return factory(**kwargs)
def cross_validation(self):
    """Run 5-iteration shuffle-split CV on self.clf and print macro-F1 scores.

    Uses self.training_data / self.training_target; prints the per-split
    scores and a mean +/- 2*std summary.
    """
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)
    scores = cross_val_score(self.clf, self.training_data,
                             self.training_target, cv=cv, scoring='f1_macro')
    # BUG FIX: was a Python 2 `print scores` statement — a syntax error under
    # Python 3 and inconsistent with the print() calls used elsewhere.
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def test_learning_curve_comprehensive(self):
    """
    Test learning curve with all parameters with visual unit test.
    """
    try:
        sizes = np.linspace(.1, 1.0, 5)
        folds = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        visualizer = LearningCurveVisualizer(
            LinearSVC(random_state=0),
            train_sizes=sizes,
            cv=folds,
            n_jobs=4,
        )
        visualizer.fit(X, y)
        visualizer.poof()
    except Exception as e:
        self.fail("error during learning curve: {}".format(e))
    self.assert_images_similar(visualizer)
def test_learning_curve_model_cv_only(self):
    """
    Test learning curve with inputting model and cv only.
    """
    try:
        splitter = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        viz = LearningCurveVisualizer(LinearSVC(), cv=splitter)
        viz.fit(X, y)
        viz.poof()
    except Exception as e:
        self.fail("error during learning curve: {}".format(e))
def test_learning_curve_model_trainsize_cv_only(self):
    """
    Test learning curve with inputting model, training size, and cv only.
    """
    try:
        sizes = np.linspace(.1, 1.0, 5)
        splitter = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        viz = LearningCurveVisualizer(LinearSVC(), train_sizes=sizes, cv=splitter)
        viz.fit(X, y)
        viz.poof()
    except Exception as e:
        self.fail("error during learning curve: {}".format(e))
def test_learning_curve_bad_trainsize(self):
    """
    Test learning curve with bad input for training size.
    """
    with self.assertRaises(YellowbrickError):
        splitter = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        viz = LearningCurveVisualizer(LinearSVC(),
                                      train_sizes=10000,
                                      cv=splitter)
        viz.fit(X, y)
        viz.poof()
def get_cv(self, X, y):
    """Yield (train, test) row-index arrays split by event id.

    Events are identified by y[:, 0]; whole events are assigned to either
    side of each shuffle split so rows of one event never straddle folds.
    """
    event_ids = np.unique(y[:, 0])
    splitter = ShuffleSplit(
        n_splits=self.n_cv,
        test_size=self.cv_test_size,
        random_state=self.random_state)
    for train_events, test_events in splitter.split(event_ids):
        # expand event-level membership back to row-level indices
        train_mask = np.in1d(y[:, 0], event_ids[train_events])
        test_mask = np.in1d(y[:, 0], event_ids[test_events])
        yield np.where(train_mask)[0], np.where(test_mask)[0]
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Shared out-of-fold training loop for a Keras model.

    Fills v[cname] with pconvert-ed validation predictions and z[cname]
    with the test predictions averaged over num_splits shuffle splits.
    Best weights per fold are checkpointed to (and reloaded from) disk.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    # NOTE(review): test_size=1/num_splits relies on Python 3 true division
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    scores = list()
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()  # fresh model per fold
        model.fit(
                xtrain, ytrain,
                batch_size = batch_size,
                epochs = 10000,  # effectively unbounded; callbacks stop training
                validation_data = (xval, yval),
                verbose = 0,
                callbacks = build_keras_fit_callbacks(model_path),
                shuffle = True
            )
        model.load_weights(model_path)  # restore best checkpoint
        p = model.predict(xval)
        v.loc[ival, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: '%(n+1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model  # release the model before the next fold
        for i in range(3):
            gc.collect(i)
    os.remove(model_path)  # discard the checkpoint file
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits  # average test predictions over folds
def rf1(train2, y, test2, v, z):
    """Seed-bagged RandomForest: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 7
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """Seed-bagged ExtraTrees: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 7
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=6,
                                                random_state=seed,
                                                n_estimators=500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """Seed-bagged RandomForest: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 1
    num_splits = 3
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """Seed-bagged ExtraTrees: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 1
    num_splits = 3
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=7,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """Seed-bagged RandomForest: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """Seed-bagged ExtraTrees: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """Seed-bagged ExtraTrees: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 2
    num_splits = 7
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                                random_state=seed,
                                                n_estimators=2000,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """Seed-bagged RandomForest: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """Seed-bagged ExtraTrees: accumulate out-of-fold predictions.

    Writes raw probability predictions into v[cname] and averaged test
    predictions into z[cname], where cname is this function's name.
    Unlike sibling variants, this one does NOT apply pconvert.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += reg.predict_proba(test2)[:, 1]
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """Seed-bagged ExtraTrees: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """Seed-bagged RandomForest: accumulate out-of-fold predictions.

    Writes raw probability predictions into v[cname] and averaged test
    predictions into z[cname], where cname is this function's name.
    Unlike sibling variants, this one does NOT apply pconvert.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += reg.predict_proba(test2)[:, 1]
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def rf1(train2, y, test2, v, z):
    """Seed-bagged RandomForest: accumulate out-of-fold predictions.

    Writes raw probability predictions into v[cname] and log1p-transformed
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            # NOTE(review): test predictions are log1p-transformed while the
            # validation predictions above are raw — confirm this asymmetry
            # is intentional before stacking the two columns together.
            z[cname] += np.log1p(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234):
    """Shared out-of-fold training loop for a Keras model.

    Fills v[cname] with pconvert-ed validation predictions and z[cname]
    with the test predictions averaged over num_splits shuffle splits.
    Best weights per fold are checkpointed to (and reloaded from) disk.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    # NOTE(review): test_size=1/num_splits relies on Python 3 true division
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    scores = list()
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()  # fresh model per fold
        model.fit(
                xtrain, ytrain,
                batch_size = 128,
                epochs=10000,  # effectively unbounded; callbacks stop training
                validation_data=(xval, yval),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True
            )
        model.load_weights(model_path)  # restore best checkpoint
        p = model.predict(xval)
        v.loc[ival, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: '%(n+1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        del model  # release the model before the next fold
        for i in range(3):
            gc.collect(i)
    os.remove(model_path)  # discard the checkpoint file
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits  # average test predictions over folds
def rf1(train2, y, test2, v, z):
    """Seed-bagged RandomForest: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 17
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.RandomForestClassifier(max_depth=9,
                                                  random_state=seed,
                                                  n_estimators=500,
                                                  n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    """Seed-bagged ExtraTrees: accumulate out-of-fold predictions.

    Writes pconvert-ed validation predictions into v[cname] and averaged
    test predictions into z[cname], where cname is this function's name.
    """
    cname = sys._getframe().f_code.co_name  # column name = function name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 17
    base_seed = 13
    # BUG FIX: removed a dead, unseeded ShuffleSplit built here — it was
    # immediately shadowed by the seeded splitter inside the loop.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=7,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]  # P(y == 1) on the fold
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # test preds averaged over every model; OOF preds over seeds only
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds