The following are 5 code examples, extracted from open-source Python projects, that illustrate how to use sklearn.model_selection.GroupKFold().
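Before the project examples, here is a minimal, self-contained sketch of the basic API (the toy arrays are invented for illustration). GroupKFold produces folds in which all samples sharing a group label land on the same side of each split:

import numpy as np
from sklearn.model_selection import GroupKFold

# Toy data: 6 samples from 3 groups (e.g. 3 source documents).
X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
groups = np.array(['a', 'a', 'b', 'b', 'c', 'c'])

gkf = GroupKFold(n_splits=3)
for train_idx, test_idx in gkf.split(X, y, groups):
    # No group ever appears in both the train and test indices of a fold.
    print(groups[train_idx], '->', groups[test_idx])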
import pickle

from sklearn.model_selection import GroupKFold


def generate_folds(labels_fname, folds_fname, max_n_folds=10):
    """
    Generate folds for CV experiments with n = 2, ..., max_n_folds.
    Save as a pickled dict with n as key.
    """
    # read_labels is a helper from the surrounding project.
    filenames = read_labels(labels_fname)['__filenames__']
    folds = {}

    for n in range(2, max_n_folds + 1):
        # Create folds from complete texts only (i.e. instances/sentences of
        # the same text are never in different folds). There is no random seed,
        # because the partitioning algorithm is deterministic.
        group_k_fold = GroupKFold(n_splits=n)
        # Don't bother to pass real X and y, because they are not really used.
        folds[n] = list(group_k_fold.split(filenames, filenames, filenames))

    print('writing folds to ' + folds_fname)
    with open(folds_fname, 'wb') as f:
        pickle.dump(folds, f)
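The trick above of passing filenames as X, y and groups works because, in current scikit-learn versions, GroupKFold.split ignores y entirely and uses X only for its length; the partition is driven by groups alone. A quick check with invented toy filenames:

from sklearn.model_selection import GroupKFold

fnames = ['a.txt', 'a.txt', 'b.txt', 'c.txt', 'c.txt']
# Same list passed as X, y and groups: all instances of a file stay together.
for train_idx, test_idx in GroupKFold(n_splits=2).split(fnames, fnames, fnames):
    print(train_idx, test_idx)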
import pickle

import numpy as np
from sklearn.model_selection import GroupKFold


def get_train_test_fold_filenames(true_iob_dir, use_pickle=True):
    pickle_fname = '_train_test_fold_fnames.pkl'

    if use_pickle:
        try:
            return pickle.load(open(pickle_fname, 'rb'))
        except IOError:
            pass

    # Misuse the data collecting function (a project helper) to get X, y and
    # filenames. Since we are not interested in the actual features, we pretend
    # true_iob_dir is a feature dir.
    data = collect_crf_data(true_iob_dir, true_iob_dir)

    # Now create the folds from complete texts only (i.e. instances of the same
    # text are never in different folds), using the same split for all three
    # entities. Note that there is no random seed, because the output of
    # group_k_fold.split is deterministic, as long as the iob files are globbed
    # in exactly the same order.
    group_k_fold = GroupKFold(n_splits=5)
    splits = group_k_fold.split(data['feats'], data['Material'], data['filenames'])

    fnames = np.array(data['filenames'])
    train_test_fold_fnames = []

    for train_idx, test_idx in splits:
        train_fnames = np.unique(fnames[train_idx])
        test_fnames = np.unique(fnames[test_idx])
        train_test_fold_fnames.append((train_fnames, test_fnames))

    with open(pickle_fname, 'wb') as f:
        pickle.dump(train_test_fold_fnames, f)

    return train_test_fold_fnames
from sklearn.model_selection import GroupKFold, KFold


def build_folds(all_xs, all_ys, advice):
    # Group documents by the domain of their URL, so that cross-validation can
    # split across domains. get_domain, AdviceItem, WARNING,
    # WARN_N_RELEVANT_DOMAINS and two_class_folds come from the surrounding project.
    domains = [get_domain(doc['url']) for doc in all_xs]
    n_domains = len(set(domains))
    n_relevant_domains = len(
        {domain for domain, is_relevant in zip(domains, all_ys) if is_relevant})
    n_folds = 4
    if n_relevant_domains == 1:
        advice.append(AdviceItem(
            WARNING,
            'Only 1 relevant domain in data means that it\'s impossible to do '
            'cross-validation across domains, '
            'and will likely result in model over-fitting.'
        ))
        folds = KFold(n_splits=n_folds).split(all_xs)
    else:
        # n_splits may never exceed the number of distinct groups.
        folds = (GroupKFold(n_splits=min(n_domains, n_folds))
                 .split(all_xs, groups=domains))
    if 1 < n_relevant_domains < WARN_N_RELEVANT_DOMAINS:
        advice.append(AdviceItem(
            WARNING,
            'Low number of relevant domains (just {}) '
            'might result in model over-fitting.'.format(n_relevant_domains)
        ))
    folds = two_class_folds(folds, all_ys)
    if not folds:
        folds = two_class_folds(KFold(n_splits=n_folds).split(all_xs), all_ys)
    if not folds:
        advice.append(AdviceItem(
            WARNING,
            'Cannot do cross-validation, as there are no folds where '
            'training data has both relevant and non-relevant examples. '
            'There are too few domains or the dataset is too unbalanced.'
        ))
    return folds
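One detail in build_folds worth a remark: n_splits may not exceed the number of distinct groups, otherwise GroupKFold raises a ValueError when the splits are consumed, which is why the code clamps it with min(n_domains, n_folds). A toy demonstration (the domain names are invented):

from sklearn.model_selection import GroupKFold

domains = ['x.com', 'x.com', 'y.com']  # only 2 distinct groups
try:
    list(GroupKFold(n_splits=4).split(domains, groups=domains))
except ValueError as err:
    print(err)  # cannot have n_splits=4 greater than the number of groups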
import time

import numpy as np
from sklearn.model_selection import GroupKFold


def train_models(data, targets, groups, model=None, cropsize=2800, batch_size=512,
                 epochs=250, epochs_to_stop=15, rnn_epochs_to_stop=15):
    """
    Trains a cnn3adam_filter_l2 model with an LSTM on top on the given data,
    using a 20% validation set, and returns the two models.

    models, tools, generator, get_activations and Checkpoint_balanced are
    helpers from the surrounding project.
    """
    input_shape = list(np.array(data[0]).shape)
    input_shape[0] = cropsize
    n_classes = targets.shape[1]

    # Use the first fold of a 5-fold grouped split as an 80/20 train/validation
    # split. Only the groups argument matters here; GroupKFold ignores y and
    # uses X just for its length.
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
    train_data = [data[i] for i in train_idx]
    train_target = targets[train_idx]
    train_groups = groups[train_idx]
    val_data = [data[i] for i in val_idx]
    val_target = targets[val_idx]
    val_groups = groups[val_idx]

    model = (models.cnn3adam_filter_l2(input_shape, n_classes) if model is None
             else model(input_shape, n_classes))

    g_train = generator(train_data, train_target, batch_size, val=False, cropsize=cropsize)
    g_val = generator(val_data, val_target, batch_size, val=True, cropsize=cropsize)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups,
                             epochs_to_stop=epochs_to_stop, plot=True,
                             name='{}, {}'.format(model.name, 'testing'))
    model.fit_generator(g_train, g_train.n_batches, epochs=epochs,
                        callbacks=[cb], max_queue_size=1, verbose=0)
    val_acc = cb.best_acc
    val_f1 = cb.best_f1
    print('CNN Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc * 100, val_f1 * 100))

    # LSTM training on activations extracted from the CNN's fc1 layer.
    rnn_modelfun = models.pure_rnn_do
    lname = 'fc1'
    seq = 6
    rnn_epochs = epochs
    stopafter_rnn = rnn_epochs_to_stop

    features = get_activations(model, train_data + val_data, lname,
                               batch_size * 2, cropsize=cropsize)
    train_data_extracted = features[0:len(train_data)]
    val_data_extracted = features[len(train_data):]
    assert (len(train_data) == len(train_data_extracted)
            and len(val_data) == len(val_data_extracted))

    train_data_seq, train_target_seq, train_groups_seq = tools.to_sequences(
        train_data_extracted, train_target, groups=train_groups, seqlen=seq)
    val_data_seq, val_target_seq, val_groups_seq = tools.to_sequences(
        val_data_extracted, val_target, groups=val_groups, seqlen=seq)

    rnn_shape = list(np.array(train_data_seq[0]).shape)
    neurons = int(np.sqrt(rnn_shape[-1]) * 4)
    rnn_model = rnn_modelfun(rnn_shape, n_classes, layers=2, neurons=neurons, dropout=0.3)
    print('Starting RNN model {} with input from layer {} (shape {}) at {}'.format(
        rnn_model.name, lname, rnn_shape, time.ctime()))

    g_train = generator(train_data_seq, train_target_seq, batch_size, val=False)
    g_val = generator(val_data_seq, val_target_seq, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups_seq,
                             epochs_to_stop=stopafter_rnn, plot=True,
                             name='{}, {}'.format(rnn_model.name, 'fc1'))
    rnn_model.fit_generator(g_train, g_train.n_batches, epochs=rnn_epochs,
                            verbose=0, callbacks=[cb], max_queue_size=1)
    val_acc = cb.best_acc
    val_f1 = cb.best_f1
    print('LSTM Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc * 100, val_f1 * 100))

    return model, rnn_model
import numpy as np
from sklearn.model_selection import GroupKFold


def train_models_feat(data, targets, groups, batch_size=512, epochs=250, epochs_to_stop=15):
    """
    Trains an ANN and an RNN model on the given feature data, using a 20%
    validation set, and returns the two models.

    models, tools, generator and Checkpoint_balanced are helpers from the
    surrounding project.
    """
    input_shape = list(np.array(data[0]).shape)
    n_classes = targets.shape[1]

    # First fold of a 5-fold grouped split = 80/20 train/validation split;
    # GroupKFold only looks at the groups argument here.
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
    train_data = [data[i] for i in train_idx]
    train_target = targets[train_idx]
    train_groups = groups[train_idx]
    val_data = [data[i] for i in val_idx]
    val_target = targets[val_idx]
    val_groups = groups[val_idx]

    model = models.ann(input_shape, n_classes)
    g_train = generator(train_data, train_target, batch_size, val=False)
    g_val = generator(val_data, val_target, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups,
                             epochs_to_stop=epochs_to_stop, plot=True,
                             name='{}, {}'.format(model.name, 'testing'))
    model.fit_generator(g_train, g_train.n_batches, epochs=epochs,
                        callbacks=[cb], max_queue_size=1, verbose=0)
    val_acc = cb.best_acc
    val_f1 = cb.best_f1
    print('ANN Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc * 100, val_f1 * 100))

    # RNN training on sequences of feature vectors. The grouped split is
    # deterministic, so this reproduces exactly the same train/validation
    # partition as above.
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
    train_data = np.array([data[i] for i in train_idx])
    train_target = targets[train_idx]
    train_groups = groups[train_idx]
    val_data = np.array([data[i] for i in val_idx])
    val_target = targets[val_idx]
    val_groups = groups[val_idx]

    train_data_seq, train_target_seq, train_groups_seq = tools.to_sequences(
        train_data, train_target, groups=train_groups, seqlen=6)
    val_data_seq, val_target_seq, val_groups_seq = tools.to_sequences(
        val_data, val_target, groups=val_groups, seqlen=6)

    input_shape = list(np.array(train_data_seq[0]).shape)
    print(input_shape)
    rnn_model = models.pure_rnn_do(input_shape, n_classes)

    g_train = generator(train_data_seq, train_target_seq, batch_size, val=False)
    g_val = generator(val_data_seq, val_target_seq, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups_seq,
                             epochs_to_stop=epochs_to_stop, plot=True,
                             name='{}, {}'.format(rnn_model.name, 'testing'))
    rnn_model.fit_generator(g_train, g_train.n_batches, epochs=epochs,
                            callbacks=[cb], max_queue_size=1, verbose=0)
    val_acc = cb.best_acc
    val_f1 = cb.best_f1
    print('RNN Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc * 100, val_f1 * 100))

    return model, rnn_model