The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.model_selection.StratifiedKFold().
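Before the project examples, here is a minimal self-contained sketch of the pattern nearly all of them share (the toy X and y arrays are invented for illustration): instantiate StratifiedKFold with the number of splits, then iterate over split(X, y), which yields (train_index, test_index) pairs whose class proportions mirror those of y.

import numpy as np
from sklearn.model_selection import StratifiedKFold

# Toy data: 6 samples, 2 features, balanced binary labels.
X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 0, 1, 1, 1])

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Each test fold preserves the 50/50 class ratio of y.
    print(fold, train_index, test_index)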
def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
    y_pred_xval = np.zeros(len(y_tr))
    skf = StratifiedKFold(n_splits=cv, shuffle=True)
    i = 0
    for train, test in skf.split(X_tr, y_tr):
        i = i + 1
        print("training fold {} of {}".format(i, cv))
        X_train_xval = np.array(X_tr)[train, :]
        X_test_xval = np.array(X_tr)[test, :]
        y_train_xval = np.array(y_tr)[train]
        # fit a fresh clone of the model on this fold's training data
        model_copy = clone(model)
        model_copy.fit(X_train_xval, y_train_xval)
        y_pred_xval[test] = model_copy.predict_proba(X_test_xval)[:, 1]
    print("training full model")
    model_copy = clone(model)
    model_copy.fit(X_tr, y_tr)
    print("calibrating function")
    calib_func = prob_calibration_function(y_tr, y_pred_xval)
    return model_copy, calib_func
def rf1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d' % (n + 1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             max_depth=3,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])
        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        z[cname] += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score' % (n + 1), score, now())
        scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
def get_split(self):
    if self.split is not None:
        return
    name = "{}/split.p".format(self.flags.data_path)
    split = load_pickle(None, name, [])
    if len(split) == 0:
        #data = self.data["training_variants"].append(self.data["test_variants_filter"])
        data = self.data["training_variants"]
        y = data['Class'] - 1
        X = np.arange(y.shape[0])
        from sklearn.model_selection import StratifiedKFold
        skf = StratifiedKFold(n_splits=self.flags.folds, shuffle=True, random_state=99)
        split = [(train_index, test_index) for train_index, test_index in skf.split(X, y)]
        save_pickle(split, name)
        print("new shuffle")
    self.split = split
    #print("split va", split[0][1][:10])
def cvsplit(fold, totalfold, mydict):
    '''get the split of train and test
    fold is the returned fold-th data, from 0 to totalfold-1
    totalfold is for the cross validation
    mydict is the dict returned by readlabel'''
    skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is False, okay!
    #readdicom(mydict)
    y = list(mydict.values())
    x = list(mydict.keys())
    count = 0
    for train, test in skf.split(x, y):
        print(len(train), len(test))
        if count == fold:
            #print(test)
            return train, test
        count += 1
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
    self.name = name
    self.X = X
    self.y = y
    self.task = task
    self.random_state = random_state
    if test_size is not None:
        self.test_size = test_size
        self.validation_method = "train_test_split"
        self.X_train, self.X_test, self.y_train, self.y_test = \
            model_selection.train_test_split(self.X, self.y,
                                             test_size=test_size,
                                             random_state=random_state)
    elif cv is not None:
        self.validation_method = "cv"
        if task == "regression":
            self.kfold = model_selection.KFold(n_splits=cv, random_state=random_state)
        elif task == "classification":
            self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True,
                                                         random_state=random_state)
def _sfn(l, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.

    The classifier is in `bcast_var[2]`. The labels are in `bcast_var[0]`.
    The number of cross-validation folds is in `bcast_var[1]`.
    """
    clf = bcast_var[2]
    data = l[0][mask, :].T
    # print(l[0].shape, mask.shape, data.shape)
    skf = model_selection.StratifiedKFold(n_splits=bcast_var[1], shuffle=False)
    accuracy = np.mean(model_selection.cross_val_score(clf, data,
                                                       y=bcast_var[0],
                                                       cv=skf,
                                                       n_jobs=1))
    return accuracy
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects, shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels, cv=skf)
    print(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' %
        np.mean(scores)
    )
def setBestParameters(self):
    cv = StratifiedKFold(n_splits=self.conf.num_folds)
    param_grid = self.conf.getParamGrid()
    if param_grid is None:
        # No parameter value to select
        return
    if self.conf.families_supervision:
        scoring = 'f1_macro'
    else:
        scoring = 'roc_auc'
    grid_search = GridSearchCV(self.pipeline, param_grid=param_grid,
                               scoring=scoring, cv=cv, n_jobs=-1,
                               fit_params={'model__sample_weight': self.datasets.sample_weight})
    grid_search.fit(self.datasets.train_instances.getFeatures(),
                    self.getSupervision(self.datasets.train_instances))
    self.conf.setBestValues(grid_search)
    self.pipeline.set_params(**self.conf.getBestValues())
    return cv
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                         [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                         [0, 0], [0, 0], [0, 0], [1, 1], [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes,
                                cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([[2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                         [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                         [0, 0], [0, 0], [0, 0], [1, 1], [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes,
                                cv=StratifiedKFold(n_splits=5, shuffle=True),
                                n_jobs=-1)
    assert np.mean(cv_scores) > 0.
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross validation iterator.

    Parameters
    ----------
    k : int, default 5
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 33

    Yields
    ------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    if stratify:
        kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
    else:
        kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)
    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
def predict_training(self, folds=5):
    """Do cross-validation and return probabilities for each data-point.

    Args:
        folds (int): Number of folds used for prediction on training data.
    """
    partial_clf = linear_model.LogisticRegression(class_weight='balanced')
    prediction = np.zeros((len(self.features), self.num_classes))
    skf = StratifiedKFold(n_splits=folds)
    for train_index, test_index in skf.split(self.features, self.labels):
        # prepare the training and test data
        training_features = self.features[train_index]
        test_features = self.features[test_index]
        training_labels = self.labels[train_index]
        # fitting the model and predicting
        partial_clf.fit(training_features, training_labels)
        curr_pred = partial_clf.predict_proba(test_features)
        prediction[test_index] = \
            self.predict_proba_ordered(curr_pred, partial_clf.classes_)
    return prediction
def predict_training(self, folds=5):
    """Do cross-validation and return probabilities for each data-point.

    Args:
        folds (int): Number of folds used for prediction on training data.
    """
    prediction = np.zeros((len(self.strings), self.num_classes))
    skf = StratifiedKFold(n_splits=folds)
    for train_index, test_index in skf.split(self.strings, self.labels):
        # prepare the training and test data
        training_strings = self.strings[train_index]
        test_strings = self.strings[test_index]
        training_labels = self.labels[train_index]
        # predicting the results
        part_prediction = self.find_knn(training_strings, training_labels, test_strings)
        prediction[test_index] = part_prediction
    return prediction
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100,
                            verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True,
                                              random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100,
                            verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves class ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))
    for shuffle in (False, True):
        for train, test in StratifiedKFold(5, shuffle=shuffle).split(X, y):
            assert_almost_equal(np.sum(y[train] == 4) / len(train), 0.10, 2)
            assert_almost_equal(np.sum(y[train] == 0) / len(train), 0.89, 2)
            assert_almost_equal(np.sum(y[train] == 1) / len(train), 0.01, 2)
            assert_almost_equal(np.sum(y[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(y[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(y[test] == 1) / len(test), 0.01, 2)
def test_stratifiedkfold_balance():
    # Check that KFold returns folds with balanced sizes (only when
    # stratification is possible)
    # Repeat with shuffling turned off and on
    X = np.ones(17)
    y = [0] * 3 + [1] * 14
    for shuffle in (True, False):
        cv = StratifiedKFold(3, shuffle=shuffle)
        for i in range(11, 17):
            skf = cv.split(X[:i], y[:i])
            sizes = []
            for _, test in skf:
                sizes.append(len(test))
            assert_true((np.max(sizes) - np.min(sizes)) <= 1)
            assert_equal(np.sum(sizes), i)
def transform(self, M, **kwargs):
    """
    Takes a dataframe that has an :code:`item_id` index and other "features"
    columns for prediction, and applies a Keras sequential model to it.

    :param M: a dataframe that has an :code:`item_id` index, and "features" columns.
    :type M: pandas.DataFrame
    :rtype: a tuple with the trained Keras model and its keyword arguments
    """
    rows, columns = M.shape
    factors = M.merge(self.validation_matrix, left_index=True, right_index=True)
    factors = factors.values
    if self.classification:
        kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                random_state=self.kfold_seed,
                                shuffle=self.kfold_shuffle)
    else:
        kfold = KFold(n_splits=self.kfold_n_splits,
                      random_state=self.kfold_seed,
                      shuffle=self.kfold_shuffle)
    X = factors[:, :columns]
    Y = factors[:, columns:]
    for train_index, test_index in kfold.split(X, Y):
        self.keras_model.fit(
            X[train_index], Y[train_index],
            validation_data=[X[test_index], Y[test_index]],
            **self.keras_kwargs)
    return self.keras_model, kwargs
def cvsplitenhance(fold, totalfold, mydict, valfold=-1):
    '''get the split of train, validation and test
    fold is the returned fold-th data, from 0 to totalfold-1
    totalfold is for the cross validation
    mydict is the dict returned by readlabel
    separate the data into train, validation, test'''
    skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is False, okay!
    #readdicom(mydict)
    y = list(mydict.values())
    x = list(mydict.keys())
    count = 0
    if valfold == -1:
        valfold = (fold + 1) % totalfold
    print('valfold' + str(valfold))
    trainls, valls, testls = [], [], []
    for train, test in skf.split(x, y):
        print(len(train), len(test))
        if count == fold:
            #print(test[:])
            testls = test[:]
        elif count == valfold:
            valls = test[:]
        else:
            for i in test:
                trainls.append(i)
        count += 1
    return trainls, valls, testls
def k_fold_validation(model, monitored_data, unmonitored_data, k, random_state=123):
    """
    Performs k-fold validation on a model. During each fold, records all of
    the scoring in the `scoring_methods` module.

    @param model is a machine learning model that has the functions `fit(X, y)` and `predict(X)`
    @param monitored_data an array-like matrix that has the following structure `[(features, value)]`
    @param unmonitored_data is also an array-like object: [features]
    @param k is the number of folds

    @return is a 2D array of scores, with the following structure `[{scoring_method: score}]`
            where the shape is `len(k)`
    """
    X, y = get_X_y(monitored_data, unmonitored_data)
    skf = StratifiedKFold(n_splits=k, random_state=random_state, shuffle=True)
    evaluations = []
    i = 1
    for train, test in skf.split(X, y):
        print("Starting split {}".format(i))
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        print("Fitting data")
        model.fit(X_train, y_train)
        print("Predicting")
        prediction = model.predict(X_test)
        evaluations.append(scoring_methods.evaluate_model(prediction, y_test))
        print(evaluations[-1])
        i += 1
    return evaluations
def _cross_validation_for_one_voxel(clf, vid, num_folds, subject_data, labels):
    """Score classifier on data using cross validation."""
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=False)
    scores = model_selection.cross_val_score(clf, subject_data, y=labels,
                                             cv=skf, n_jobs=1)
    logger.debug(
        'cross validation for voxel %d is done' % vid
    )
    return (vid, scores.mean())
def split_kfold_c(y):
    skf = StratifiedKFold(5)
    ilst = []
    for tri, tei in skf.split(np.zeros(len(y)), y):
        ilst.append((tri, tei))
    return ilst
def get_cv_method(method, **kwargs):
    if method == 'kfold':
        return KFold(**kwargs)
    elif method == 'skfold':
        return StratifiedKFold(**kwargs)
    elif method == 'loo':
        return LeaveOneOut()
    elif method == 'shuffle_split':
        return ShuffleSplit(**kwargs)
    elif method == 'split':
        return TrainTestSplit(**kwargs)
    elif method == 's_shuffle_split':
        return StratifiedShuffleSplit(**kwargs)
    elif method == 'time_series':
        return TimeSeriesSplit(**kwargs)
    else:
        raise AttributeError('Invalid CV method - %s!' % method)
def computeAccuracyForSingleModel(self, algorithm="SVM", isLocalSmall=0, execType="normal"):
    totalFeatures = self.instancesFeatures.shape[1]
    n = min(5, totalFeatures / 2)  # as explained in the article, the number of local agents will be 5
    numberOfFeaturesInEachModel = int(math.ceil(totalFeatures / n))
    if isLocalSmall:
        # select numberOfFeaturesInEachModel random columns
        instFeatures = dataPreparation.selectNRandomColumns(self.instancesFeatures,
                                                            numberOfFeaturesInEachModel)
    else:
        instFeatures = np.array(self.instancesFeatures)
    skf = StratifiedKFold(n_splits=self.kFolds)
    avgScore = 0
    avgF1Macro = 0
    avgF1Micro = 0
    avgF1Weighted = 0
    for train_index, test_index in skf.split(instFeatures, self.instancesClasses):
        resultClasses = classifier.MakeClassification(self.algorithmsIndex[algorithm],
                                                      instFeatures[train_index],
                                                      self.instancesClasses[train_index],
                                                      instFeatures[test_index], "value")
        valF1Macro = f1_score(self.instancesClasses[test_index], resultClasses, average='macro')
        valF1Micro = f1_score(self.instancesClasses[test_index], resultClasses, average='micro')
        valF1Weighted = f1_score(self.instancesClasses[test_index], resultClasses, average='weighted')
        valScore = accuracy_score(self.instancesClasses[test_index], resultClasses)
        avgF1Macro += valF1Macro
        avgF1Micro += valF1Micro
        avgF1Weighted += valF1Weighted
        avgScore += valScore
        with open(self.fileToWrite, "a") as myfile:
            myfile.write(str(valF1Weighted) + "\t" + str(valF1Micro) + "\t" +
                         str(valF1Macro) + "\t" + str(valScore) + "\n")
    avgScore = avgScore / self.kFolds
    avgF1Macro /= self.kFolds
    avgF1Weighted /= self.kFolds
    avgF1Micro /= self.kFolds
    return avgScore, avgF1Macro, avgF1Micro, avgF1Weighted

# this function will call all the underlying methods in order to perform
# data preparation, classification in each simulated agent, and aggregation
def plot_significance_score(model, X, Y):
    from sklearn.model_selection import permutation_test_score, StratifiedKFold
    cv = StratifiedKFold(10)
    # Must be numpy arrays to use permutation as opposed to pandas data frames
    score, permutation_scores, pvalue = permutation_test_score(model, X.values, Y.values,
                                                               scoring="roc_auc", cv=cv,
                                                               n_permutations=100)
    print("Classification Score %s (p-value: %s)" % (score, pvalue))
def setUp(self):
    bl1 = RandomForestClassifier(random_state=8)
    bl2 = LogisticRegression()
    bl3 = RandomForestClassifier(max_depth=10, random_state=10)
    meta_est = LogisticRegression()
    skf = StratifiedKFold(random_state=8).split
    self.stacked_ensemble = stacker.XcessivStackedEnsemble(
        [bl1, bl2, bl3],
        ['predict', 'predict_proba', 'predict_proba'],
        meta_est,
        skf
    )
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the
            dataset used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
def build_corpus():
    positive_sentences = codecs.open("rt-polaritydata/rt-polarity.pos").readlines()
    negative_sentences = codecs.open("rt-polaritydata/rt-polarity.neg").readlines()
    num_positive = len(positive_sentences)
    num_negative = len(negative_sentences)
    labels = [1] * num_positive + [0] * num_negative
    sentences = positive_sentences + negative_sentences
    clean = [word_tokenize(clean_sentence(sentence)) for sentence in sentences]
    total = reduce(lambda sent1, sent2: sent1 + sent2, clean)
    counter = collections.Counter(total)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    word2id = dict(zip(words, range(3, len(words) + 3)))
    word2id["<pad>"] = 0
    word2id["<sos>"] = 1
    word2id["<eos>"] = 2
    inputs = []
    for sent in clean:
        standard_sent = [1] + [word2id[word] for word in sent] + [2]
        inputs.append(standard_sent)
    skf = StratifiedKFold(n_splits=5)
    inputs_array = np.array(inputs)
    labels_array = np.array(labels)
    # take the first of the five stratified splits
    train_index, validation_index = next(skf.split(inputs_array, labels_array))
    np.random.shuffle(train_index)
    np.random.shuffle(validation_index)
    train_X, train_y = inputs_array[train_index], labels_array[train_index]
    valid_X, valid_y = inputs_array[validation_index], labels_array[validation_index]
    return word2id, train_X, train_y, valid_X, valid_y
def _get_cv_splits(self, df):
    if self._cv_method is None:
        self._cv_method = StratifiedKFold(n_splits=self._kfolds, shuffle=self._shuffle)
    for train, test in self._cv_method.split(df, df[self._class_col]):
        yield df.ix[train], df.ix[test]
def _create_stratified_split(csv_filepath, n_splits):
    """
    Create a stratified split for the classification task.

    Parameters
    ----------
    csv_filepath : str
        Path to a CSV file which points to images
    n_splits : int
        Number of splits to make
    """
    from sklearn.model_selection import StratifiedKFold
    data = _load_csv(csv_filepath)
    labels = [el['symbol_id'] for el in data]
    skf = StratifiedKFold(n_splits=n_splits)
    i = 1
    kdirectory = 'classification-task'
    if not os.path.exists(kdirectory):
        os.makedirs(kdirectory)
    for train_index, test_index in skf.split(data, labels):
        print("Create fold %i" % i)
        directory = "%s/fold-%i" % (kdirectory, i)
        if not os.path.exists(directory):
            os.makedirs(directory)
        else:
            print("Directory '%s' already exists. Please remove it." % directory)
        i += 1
        train = [data[el] for el in train_index]
        test_ = [data[el] for el in test_index]
        for dataset, name in [(train, 'train'), (test_, 'test')]:
            with open("%s/%s.csv" % (directory, name), 'wb') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(('path', 'symbol_id', 'latex', 'user_id'))
                for el in dataset:
                    csv_writer.writerow(("../../%s" % el['path'], el['symbol_id'],
                                         el['latex'], el['user_id']))
def make_mf_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                           max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    n = X.shape[0]
    print(clf)
    np.random.seed(seed)
    feature_index = np.arange(X.shape[1])
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros((X.shape[0], len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0], len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        np.random.shuffle(feature_index)
        new_index = feature_index[:int(max_features * len(feature_index))]
        for ind_tr, ind_te in skf:
            if ssp.issparse(X):
                X_tr = X[ind_tr].tocsc()[:, new_index]
                X_te = X[ind_te].tocsc()[:, new_index]
            else:
                X_tr = X[ind_tr][:, new_index]
                X_te = X[ind_te][:, new_index]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te)
            mf_te += clf.predict_proba(X_test[:, new_index])
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random_r.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random_r.pkl' % (name, epoch))
def make_mf_lsvc_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                                max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    n = X.shape[0]
    print(clf)
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te).ravel()
            score = accuracy_score(y_te, clf.predict(X_te).ravel())
            del X_tr
            del X_te
            mf_te += clf.predict_proba(X_test).ravel()
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr.reshape(-1, 1), path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te.reshape(-1, 1), path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def make_mf_regression(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                       max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    n = X.shape[0]
    print(clf)
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict(X_te)
            del X_tr
            del X_te
            # predict the test set in batches to limit memory use
            l = 600000
            y_pred = []
            for batch in range(4):
                X_tmp = X_test[l * batch:l * (batch + 1)]
                y_pred.append(clf.predict(X_tmp))
            y_pred = np.concatenate(y_pred)
            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def tune_num_estimators(metric: str,
                        label: np.ndarray,
                        params: dict,
                        strat_folds: StratifiedKFold,
                        train) -> Tuple[int, float]:
    """
    Uses xgboost's cross-validation method to tune the number of estimators
    and returns that along with the best CV score achieved.

    :param metric: Evaluation metric that is monitored during cross-validation - e.g. 'logloss' or 'rmse'.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param params: A dictionary of XGB parameters.
    :param strat_folds: A StratifiedKFold object to cross validate the parameters.
    :param train: An array-like containing the training input samples.
    :return: A tuple containing the tuned number of estimators along with the best CV score achieved.
    """
    eval_hist = xgb.cv(
        dtrain=xgb.DMatrix(train, label=label),
        early_stopping_rounds=50,
        folds=strat_folds,
        metrics=metric,
        num_boost_round=10000,
        params=params,
        verbose_eval=True
    )
    num_trees = eval_hist.shape[0]
    best_score = eval_hist.values[num_trees - 1, 0]
    return num_trees, best_score
def split_kfold(self, features, labels=None, n_folds=10):
    skf = StratifiedKFold(n_folds)
    for train_index, test_index in skf.split(features, labels):
        yield train_index, test_index
    # return cross_validation.StratifiedKFold(labels, n_folds)
def __init__(self, models, meta_model,
             cv=model_selection.StratifiedKFold(n_splits=3),
             use_base_features=True, use_proba=True):
    super().__init__(
        models=models,
        meta_model=meta_model,
        cv=cv,
        use_base_features=use_base_features,
        use_proba=use_proba,
    )
def predict_kfold(cls, X, y, n_folds=10, seed=0, textModel_params={},
                  kfolds=None, pool=None, use_tqdm=True):
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(x, **kwargs):
            return x

    le = preprocessing.LabelEncoder().fit(y)
    y = np.array(le.transform(y))
    hy = np.zeros(len(y), dtype=np.int)
    if kfolds is None:
        kfolds = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                 random_state=seed).split(X, y)
    args = [(X, y, tr, ts, textModel_params) for tr, ts in kfolds]
    if pool is not None:
        if use_tqdm:
            res = [x for x in tqdm(pool.imap_unordered(cls.train_predict_pool, args),
                                   desc='Params', total=len(args))]
        else:
            res = [x for x in pool.imap_unordered(cls.train_predict_pool, args)]
    else:
        if use_tqdm:
            args = tqdm(args)
        res = [cls.train_predict_pool(x) for x in args]
    for ts, _hy in res:
        hy[ts] = _hy
    return le.inverse_transform(hy)
def __init__(self, X, y, score, n_folds, cls, seed=0, pool=None):
    self.n_folds = n_folds
    self.score = score
    self.X = X
    self.le = le = preprocessing.LabelEncoder().fit(y)
    self.y = np.array(le.transform(y))
    self.cls = cls
    self.pool = pool
    np.random.seed(seed)
    self.kfolds = [x for x in StratifiedKFold(n_splits=n_folds, shuffle=True,
                                              random_state=seed).split(np.zeros(self.y.shape[0]),
                                                                       self.y)]
def xgb1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 5,
        learning_rate = 0.03,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d' % (n + 1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            dtest = xgb.DMatrix(test2)
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100,
                            verbose_eval=1000)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: ' % (xgb_params['seed'], n + 1), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
def xgb2(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 4,
        learning_rate = 0.03,
        subsample = 0.7,
        #colsample_bytree = 0.8,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d' % (n + 1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100,
                            verbose_eval=1000)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: ' % (xgb_params['seed'], n + 1), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 4,
        learning_rate = 0.03,
        subsample = 0.8,
        #colsample_bytree = 0.8,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d' % (n + 1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100,
                            verbose_eval=1000)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: ' % (xgb_params['seed'], n + 1), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
def xgb1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 3
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 5,
        learning_rate = 0.02,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            dtest = xgb.DMatrix(test2)
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100,
                            verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 3
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth = 4,
        learning_rate = 0.02,
        subsample = 0.8,
        colsample_bytree = 0.8,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        seed = 1,
        silent = 1
    )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100,
                            verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds