def rfr_feature_select():
    """Rank the Boston-housing features by single-feature random-forest R^2.

    Each column of the dataset is cross-validated on its own with a
    ``RandomForestRegressor`` and a ``ShuffleSplit(len(X), 3, .3)`` splitter.
    The ``(rounded mean R^2, feature name)`` pairs are printed in descending
    order. Nothing is returned.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]

    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    for i in range(X.shape[1]):
        # Slice with i:i + 1 (not i) so the estimator still gets a 2-D matrix.
        score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))
    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print(sorted(scores, reverse=True))
def data_split(inputfile):
    """Load ``inputfile`` with ``hkl.load`` and return one train/test split.

    The ``'kmer'`` entry is reshaped to ``(n, 1024, 4)`` and appended to the
    ``'mat'`` entry along axis 1; a singleton axis is then inserted at
    position 1 and the last two axes are swapped. A single seeded
    ``ShuffleSplit`` (``random_state=1``) picks the partition.

    Returns:
        ``[X_train, y_train, X_test, y_test]`` with the inputs cast to
        float32 and the labels (``'y'`` entry) cast to int32.
    """
    archive = hkl.load(inputfile)
    labels = archive['y']

    kmer_block = archive['kmer']
    kmer_block = kmer_block.reshape((kmer_block.shape[0], 1024, 4))

    features = np.concatenate((archive['mat'], kmer_block), axis=1)
    features = features[:, np.newaxis].transpose((0, 1, 3, 2))

    # n_iter=1, so the splitter yields exactly one (train, test) pair.
    splitter = ShuffleSplit(len(labels), n_iter=1, random_state=1)
    train_idx, test_idx = next(iter(splitter))

    return [
        features[train_idx, :].astype('float32'),
        labels[train_idx].astype('int32'),
        features[test_idx, :].astype('float32'),
        labels[test_idx].astype('int32'),
    ]

#define the network architecture
def train(self):
    """Fit the classifier on every shuffle-split fold and test each fold.

    The number of folds, the held-out size and the seed come from
    ``self.args`` (``fold``, ``test_size``, ``random_state``). For every
    fold the train/validation slices are stored on ``self``, the classifier
    is refitted, ``self.test`` is called on the validation images, and
    ``self.fold`` (1-based) is advanced.
    """
    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print("Starting Training")
    rs = ShuffleSplit(
        self.length,
        n_iter=self.args.fold,
        test_size=self.args.test_size,
        random_state=self.args.random_state,
    )
    self.fold = 1
    for train_index, test_index in rs:
        self.train_images = self.images[train_index, ...]
        self.train_labels = self.labels[train_index, ...]
        self.valid_images = self.images[test_index, ...]
        self.valid_labels = self.labels[test_index, ...]
        self.svm_classifier = self.classifier.fit(self.train_images,
                                                  self.train_labels)
        self.test(self.valid_images)
        self.fold += 1
def __grid_search_model(clf_factory, X, Y):
    """Grid-search vectorizer/classifier settings and return the best model.

    Args:
        clf_factory: zero-argument callable returning the estimator to tune;
            its parameters must be addressable as ``vect__*`` and ``clf__*``.
        X: training samples.
        Y: training targets.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV`` (also printed).
    """
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(
        vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
        vect__min_df=[1, 2],
        vect__smooth_idf=[False, True],
        vect__use_idf=[False, True],
        vect__sublinear_tf=[False, True],
        vect__binary=[False, True],
        clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
    )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print(clf)

    return clf
def grid_search_model(clf_factory, X, Y):
    """Grid-search vectorizer/classifier settings and return the best model.

    The grid covers n-gram range, ``min_df``, stop words, IDF options,
    sublinear TF, binary counts and the classifier's ``alpha``.

    Args:
        clf_factory: zero-argument callable returning the estimator to tune;
            its parameters must be addressable as ``vect__*`` and ``clf__*``.
        X: training samples.
        Y: training targets.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV`` (also printed).
    """
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(
        vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
        vect__min_df=[1, 2],
        vect__stop_words=[None, "english"],
        vect__smooth_idf=[False, True],
        vect__use_idf=[False, True],
        vect__sublinear_tf=[False, True],
        vect__binary=[False, True],
        clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
    )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print(clf)

    return clf
def evaluate(X, args):
    """Score a ``GaussianHMM`` over repeated shuffle splits of ``X``.

    For each of ``args.n_iterations`` splits (held-out size
    ``args.test_size``) the sequences are preprocessed, a fresh model is
    fitted on the training part, and the log-likelihood of every training
    and test sequence is collected.

    Returns:
        ``(train_mean, train_std, test_mean, test_std)`` as Python floats,
        computed over the scores pooled across all splits.
    """
    def _mean_and_std(values):
        as_array = np.array(values)
        return float(np.mean(as_array)), float(np.std(as_array))

    splits = ShuffleSplit(len(X), n_iter=args.n_iterations,
                          test_size=args.test_size)
    train_scores, test_scores = [], []
    for train_index, test_index in splits:
        train_seqs = [X[i] for i in train_index]
        test_seqs = [X[i] for i in test_index]
        train_seqs, test_seqs = preprocess_datasets(train_seqs, test_seqs,
                                                    args)

        hmm = GaussianHMM(n_states=args.n_states,
                          n_training_iterations=args.n_training_iterations,
                          topology=args.topology)
        hmm.fit(train_seqs)

        for seq in train_seqs:
            train_scores.append(hmm.loglikelihood(seq))
        for seq in test_seqs:
            test_scores.append(hmm.loglikelihood(seq))

    train_mean, train_std = _mean_and_std(train_scores)
    test_mean, test_std = _mean_and_std(test_scores)
    return train_mean, train_std, test_mean, test_std
def optimize_learner_dad(learner, X, U, iters, train_size = 0.5):
    """Run DaD training on a train/test split of the trajectories.

    Trajectories are indexed along axis 2 of ``X`` and ``U``.

    Args:
        learner: learner object handed to ``DaDControl.learn``.
        X: state array, split along axis 2.
        U: control array, split along axis 2 with the same indices as ``X``.
        iters: number of DaD iterations.
        train_size: fraction of trajectories used for training, in (0, 1].
            With exactly 1.0 the full data is used for both train and test.

    Returns:
        The trained ``DaDControl`` instance.

    Raises:
        ValueError: if ``train_size`` is not in (0, 1].
    """
    num_traj = X.shape[2]
    # Validate up front: the original only rejected values above 1, so zero
    # or negative sizes slipped through to the splitter.
    if not 0.0 < train_size <= 1.0:
        raise ValueError('Train size must be in (0,1]')

    if train_size < 1.0:
        from sklearn import cross_validation
        rs = cross_validation.ShuffleSplit(num_traj, n_iter=1,
                                           train_size=train_size,
                                           random_state=0,
                                           test_size=1. - train_size)
        # n_iter=1, so there is exactly one (train, test) pair to take.
        train_index, test_index = next(iter(rs))
        Xtrain = X[:, :, train_index]
        Xtest = X[:, :, test_index]
        Utrain = U[:, :, train_index]
        Utest = U[:, :, test_index]
    else:
        Xtrain = X
        Xtest = X
        Utrain = U
        Utest = U

    dad = DaDControl()
    dad.learn(Xtrain, Utrain, learner, iters, Xtest, Utest, verbose=False)
    print(' DaD (iters:{:d}). Initial Err: {:.4g}, Best: {:.4g}'.format(
        iters, dad.initial_test_err, dad.min_test_error))
    return dad
def test_cross_val_generator_with_indices():
    """Every CV iterator must yield index arrays, never boolean masks."""
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # Check both halves of the split; the second assertion used to
            # re-check `train`, leaving `test` unverified.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # The yielded values must be usable as fancy indices.
            X[train], X[test]
            y[train], y[test]
def test_cross_val_generator_with_default_indices():
    """With default arguments, CV iterators yield indices, not bool masks."""
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # Check both halves of the split; the second assertion used to
            # re-check `train`, leaving `test` unverified.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # The yielded values must be usable as fancy indices.
            X[train], X[test]
            y[train], y[test]
def fit(self, X, y, test_size=0.3):
    """Tune hyper-parameters by grid search and keep the best estimator.

    A single seeded shuffle split (``random_state=self.seed_``) is used as
    the cross-validation scheme for ``GridSearchCV`` over
    ``self.clf_hyparams_``. The winning estimator is stored in
    ``self.clf_`` and the model is saved to ``self.filename_``; when
    ``self.epoch_no_`` is a multiple of 10 an additional ``_iter_<n>.h5``
    snapshot is written first. ``self.epoch_no_`` is then incremented.

    Args:
        X: training features; must expose ``nbytes`` (used for the log line).
        y: training targets.
        test_size: held-out size for the validation split (default 0.3).
    """
    # Honour the caller's test_size; it used to be hard-coded to 0.3 here,
    # which silently ignored the argument.
    cv = ShuffleSplit(len(X), n_iter=1, test_size=test_size,
                      random_state=self.seed_)
    clf_cv = GridSearchCV(self.clf_base_, self.clf_hyparams_, cv=cv,
                          n_jobs=-1, verbose=4)

    print('====> Training Classifier (with grid search hyperparam tuning) .. ')
    print('====> BATCH Training (in-memory): {:4.3f} MB'.format(X.nbytes / 1024.0 / 1024.0) )
    clf_cv.fit(X, y)
    print('BEST: {}, {}'.format(clf_cv.best_score_, clf_cv.best_params_))

    # Setting clf to best estimator
    self.clf_ = clf_cv.best_estimator_

    # Periodic snapshot under a per-iteration file name, then the regular
    # save that overwrites the main model file.
    if self.epoch_no_ % 10 == 0:
        self.save(self.filename_.replace('.h5', '_iter_{}.h5'.format(self.epoch_no_)))
    self.save(self.filename_)
    self.epoch_no_ += 1
def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    """Split ``X`` and ``y`` into one train/test partition.

    Args:
        X: indexable samples (indexed with the split's index arrays).
        y: indexable targets; also drives stratification.
        test_size: held-out size. On the stratified path it is only honoured
            approximately: the first fold of a ``StratifiedKFold`` with
            ``round(1 / test_size)`` folds is used.
        random_state: seed passed to the splitter.
        stratify: use ``StratifiedKFold`` when true, ``ShuffleSplit``
            otherwise.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    if stratify:
        n_folds = int(round(1 / test_size))
        sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        sss = ShuffleSplit(len(y), test_size=test_size,
                           random_state=random_state)
    # next(iter(...)) works on Python 2.6+ and 3; the `.next()` method
    # exists only on Python 2 iterators.
    train_idx, test_idx = next(iter(sss))
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
def __call__(self, X, y, net):
    """Split ``X``/``y`` into training and validation parts.

    With ``self.eval_size`` set to ``None`` all data is used for training
    and the validation parts are empty slices. Otherwise a ``ShuffleSplit``
    is used for regression nets or when stratification is disabled, and the
    first fold of a ``StratifiedKFold`` is used in the remaining case.

    Returns:
        ``(X_train, X_valid, y_train, y_valid)``.
    """
    if self.eval_size is None:
        # Slicing past the end yields empty containers of the same type.
        return X, X[len(X):], y, y[len(y):]

    if net.regression or not self.stratify:
        splitter = ShuffleSplit(
            y.shape[0],
            test_size=1 - self.eval_size,
            random_state=self.random_state,
        )
        # The splitter holds out (1 - eval_size); its "train" side is
        # therefore the validation set, hence the swapped unpacking.
        valid_indices, train_indices = next(iter(splitter))
    else:
        fold_count = int(round(1 / self.eval_size))
        splitter = StratifiedKFold(y, n_folds=fold_count,
                                   random_state=self.random_state)
        train_indices, valid_indices = next(iter(splitter))

    return (X[train_indices], X[valid_indices],
            y[train_indices], y[valid_indices])
def data_split(inputfile,reads_count):
    """Load features and read counts and return one train/test split.

    Both arguments are loaded with ``hkl.load``. The regression target is
    ``log(mean(reads_count, axis=1) + 1e-3)``. The ``'kmer'`` entry is
    reshaped to ``(n, 1024, 4)`` and appended to the ``'mat'`` entry along
    axis 1; a singleton axis is then inserted at position 1 and the last two
    axes are swapped. A single seeded ``ShuffleSplit`` (``random_state=1``)
    picks the partition.

    Args:
        inputfile: file providing the ``'mat'`` and ``'kmer'`` entries.
        reads_count: file providing the per-sample read counts.

    Returns:
        ``[X_train, y_train, X_test, y_test]``, all cast to float32.
    """
    data = hkl.load(inputfile)
    reads_count = hkl.load(reads_count)
    X = data['mat']
    X_kspec = data['kmer']
    reads_count = np.array(reads_count)
    y = np.mean(reads_count, axis=1)
    # The small offset keeps the log finite when a mean count is zero.
    y = np.log(y + 1e-3)

    rs = ShuffleSplit(len(y), n_iter=1, random_state=1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0], 1024, 4))
    X = np.concatenate((X, X_kspec), axis=1)
    X = X[:, np.newaxis]
    X = X.transpose((0, 1, 3, 2))

    # n_iter=1, so the splitter yields exactly one (train, test) pair.
    train_idx, test_idx = next(iter(rs))
    X_train = X[train_idx, :].astype('float32')
    y_train = y[train_idx].astype('float32')
    X_test = X[test_idx, :].astype('float32')
    y_test = y[test_idx].astype('float32')

    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print('Data prepration done!')
    return [X_train, y_train, X_test, y_test]

#define the network architecture
def cv(model, X, y, n_iter=5, test_size=0.3):
    """Return per-split accuracy of ``model`` under shuffle-split CV.

    Args:
        model: estimator handed to ``cross_val_score``.
        X: samples.
        y: targets.
        n_iter: number of shuffle splits.
        test_size: held-out size for every split.

    Returns:
        The scores produced by ``cross_val_score`` (``scoring='accuracy'``,
        ``n_jobs=-1``).
    """
    splitter = cross_validation.ShuffleSplit(len(X), n_iter=n_iter,
                                             test_size=test_size)
    accuracies = cross_validation.cross_val_score(
        model, X, y, cv=splitter, scoring='accuracy', n_jobs=-1)
    return accuracies
def load_images(image_h5_file, n_images=-1, shuffle_seed=1):
    """Load images and auxiliary data from h5 file.

    Args:
        image_h5_file: location of h5 file containing images.
        n_images: number of images to load; any negative value loads all.
            A request larger than the file is reported and clamped.
        shuffle_seed: ``random_state`` of the shuffle split that picks the
            subset when fewer than all images are requested.

    Returns:
        images: the selected rows of the ``'images'`` dataset.
        auxvars: the rows of the ``'auxvars'`` dataset at the same indices.

    TODO: add support for multiple classes.
    """
    with h5py.File(image_h5_file, 'r') as h5file:
        images = h5file['images']
        auxvars = h5file['auxvars']
        if n_images < 0:
            n_images = len(images)
        elif n_images > len(images):
            print("Cannot load {0} images. Only {1} images in {2}".format(
                n_images, len(images), image_h5_file))
            n_images = len(images)
        if n_images < len(images):
            rs = cross_validation.ShuffleSplit(
                len(images), n_iter=1, test_size=n_images,
                random_state=shuffle_seed)
            # The "test" side of the single split has exactly n_images
            # entries; those are the rows to keep.
            _, keep = next(iter(rs))
            images = np.take(images, keep, axis=0)
            auxvars = np.take(auxvars, keep, axis=0)
        else:
            # Materialise the datasets before the file is closed.
            images = h5file['images'][:]
            auxvars = h5file['auxvars'][:]
    return images, auxvars
def _get_split(X, y):
    """Return one unseeded shuffle split of ``X`` and ``y``.

    Returns:
        ``(X_train, X_validate, y_train, y_validate)``.
    """
    splitter = ShuffleSplit(y.shape[0], n_iter=1)
    # n_iter=1, so the first pair is also the only one.
    train_idx, validate_idx = next(iter(splitter))
    return X[train_idx], X[validate_idx], y[train_idx], y[validate_idx]
def test_shuffle_split():
    """Equivalent ``test_size`` spellings must give identical splits.

    ``0.2`` of 10 samples, the int ``2``, ``np.int32(2)`` and ``2`` in each
    of ``six.integer_types`` are all expected to produce the same
    train/test indices for a fixed ``random_state``.
    """
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
        # Compare inside the loop so every integer type is exercised, not
        # only the last one assigned to ss4.
        for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
            assert_array_equal(t1[0], t2[0])
            assert_array_equal(t2[0], t3[0])
            assert_array_equal(t3[0], t4[0])
            assert_array_equal(t1[1], t2[1])
            assert_array_equal(t2[1], t3[1])
            assert_array_equal(t3[1], t4[1])
def test_shufflesplit_errors():
    """Invalid size arguments must make ``ShuffleSplit(10, ...)`` raise."""
    invalid_sizes = [
        dict(test_size=2.0),
        dict(test_size=1.0),
        dict(test_size=0.1, train_size=0.95),
        dict(test_size=11),
        dict(test_size=10),
        dict(test_size=8, train_size=3),
        dict(train_size=1j),
        dict(test_size=None, train_size=None),
    ]
    for kwargs in invalid_sizes:
        assert_raises(ValueError, cval.ShuffleSplit, 10, **kwargs)
def test_shufflesplit_reproducible():
    """A seeded ShuffleSplit yields the same train sets on every pass."""
    splitter = cval.ShuffleSplit(10, random_state=21)
    first_pass = [train for train, _ in splitter]
    second_pass = [train for train, _ in splitter]
    assert_array_equal(first_pass, second_pass)
def __grid_search_model(self, clf_factory, documents, labels, pos_label):
    """Grid-search the pipeline with class-balanced sample weights.

    Positive samples (``labels.values == pos_label``) are weighted by
    ``n_neg / n_pos``; all other samples keep weight 1. Candidates are
    scored with F1 for ``pos_label`` over 5 shuffle splits that each hold
    out one fifth of the data.

    Args:
        clf_factory: estimator passed to ``GridSearchCV`` as is (it is not
            called here); its parameters must be addressable as ``vect__*``
            and ``clf__*``.
        documents: samples; ``documents.size`` is used as the sample count.
        labels: targets exposing ``.values`` and boolean-mask indexing.
        pos_label: label value treated as the positive class.

    Returns:
        The best estimator found by the grid search.
    """
    boolndarr = labels.values == pos_label
    n = documents.size
    n_pos = labels[boolndarr].size
    n_neg = n - n_pos

    param_grid = {
        'vect__binary': [False, True],
        'vect__min_df': [1, 2],
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__smooth_idf': [False, True],
        'vect__stop_words': [None, 'english'],
        'vect__sublinear_tf': [False, True],
        'vect__use_idf': [False, True],
        'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1],
    }

    k = 5
    # Force true division: under Python 2 without `division` imported,
    # `1 / k` is 0, which would request an empty test set.
    cv = ShuffleSplit(n, n_iter=k, test_size=1.0 / k, random_state=0)

    # Same reason: an integer ratio would truncate the class weight.
    pos_weight = float(n_neg) / n_pos
    sample_weight = np.ones(n)
    sample_weight[boolndarr] *= pos_weight
    fit_params = {'clf__sample_weight': sample_weight}

    f1_scorer = make_scorer(f1_score, pos_label=pos_label)

    grid_search = GridSearchCV(
        clf_factory,
        param_grid,
        cv=cv,
        fit_params=fit_params,
        n_jobs=-1,
        scoring=f1_scorer,
    )
    grid_search.fit(documents, labels)

    best_estimator = grid_search.best_estimator_
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_

    print("Best F1 score: {0:04.3f}".format(best_score))
    print("Parameters: {0}".format(best_params))

    return best_estimator
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Cross-validate ``clf`` and report accuracy and PR-AUC statistics.

    ``clf`` is refitted on each of 10 seeded shuffle splits (30% held out).
    A tab-separated summary (mean/std accuracy, mean/std PR-AUC) is printed.
    With ``plot`` set, the precision/recall curve of the median-PR-AUC fold
    is plotted and false positives are logged.

    Args:
        clf: classifier with ``fit``, ``score`` and ``predict_proba``.
        X: samples, indexable with index arrays.
        Y: targets, indexable with index arrays.
        name: label used for the plot and the false-positive log.
        plot: whether to plot and log for the median fold.

    Returns:
        ``(mean train error, mean test error)`` over the folds.
    """
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        # NOTE(review): the same object is appended on every fold, so all
        # entries alias the classifier as fitted on the last fold.
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        # Floor division: a float index (Python 3 `/`) would raise.
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        # X_test / y_test are those of the last fold.
        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Cross-validate ``clf`` and report accuracy and PR-AUC statistics.

    ``clf`` is refitted on each of 10 seeded shuffle splits (30% held out).
    A tab-separated summary (mean/std accuracy, mean/std PR-AUC) is printed.
    With ``plot`` set, the precision/recall curve of the median-PR-AUC fold
    is plotted.

    Args:
        clf: classifier with ``fit``, ``score`` and ``predict_proba``.
        X: samples, indexable with index arrays.
        Y: targets, indexable with index arrays.
        name: label used for the plot.
        plot: whether to plot the median fold's curve.

    Returns:
        ``(mean train error, mean test error)`` over the folds.
    """
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        # Floor division: a float index (Python 3 `/`) would raise.
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    # Parenthesised so the statement is valid on both Python 2 and Python 3.
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def analyze(clf,labels=None):
    """Build a callable that trains or applies an operation chain.

    ``clf`` is a sequence: every element but the last is a transformer, the
    last element is the final estimator.

    Args:
        clf: sequence of operations as described above.
        labels: per-row targets. When given and non-empty the returned
            callable runs in learning mode; otherwise in classification
            mode.

    Returns:
        ``_do(matrix, test_ratio=0.0)``. In learning mode it fits the chain
        on a shuffle-split training part and returns
        ``(Y_valid, predictions)`` for the held-out part. In classification
        mode it transforms ``matrix`` through the already-fitted chain and
        returns an iterator over the predictions.
    """
    def _do(matrix, test_ratio=0.0):
        # Explicit check instead of `if labels:` — truth-testing a NumPy
        # array with more than one element raises ValueError.
        if labels is not None and len(labels) > 0:
            # Learning mode

            # Split train & test folds; the last generated split is used.
            shuffle = ShuffleSplit(len(matrix), test_size=test_ratio)
            trainlist, testlist = list(shuffle)[-1]

            X_train = [matrix[i] for i in trainlist]
            Y_train = [labels[i] for i in trainlist]
            X_valid = [matrix[i] for i in testlist]
            Y_valid = [labels[i] for i in testlist]

            # Display what the underlying classifier is
            print(colored(clf[-1],'yellow'))

            # Display the dimension of the training elements
            print(colored('Trainset:','cyan'))
            print(colored('X: {0}'.format(np.shape(X_train)),'yellow'))
            print(colored('y: {0}'.format(np.shape(Y_train)),'yellow'))

            # Process trainset
            for opr in clf[:-1]:
                print(colored(opr,'yellow'))
                X_train = opr.fit_transform(X_train,Y_train)
            # NOTE: The last operation of the CLF is always a clustering algo
            clf[-1].fit(X_train,Y_train)

            # Display the dimension of the validation elements
            print(colored('Validation set:','cyan'))
            print(colored('X: {0}'.format(np.shape(X_valid)),'yellow'))
            print(colored('y: {0}'.format(np.shape(Y_valid)),'yellow'))

            # Process validation set
            for opr in clf[:-1]:
                print(colored(opr,'yellow'))
                X_valid = opr.transform(X_valid)

            # Return tuple of [actual], [prediction]
            # on the validation set
            return (Y_valid, clf[-1].predict(X_valid))

        # Classification mode
        X = matrix
        # Feature transformations
        for opr in clf[:-1]:
            X = opr.transform(X)
        # NOTE: Predict the clusters with the last operation
        y = clf[-1].predict(X)
        return iter(y)

    return _do