Python sklearn.cross_validation 模块，ShuffleSplit() 实例源码

我们从Python开源项目中，提取了以下24个代码示例，用于说明如何使用sklearn.cross_validation.ShuffleSplit()。

项目：python_utils 作者：Jayhello | 项目源码 | 文件源码

def rfr_feature_select():
    """Rank the Boston housing features by single-feature random-forest R^2.

    Every feature column is scored on its own with a small random forest
    using shuffle-split cross-validation (3 iterations, 30% held out).
    The (mean score rounded to 3 digits, feature name) pairs are printed
    in descending order.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    dataset = load_boston()
    X = dataset["data"]
    Y = dataset["target"]
    feature_names = dataset["feature_names"]

    forest = RandomForestRegressor(n_estimators=20, max_depth=4)
    ranking = []
    for col in range(X.shape[1]):
        # A fresh splitter per feature: n samples, 3 iterations, 30% test.
        splitter = ShuffleSplit(len(X), 3, .3)
        # Slice with col:col + 1 so the estimator still gets a 2-D array.
        fold_scores = cross_val_score(forest, X[:, col:col + 1], Y,
                                      scoring="r2", cv=splitter)
        ranking.append((round(np.mean(fold_scores), 3), feature_names[col]))

    ranking.sort(reverse=True)
    print(ranking)

项目：Deopen 作者：kimmo1019 | 项目源码 | 文件源码

def data_split(inputfile):
    """Load a hickle dataset and return one shuffled train/test split.

    The loaded object is read through the keys 'mat', 'kmer' and 'y'.
    The k-mer block is reshaped to (samples, 1024, 4) and concatenated
    onto 'mat' along axis 1; the result then gets a new axis inserted at
    position 1 and its last two axes swapped.

    Returns:
        [X_train, y_train, X_test, y_test], features cast to float32 and
        labels cast to int32.
    """
    dataset = hkl.load(inputfile)
    features = dataset['mat']
    kmer = dataset['kmer']
    y = dataset['y']
    splitter = ShuffleSplit(len(y), n_iter=1, random_state=1)

    kmer = kmer.reshape((kmer.shape[0], 1024, 4))
    features = np.concatenate((features, kmer), axis=1)
    features = features[:, np.newaxis]
    features = features.transpose((0, 1, 3, 2))

    # n_iter=1, so the splitter yields exactly one (train, test) pair.
    train_idx, test_idx = next(iter(splitter))
    X_train = features[train_idx, :].astype('float32')
    y_train = y[train_idx].astype('int32')
    X_test = features[test_idx, :].astype('float32')
    y_test = y[test_idx].astype('int32')
    return [X_train, y_train, X_test, y_test]

# Define the network architecture

项目：Face_recognition_SVM 作者：AshStuff | 项目源码 | 文件源码

def train(self):
        """Train the SVM on every shuffle-split fold and test it on the hold-out part.

        For each fold the training and validation images/labels are stored
        on the instance, the classifier is fitted on the training part and
        ``self.test`` is called with the validation images. ``self.fold``
        starts at 1 and is incremented after every fold.
        """
        print("Starting Training")

        splitter = ShuffleSplit(
            self.length,
            n_iter=self.args.fold,
            test_size=self.args.test_size,
            random_state=self.args.random_state,
        )
        self.fold = 1
        for train_index, test_index in splitter:
            self.train_images = self.images[train_index, ...]
            self.train_labels = self.labels[train_index, ...]
            self.valid_images = self.images[test_index, ...]
            self.valid_labels = self.labels[test_index, ...]

            self.svm_classifier = self.classifier.fit(self.train_images,
                                                      self.train_labels)
            self.test(self.valid_images)
            self.fold += 1

项目：Building-Machine-Learning-Systems-With-Python-Second-Edition 作者：PacktPublishing | 项目源码 | 文件源码

def __grid_search_model(clf_factory, X, Y):
    """Grid-search vectorizer and classifier settings; return the best model.

    The search runs over 10 shuffle splits (30% test, fixed seed 0) of a
    fresh estimator obtained from ``clf_factory()`` and is scored with
    ``f1_score``. The best estimator is printed and returned.
    """
    splitter = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    on_off = [False, True]
    search_space = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__min_df': [1, 2],
        'vect__smooth_idf': list(on_off),
        'vect__use_idf': list(on_off),
        'vect__sublinear_tf': list(on_off),
        'vect__binary': list(on_off),
        'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1],
    }

    searcher = GridSearchCV(clf_factory(),
                            param_grid=search_space,
                            cv=splitter,
                            score_func=f1_score,
                            verbose=10)
    searcher.fit(X, Y)

    best = searcher.best_estimator_
    print(best)
    return best

项目：Building-Machine-Learning-Systems-With-Python-Second-Edition 作者：PacktPublishing | 项目源码 | 文件源码

def grid_search_model(clf_factory, X, Y):
    """Grid-search vectorizer and classifier settings; return the best model.

    The search runs over 10 shuffle splits (30% test, fixed seed 0) of a
    fresh estimator obtained from ``clf_factory()`` and is scored with
    ``f1_score``. The grid also toggles English stop-word removal. The
    best estimator is printed and returned.
    """
    splitter = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    on_off = [False, True]
    search_space = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__min_df': [1, 2],
        'vect__stop_words': [None, "english"],
        'vect__smooth_idf': list(on_off),
        'vect__use_idf': list(on_off),
        'vect__sublinear_tf': list(on_off),
        'vect__binary': list(on_off),
        'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1],
    }

    searcher = GridSearchCV(clf_factory(),
                            param_grid=search_space,
                            cv=splitter,
                            score_func=f1_score,
                            verbose=10)
    searcher.fit(X, Y)

    best = searcher.best_estimator_
    print(best)
    return best

项目：motion-classification 作者：matthiasplappert | 项目源码 | 文件源码

def evaluate(X, args):
    """Fit a GaussianHMM on repeated shuffle splits and summarise its log-likelihoods.

    For every split the train/test items are preprocessed together via
    ``preprocess_datasets``, a new model is fitted on the training items,
    and the log-likelihood of every training and test item is collected
    across all splits.

    Returns:
        (train_mean, train_std, test_mean, test_std) as Python floats.
    """
    splitter = ShuffleSplit(len(X), n_iter=args.n_iterations,
                            test_size=args.test_size)
    train_ll = []
    test_ll = []
    for train_index, test_index in splitter:
        train_items = [X[i] for i in train_index]
        test_items = [X[i] for i in test_index]
        train_items, test_items = preprocess_datasets(train_items, test_items, args)

        hmm = GaussianHMM(n_states=args.n_states,
                          n_training_iterations=args.n_training_iterations,
                          topology=args.topology)
        hmm.fit(train_items)

        for item in train_items:
            train_ll.append(hmm.loglikelihood(item))
        for item in test_items:
            test_ll.append(hmm.loglikelihood(item))

    train_arr = np.array(train_ll)
    test_arr = np.array(test_ll)
    return (float(np.mean(train_arr)), float(np.std(train_arr)),
            float(np.mean(test_arr)), float(np.std(test_arr)))

项目：DaD 作者：arunvenk | 项目源码 | 文件源码

def optimize_learner_dad(learner, X, U, iters, train_size = 0.5):
    """Train a DaDControl model on a train/test split of the trajectories.

    X and U are split along their third axis. With ``train_size`` below 1
    a single seeded shuffle split assigns ``train_size`` of the
    trajectories to training and the remainder to testing; with
    ``train_size`` equal to 1 the full data serves as both sets.

    Raises:
        Exception: if ``train_size`` is neither below nor equal to 1.0.

    Returns:
        The trained ``DaDControl`` instance.
    """
    num_traj = X.shape[2]
    if train_size < 1.0:
        from sklearn import cross_validation
        splitter = cross_validation.ShuffleSplit(
            num_traj, n_iter=1, train_size=train_size,
            random_state=0, test_size=1.-train_size)
        # n_iter=1: take the only (train, test) pair the splitter yields.
        train_index, test_index = next(iter(splitter))
        Xtrain, Xtest = X[:, :, train_index], X[:, :, test_index]
        Utrain, Utest = U[:, :, train_index], U[:, :, test_index]
    elif train_size == 1.0:
        Xtrain = Xtest = X
        Utrain = Utest = U
    else:
        raise Exception('Train size must be in (0,1]')

    dad = DaDControl()
    dad.learn(Xtrain, Utrain, learner, iters, Xtest, Utest, verbose=False)
    report = ' DaD (iters:{:d}). Initial Err: {:.4g}, Best: {:.4g}'.format(
        iters, dad.initial_test_err, dad.min_test_error)
    print(report)
    return dad

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_cross_val_generator_with_indices():
    """Every CV generator must yield integer index arrays, not boolean masks.

    Both the train and the test part of each split are checked, and both
    are used to index X and y to make sure they are valid indexers.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask was returned.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # Check the test indices too (previously train was checked twice).
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_cross_val_generator_with_default_indices():
    """With default settings every CV generator must yield integer indices.

    Both the train and the test part of each split are checked, and both
    are used to index X and y to make sure they are valid indexers.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask was returned.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # Check the test indices too (previously train was checked twice).
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]

项目：pybot 作者：spillai | 项目源码 | 文件源码

def fit(self, X, y, test_size=0.3):
        """Tune the base classifier by grid search and keep the best estimator.

        A single seeded shuffle split holding out ``test_size`` of the
        samples is the cross-validation scheme for ``GridSearchCV``. The
        best estimator is stored in ``self.clf_`` and the model is saved to
        ``self.filename_``; whenever ``self.epoch_no_`` is a multiple of 10
        an extra snapshot with an ``_iter_<epoch>`` suffix is saved first.
        ``self.epoch_no_`` is incremented at the end.

        Args:
            X: training samples; must support ``len`` and expose ``nbytes``
                (used for the memory log line).
            y: training targets.
            test_size: hold-out size passed to ``ShuffleSplit`` for the
                grid-search split.
        """
        # Grid search cross-val (best C param). Honour the caller's
        # test_size; it used to be ignored in favour of a hard-coded 0.3
        # (the default keeps that behaviour).
        cv = ShuffleSplit(len(X), n_iter=1, test_size=test_size,
                          random_state=self.seed_)
        clf_cv = GridSearchCV(self.clf_base_, self.clf_hyparams_, cv=cv,
                              n_jobs=-1, verbose=4)

        print('====> Training Classifier (with grid search hyperparam tuning) .. ')
        print('====> BATCH Training (in-memory): {:4.3f} MB'.format(X.nbytes / 1024.0 / 1024.0) )
        clf_cv.fit(X, y)
        print('BEST: {}, {}'.format(clf_cv.best_score_, clf_cv.best_params_))

        # Setting clf to best estimator
        self.clf_ = clf_cv.best_estimator_

        # NOTE: probability calibration of the classifier
        # (CalibratedClassifierCV, method='sigmoid') is currently disabled.

        # Keep a numbered snapshot every 10th epoch, then the regular save.
        if self.epoch_no_ % 10 == 0:
            self.save(self.filename_.replace('.h5', '_iter_{}.h5'.format(self.epoch_no_)))
        self.save(self.filename_)
        self.epoch_no_ += 1

项目：kaggle-right-whale 作者：felixlaumon | 项目源码 | 文件源码

def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    """Split X and y into one train/test partition.

    Args:
        X, y: arrays indexable with integer index arrays.
        test_size: fraction of samples for the test part. With
            ``stratify=True`` it is turned into a fold count
            ``round(1 / test_size)`` and the first stratified fold is used,
            so the effective test fraction is 1 / n_folds.
        random_state: seed passed to the splitter.
        stratify: use ``StratifiedKFold`` on ``y`` when true, otherwise
            ``ShuffleSplit``.

    Returns:
        X_train, X_test, y_train, y_test
    """
    if stratify:
        n_folds = int(round(1 / test_size))
        sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        sss = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)
    # next(iter(...)) works on Python 2.6+ and 3; the iterator's .next()
    # method exists on Python 2 only.
    train_idx, test_idx = next(iter(sss))
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

项目：kaggle-right-whale 作者：felixlaumon | 项目源码 | 文件源码

def __call__(self, X, y, net):
        """Split X and y into training and validation parts.

        With ``self.eval_size`` set to None everything is used for training
        and the validation parts are empty slices. Otherwise a shuffle split
        is used for regression nets or when stratification is off, and the
        first fold of a ``StratifiedKFold`` is used in the remaining case.

        Returns:
            X_train, X_valid, y_train, y_valid
        """
        if self.eval_size is None:
            # No hold-out requested: slicing past the end gives empty arrays.
            return X, X[len(X):], y, y[len(y):]

        if net.regression or not self.stratify:
            # The splitter is built with test_size = 1 - eval_size and its
            # two index arrays are swapped on unpacking: the first part
            # becomes the validation set, the second the training set.
            holdout = 1 - self.eval_size
            splitter = ShuffleSplit(
                y.shape[0], test_size=holdout,
                random_state=self.random_state
            )
            valid_indices, train_indices = next(iter(splitter))
        else:
            fold_count = int(round(1 / self.eval_size))
            splitter = StratifiedKFold(y, n_folds=fold_count,
                                       random_state=self.random_state)
            train_indices, valid_indices = next(iter(splitter))

        X_train, y_train = X[train_indices], y[train_indices]
        X_valid, y_valid = X[valid_indices], y[valid_indices]
        return X_train, X_valid, y_train, y_valid

项目：Deopen 作者：kimmo1019 | 项目源码 | 文件源码

def data_split(inputfile,reads_count):
    """Load features and read counts and return one shuffled train/test split.

    The target is the natural log of the per-row mean read count plus 1e-3.
    Features are built from the 'mat' and 'kmer' entries of the input
    file: the k-mer block is reshaped to (samples, 1024, 4), concatenated
    onto 'mat' along axis 1, then a new axis is inserted at position 1 and
    the last two axes are swapped.

    Args:
        inputfile: path of the hickle file holding 'mat' and 'kmer'.
        reads_count: path of the hickle file holding the read counts.

    Returns:
        [X_train, y_train, X_test, y_test], all cast to float32.
    """
    dataset = hkl.load(inputfile)
    raw_counts = hkl.load(reads_count)
    features = dataset['mat']
    kmer = dataset['kmer']
    counts = np.array(raw_counts)
    # The 1e-3 offset keeps the log finite for rows whose mean count is zero.
    y = np.log(np.mean(counts, axis=1) + 1e-3)
    splitter = ShuffleSplit(len(y), n_iter=1, random_state=1)

    kmer = kmer.reshape((kmer.shape[0], 1024, 4))
    features = np.concatenate((features, kmer), axis=1)
    features = features[:, np.newaxis]
    features = features.transpose((0, 1, 3, 2))

    # n_iter=1, so the splitter yields exactly one (train, test) pair.
    train_idx, test_idx = next(iter(splitter))
    X_train = features[train_idx, :].astype('float32')
    y_train = y[train_idx].astype('float32')
    X_test = features[test_idx, :].astype('float32')
    y_test = y[test_idx].astype('float32')
    print('Data prepration done!')
    return [X_train, y_train, X_test, y_test]


# Define the network architecture

项目：digit-ocr 作者：Nozdi | 项目源码 | 文件源码

def cv(model, X, y, n_iter=5, test_size=0.3):
    """Return the accuracy of ``model`` on each shuffle-split iteration.

    Args:
        model: estimator to evaluate.
        X, y: samples and targets.
        n_iter: number of shuffle-split iterations.
        test_size: test size passed to ``ShuffleSplit``.
    """
    splitter = cross_validation.ShuffleSplit(
        len(X), n_iter=n_iter, test_size=test_size)
    accuracies = cross_validation.cross_val_score(
        model, X, y, cv=splitter, scoring='accuracy', n_jobs=-1)
    return accuracies

项目：deepjets 作者：deepjets | 项目源码 | 文件源码

def load_images(image_h5_file, n_images=-1, shuffle_seed=1):
    """Load images and auxiliary data from h5 file.

    Args:
        image_h5_file: location of h5 file; the datasets 'images' and
            'auxvars' are read from it.
        n_images: number of images to load, a negative value loads all.
            A request for more images than the file holds is reported
            and capped at the available number.
        shuffle_seed: random_state of the ShuffleSplit that selects which
            entries are kept when only a subset is loaded.
    Returns:
        images: array of image arrays.
        auxvars: the entries of the 'auxvars' dataset for the same rows.
    TODO: add support for multiple classes.
    """
    with h5py.File(image_h5_file, 'r') as h5file:
        image_ds = h5file['images']
        aux_ds = h5file['auxvars']
        available = len(image_ds)

        if n_images < 0:
            n_images = available
        elif n_images > available:
            print("Cannot load {0} images. Only {1} images in {2}".format(
                n_images, available, image_h5_file))
            n_images = available

        if n_images < available:
            # Use the test part of a single seeded split as the random
            # subset of rows to keep.
            splitter = cross_validation.ShuffleSplit(
                available, n_iter=1, test_size=n_images,
                random_state=shuffle_seed)
            _, keep = next(iter(splitter))
            images = np.take(image_ds, keep, axis=0)
            auxvars = np.take(aux_ds, keep, axis=0)
        else:
            # Read everything before the file is closed.
            images = h5file['images'][:]
            auxvars = h5file['auxvars'][:]
    return images, auxvars

项目：Quadflor 作者：quadflor | 项目源码 | 文件源码

def _get_split(X, y):
        """Split X and y once with a single-iteration ShuffleSplit.

        Returns:
            X_train, X_validate, y_train, y_validate
        """
        splitter = ShuffleSplit(y.shape[0], n_iter=1)
        # n_iter=1: the first pair is the only pair.
        train_idx, validate_idx = next(iter(splitter))
        return X[train_idx], X[validate_idx], y[train_idx], y[validate_idx]

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_shuffle_split():
    """Equivalent ``test_size`` spellings must give identical splits.

    A fraction (0.2 of 10 samples), a plain int 2, a NumPy int32 and every
    type in ``six.integer_types`` are used as test size with the same
    seed; the train and test index arrays of all splitters must match.
    """
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
        # Compare inside the loop so each integer type is verified; when
        # the comparison ran after the loop only the last type was checked.
        # The seeded splitters can be iterated again for every type.
        for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
            assert_array_equal(t1[0], t2[0])
            assert_array_equal(t2[0], t3[0])
            assert_array_equal(t3[0], t4[0])
            assert_array_equal(t1[1], t2[1])
            assert_array_equal(t2[1], t3[1])
            assert_array_equal(t3[1], t4[1])

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_shufflesplit_errors():
    """Each invalid size specification must make ShuffleSplit(10, ...) raise ValueError."""
    invalid_sizes = (
        dict(test_size=2.0),
        dict(test_size=1.0),
        dict(test_size=0.1, train_size=0.95),
        dict(test_size=11),
        dict(test_size=10),
        dict(test_size=8, train_size=3),
        dict(train_size=1j),
        dict(test_size=None, train_size=None),
    )
    for size_kwargs in invalid_sizes:
        assert_raises(ValueError, cval.ShuffleSplit, 10, **size_kwargs)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_shufflesplit_reproducible():
    """A seeded ShuffleSplit must give the same train sets on every pass."""
    # Iterate the same splitter twice; with random_state given, both
    # passes must produce the same sequence of train indices.
    splitter = cval.ShuffleSplit(10, random_state=21)
    first_pass = [train for train, _ in splitter]
    second_pass = [train for train, _ in splitter]
    assert_array_equal(first_pass, second_pass)

项目：TextCategorization 作者：Y-oHr-N | 项目源码 | 文件源码

def __grid_search_model(self, clf_factory, documents, labels, pos_label):
        """Grid-search the pipeline with class-balancing sample weights.

        Samples labelled ``pos_label`` are weighted by n_neg / n_pos, all
        others by 1. The search uses k=5 shuffle splits with a 1/k test
        fraction (seed 0) and an F1 scorer bound to ``pos_label``. The best
        score and parameters are printed.

        Args:
            clf_factory: estimator handed to ``GridSearchCV`` as is.
            documents: training documents; must expose ``size``.
            labels: training labels; must expose ``values`` and support
                boolean-mask indexing.
            pos_label: label value treated as the positive class.

        Returns:
            The best estimator found by the grid search.

        Raises:
            ZeroDivisionError: if no sample carries ``pos_label``.
        """
        is_positive = labels.values == pos_label
        n = documents.size
        n_pos = labels[is_positive].size
        n_neg = n - n_pos

        param_grid = {
            'vect__binary': [False, True],
            'vect__min_df': [1, 2],
            'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
            'vect__smooth_idf': [False, True],
            'vect__stop_words': [None, 'english'],
            'vect__sublinear_tf': [False, True],
            'vect__use_idf': [False, True],
            'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1],
        }

        k = 5
        cv = ShuffleSplit(
            n,
            n_iter=k,
            # Explicit float division: under Python 2, 1 / k would
            # truncate to 0.
            test_size=1.0 / k,
            random_state=0,
        )

        # Same integer-division hazard: keep the class ratio fractional.
        pos_weight = float(n_neg) / n_pos
        sample_weight = np.ones(n)
        sample_weight[is_positive] *= pos_weight
        fit_params = {'clf__sample_weight': sample_weight}

        f1_scorer = make_scorer(f1_score, pos_label=pos_label)

        grid_search = GridSearchCV(
            clf_factory,
            param_grid,
            cv=cv,
            fit_params=fit_params,
            n_jobs=-1,
            scoring=f1_scorer,
        )

        grid_search.fit(documents, labels)
        best_estimator = grid_search.best_estimator_
        best_score = grid_search.best_score_
        best_params = grid_search.best_params_

        print("Best F1 score: {0:04.3f}".format(best_score))
        print("Parameters: {0}".format(best_params))

        return best_estimator

项目：Building-Machine-Learning-Systems-With-Python-Second-Edition 作者：PacktPublishing | 项目源码 | 文件源码

def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Cross-validate ``clf`` on 10 shuffle splits and report its scores.

    For each split (30% test, seed 0) the classifier is refitted on the
    training part and scored on both parts; a precision/recall curve is
    computed on the test part from column 1 of ``predict_proba``. A
    tab-separated summary line is printed: mean and std of the test
    score, then mean and std of the precision/recall AUC.

    Args:
        clf: estimator providing ``fit``, ``score`` and ``predict_proba``.
            It is refitted in place on every split.
        X, Y: samples and labels, indexable with integer index arrays.
        name: label used for the plot and the false-positive log.
        plot: when true, plot the precision/recall curve of the split with
            the median PR AUC and log that split's false positives.

    Returns:
        (mean training error, mean test error), error being 1 - score.
    """
    import copy  # local import: only needed to snapshot per-split models

    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls = [], []

    # Per-split fitted model and test indices, kept only when plotting so
    # the median split can be revisited afterwards.
    fold_clfs = []
    fold_tests = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        if plot:
            # Appending clf itself would store the same object every time
            # (always the last fit); keep an independent copy instead.
            fold_clfs.append(copy.deepcopy(clf))
            fold_tests.append(test)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)

    if plot:
        # Floor division keeps the index an int on Python 3 as well.
        median = np.argsort(pr_scores)[len(pr_scores) // 2]

        # NOTE: `phase` is not defined in this function; it has to exist
        # at module level.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        # Pair the median model with its own test samples rather than
        # with the test part of whichever split happened to run last.
        median_test = fold_tests[median]
        log_false_positives(fold_clfs[median], X[median_test],
                            Y[median_test], name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)

项目：Building-Machine-Learning-Systems-With-Python-Second-Edition 作者：PacktPublishing | 项目源码 | 文件源码

def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Cross-validate ``clf`` on 10 shuffle splits and report its scores.

    For each split (30% test, seed 0) the classifier is refitted on the
    training part and scored on both parts; a precision/recall curve is
    computed on the test part from column 1 of ``predict_proba``. A
    tab-separated summary line is printed: mean and std of the test
    score, then mean and std of the precision/recall AUC.

    Args:
        clf: estimator providing ``fit``, ``score`` and ``predict_proba``.
            It is refitted in place on every split.
        X, Y: samples and labels, indexable with integer index arrays.
        name: label used for the plot and the false-positive log.
        plot: when true, plot the precision/recall curve of the split with
            the median PR AUC and log that split's false positives.

    Returns:
        (mean training error, mean test error), error being 1 - score.
    """
    import copy  # local import: only needed to snapshot per-split models

    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls = [], []

    # Per-split fitted model and test indices, kept only when plotting so
    # the median split can be revisited afterwards.
    fold_clfs = []
    fold_tests = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        if plot:
            # Appending clf itself would store the same object every time
            # (always the last fit); keep an independent copy instead.
            fold_clfs.append(copy.deepcopy(clf))
            fold_tests.append(test)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)

    if plot:
        # Floor division keeps the index an int on Python 3 as well.
        median = np.argsort(pr_scores)[len(pr_scores) // 2]

        # NOTE: `phase` is not defined in this function; it has to exist
        # at module level.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        # Pair the median model with its own test samples rather than
        # with the test part of whichever split happened to run last.
        median_test = fold_tests[median]
        log_false_positives(fold_clfs[median], X[median_test],
                            Y[median_test], name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)

项目：Building-Machine-Learning-Systems-With-Python-Second-Edition 作者：PacktPublishing | 项目源码 | 文件源码

def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Cross-validate ``clf`` on 10 shuffle splits and report its scores.

    For each split (30% test, seed 0) the classifier is refitted on the
    training part and scored on both parts; a precision/recall curve is
    computed on the test part from column 1 of ``predict_proba``. A
    tab-separated summary line is printed: mean and std of the test
    score, then mean and std of the precision/recall AUC.

    Args:
        clf: estimator providing ``fit``, ``score`` and ``predict_proba``.
            It is refitted in place on every split.
        X, Y: samples and labels, indexable with integer index arrays.
        name: label used for the plot.
        plot: when true, plot the precision/recall curve of the split with
            the median PR AUC.

    Returns:
        (mean training error, mean test error), error being 1 - score.
    """
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls = [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)

    if plot:
        # Floor division keeps the index an int on Python 3 as well.
        median = np.argsort(pr_scores)[len(pr_scores) // 2]

        # NOTE: `phase` is not defined in this function; it has to exist
        # at module level.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)

项目：pantip-libr 作者：starcolon | 项目源码 | 文件源码

def analyze(clf,labels=None):
  """Build a runner that either trains and validates ``clf`` or applies it.

  ``clf`` is a sequence: every element but the last is a transformer
  applied in order, the last element is the final estimator.

  The returned function ``_do(matrix, test_ratio=0.0)`` works in two modes:
  with truthy ``labels`` it fits the chain on a shuffled training part of
  ``matrix`` and returns ``(actual labels, predictions)`` for the
  validation part; otherwise it transforms ``matrix`` with the existing
  chain and returns an iterator over the predictions.
  """
  def _do(matrix, test_ratio=0.0):
    if not labels:
      # Classification mode: apply the transformers, then predict with
      # the last element of the chain.
      transformed = matrix
      for step in clf[:-1]:
        transformed = step.transform(transformed)
      return iter(clf[-1].predict(transformed))

    # Learning mode: use the last of the generated shuffle splits.
    splits = list(ShuffleSplit(len(matrix), test_size=test_ratio))
    train_ids, valid_ids = splits[-1]
    X_train = [matrix[i] for i in train_ids]
    Y_train = [labels[i] for i in train_ids]
    X_valid = [matrix[i] for i in valid_ids]
    Y_valid = [labels[i] for i in valid_ids]

    # Show which estimator sits at the end of the chain.
    print(colored(clf[-1],'yellow'))

    # Show the dimensions of the training data.
    print(colored('Trainset:','cyan'))
    print(colored('X: {0}'.format(np.shape(X_train)),'yellow'))
    print(colored('y: {0}'.format(np.shape(Y_train)),'yellow'))

    # Fit every transformer on the training data, then the final estimator.
    for step in clf[:-1]:
      print(colored(step,'yellow'))
      X_train = step.fit_transform(X_train,Y_train)
    clf[-1].fit(X_train,Y_train)

    # Show the dimensions of the validation data.
    print(colored('Validation set:','cyan'))
    print(colored('X: {0}'.format(np.shape(X_valid)),'yellow'))
    print(colored('y: {0}'.format(np.shape(Y_valid)),'yellow'))

    # Push the validation data through the fitted transformers.
    for step in clf[:-1]:
      print(colored(step,'yellow'))
      X_valid = step.transform(X_valid)

    # Tuple of [actual], [prediction] on the validation set.
    return (Y_valid, clf[-1].predict(X_valid))

  return _do