Python sklearn.ensemble module: ExtraTreesClassifier() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.ensemble.ExtraTreesClassifier().
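
Before diving into the project snippets, here is a minimal, self-contained usage sketch on synthetic data (my own illustration, not taken from any of the projects below):

# Minimal fit/predict sketch on synthetic data (illustrative only).
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 10)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))     # mean accuracy on held-out data
print(clf.feature_importances_)      # impurity-based feature importances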

Project: johnson-county-ddj-public    Author: dssg    | Project source | File source
def get_feature_importance(self, clf, model_name):
        clfs = {'RandomForestClassifier':'feature_importances',
                'ExtraTreesClassifier': 'feature_importances',
                'AdaBoostClassifier': 'feature_importances',
                'LogisticRegression': 'coef',
                'svm.SVC': 'coef',
                'GradientBoostingClassifier': 'feature_importances',
                'GaussianNB': None,
                'DecisionTreeClassifier': 'feature_importances',
                'SGDClassifier': 'coef',
                'KNeighborsClassifier': None,
                'linear.SVC': 'coef'}

        if clfs[model_name] == 'feature_importances':
            return list(clf.feature_importances_)
        elif clfs[model_name] == 'coef':
            return clf.coef_.tolist()
        else:
            return None
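
Note that the mapping stores attribute kinds without the trailing underscore that scikit-learn appends to fitted attributes. A short sketch of the two attributes being read (synthetic data, my own illustration):

# The fitted attributes behind the mapping above.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

X = np.random.rand(50, 4)
y = np.random.randint(0, 2, 50)

tree_clf = ExtraTreesClassifier(n_estimators=10).fit(X, y)
print(tree_clf.feature_importances_)   # the 'feature_importances' case

linear_clf = LogisticRegression().fit(X, y)
print(linear_clf.coef_)                # the 'coef' case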
Project: easyML    Author: aarshayj    | Project source | File source
def __init__(
        self,data_block, predictors=[],cv_folds=10,
        scoring_metric='accuracy',additional_display_metrics=[]):

        base_classification.__init__(
            self, alg=ExtraTreesClassifier(), data_block=data_block, 
            predictors=predictors,cv_folds=cv_folds,
            scoring_metric=scoring_metric, 
            additional_display_metrics=additional_display_metrics)

        self.model_output = pd.Series(self.default_parameters)
        self.model_output['Feature_Importance'] = "-"
        self.model_output['OOB_Score'] = "-"

        #Set parameters to default values:
        self.set_parameters(set_default=True)
Project: johnson-county-ddj-public    Author: dssg    | Project source | File source
def define_model(self, model, parameters, n_cores = 0):
        clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
                'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
                'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
                'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
                'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
                'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
                'GaussianNB': GaussianNB(),
                'DecisionTreeClassifier': DecisionTreeClassifier(),
                'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
                'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3), 
                'linear.SVC': svm.LinearSVC() }

        if model not in clfs:
            raise ConfigError("Unsupported model {}".format(model))

        clf = clfs[model]
        clf.set_params(**parameters)
        return clf
Project: AutoML-Challenge    Author: postech-mlg-exbrain    | Project source | File source
def fit(self, X, Y, sample_weight=None):
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.feature_selection import SelectFromModel

        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        preprocessor = ExtraTreesClassifier(
            n_estimators=self.n_estimators, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            random_state=self.random_state, class_weight=self.class_weight
        )
        preprocessor.fit(X, Y, sample_weight=sample_weight)
        self.preprocessor = SelectFromModel(preprocessor, prefit=True)
        return self
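
Once fit() has run, the stored SelectFromModel wrapper reduces a feature matrix with transform. A self-contained sketch of that follow-up step (synthetic data, my own illustration):

# Applying a prefit SelectFromModel, as built in fit() above.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X = np.random.rand(100, 20)
y = np.random.randint(0, 2, 100)

selector = SelectFromModel(ExtraTreesClassifier(n_estimators=50).fit(X, y),
                           prefit=True)
X_reduced = selector.transform(X)    # keeps features above the importance threshold
print(X.shape, '->', X_reduced.shape)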
Project: gcForest    Author: kingfengji    | Project source | File source
def prec_ets(n_trees, X_train, y_train, X_test, y_test, random_state=None):
    """
    ExtraTrees
    """
    from sklearn.ensemble import ExtraTreesClassifier
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = ExtraTreesClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_ets{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred
Project: gcforest    Author: w821881341    | Project source | File source
def prec_ets(n_trees, X_train, y_train, X_test, y_test, random_state=None):
    """
    ExtraTrees
    """
    from sklearn.ensemble import ExtraTreesClassifier
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = ExtraTreesClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_ets{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)


    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)


    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)


    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()

    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += "   ;   "
    return warnstr
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample,normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()

    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += "   ;   "
    return warnstr
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample,normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()

    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += "   ;   "
    print warnstr
    return warnstr
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
Project: bnp    Author: mpearmain    | Project source | File source
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
        clf = ensemble.ExtraTreesClassifier(
                n_estimators = n_est_val,
                max_depth = depth_val,
                min_samples_split = split_val,
                min_samples_leaf = leaf_val,
                max_features = feat_val,
                criterion='entropy',
                n_jobs = jobs_val,
                random_state = random_state_val)
        clf.fit(train_X, train_y)
        pred_train_y = clf.predict_proba(train_X)[:,1]
        pred_test_y = clf.predict_proba(test_X)[:,1]

        if validation:
                train_loss = log_loss(train_y, pred_train_y)
                loss = log_loss(test_y, pred_test_y)
                print "Train, Test loss : ", train_loss, loss
                return pred_test_y, loss
        else:
                return pred_test_y
Project: bnp    Author: mpearmain    | Project source | File source
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
        clf = ensemble.ExtraTreesClassifier(
                n_estimators = n_est_val,
                max_depth = depth_val,
                min_samples_split = split_val,
                min_samples_leaf = leaf_val,
                max_features = feat_val,
                criterion='entropy',
                n_jobs = jobs_val,
                random_state = random_state_val)
        clf.fit(train_X, train_y)
        pred_train_y = clf.predict_proba(train_X)[:,1]
        pred_test_y = clf.predict_proba(test_X)[:,1]

        if validation:
                train_loss = log_loss(train_y, pred_train_y)
                loss = log_loss(test_y, pred_test_y)
                print "Train, Test loss : ", train_loss, loss
                return pred_test_y, loss
        else:
                return pred_test_y
Project: bnp    Author: mpearmain    | Project source | File source
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
        clf = ensemble.ExtraTreesClassifier(
                n_estimators = n_est_val,
                max_depth = depth_val,
                min_samples_split = split_val,
                min_samples_leaf = leaf_val,
                max_features = feat_val,
                criterion='entropy',
                n_jobs = jobs_val,
                random_state = random_state_val)
        clf.fit(train_X, train_y)
        pred_train_y = clf.predict_proba(train_X)[:,1]
        pred_test_y = clf.predict_proba(test_X)[:,1]

        if validation:
                train_loss = log_loss(train_y, pred_train_y)
                loss = log_loss(test_y, pred_test_y)
                print "Train, Test loss : ", train_loss, loss
                return pred_test_y, loss
        else:
                return pred_test_y
Project: bnp    Author: mpearmain    | Project source | File source
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
        clf = ensemble.ExtraTreesClassifier(
                n_estimators = n_est_val,
                max_depth = depth_val,
                min_samples_split = split_val,
                min_samples_leaf = leaf_val,
                max_features = feat_val,
                criterion='entropy',
                n_jobs = jobs_val,
                random_state = random_state_val)
        clf.fit(train_X, train_y)
        pred_train_y = clf.predict_proba(train_X)[:,1]
        pred_test_y = clf.predict_proba(test_X)[:,1]

        if validation:
                train_loss = log_loss(train_y, pred_train_y)
                loss = log_loss(test_y, pred_test_y)
                print "Train, Test loss : ", train_loss, loss
                return pred_test_y, loss
        else:
                return pred_test_y
Project: bnp    Author: mpearmain    | Project source | File source
def extratreescv(n_estimators,
                 min_samples_split,
                 min_samples_leaf,
                 max_features,
                 max_depth,
                 min_weight_fraction_leaf
                 ):

    clf = ExtraTreesClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               min_samples_leaf=int(min_samples_leaf),
                               max_features= int(max_features),
                               max_depth = int(max_depth),
                               min_weight_fraction_leaf = min_weight_fraction_leaf,
                               n_jobs=-1,
                               random_state=1234,
                               verbose=1)

    clf.fit(x0, y0)
    ll = -log_loss(y1, clf.predict_proba(x1)[:,1])
    return ll
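
extratreescv returns a negative log loss so that a maximizer can tune the hyperparameters. A hypothetical driver using the bayes_opt package is sketched below; the snippet itself does not show the optimizer, and x0/y0/x1/y1 are globals defined elsewhere in the project:

# Hypothetical driver (not shown in the project): maximize the negative
# log loss returned by extratreescv with Bayesian optimization.
from bayes_opt import BayesianOptimization

bo = BayesianOptimization(extratreescv, {
    'n_estimators': (100, 1000),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10),
    'max_features': (1, 30),
    'max_depth': (5, 50),
    'min_weight_fraction_leaf': (0.0, 0.5),
})
bo.maximize(init_points=5, n_iter=25)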
Project: hyperband    Author: zygmuntz    | Project source | File source
def try_params( n_iterations, params ):

    n_estimators = int( round( n_iterations * trees_per_iteration ))
    print "n_estimators:", n_estimators
    pprint( params )

    clf = XT( n_estimators = n_estimators, verbose = 0, n_jobs = -1, **params )
    return train_and_eval_sklearn_classifier( clf, data )
Project: ISM2017    Author: ybayle    | Project source | File source
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Description of compare
    compare multiple classifier and display the best one
    """
    utils.print_success("Comparison of differents classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors":KNeighborsClassifier(3),
        # "GaussianProcess":GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree":DecisionTreeClassifier(max_depth=5),
        # "MLP":MLPClassifier(),
        # "AdaBoost":AdaBoostClassifier(),
        # "GaussianNB":GaussianNB(),
        # "QDA":QuadraticDiscriminantAnalysis(),
        # "SVM":SVC(kernel="linear", C=0.025),
        # "GradientBoosting":GradientBoostingClassifier(),
        # "ExtraTrees":ExtraTreesClassifier(),
        # "LogisticRegression":LogisticRegression(),
        # "LinearDiscriminantAnalysis":LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
Project: Python-Machine-Learning-Cookbook    Author: PacktPublishing    | Project source | File source
def __init__(self, X, label_words):
        self.le = preprocessing.LabelEncoder()  
        self.clf = ExtraTreesClassifier(n_estimators=100, 
                max_depth=16, random_state=0)

        y = self.encode_labels(label_words)
        self.clf.fit(np.asarray(X), y)
Project: AutoML-Challenge    Author: postech-mlg-exbrain    | Project source | File source
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETC(
                n_estimators=0, criterion=self.criterion,
                max_depth=self.max_depth, min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
                max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True
            )

        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y, sample_weight=sample_weight)
        self.estimator = tmp
        return self
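
iterative_fit relies on warm_start=True, which keeps the trees already built and appends new ones each time n_estimators grows. A minimal standalone sketch of that incremental pattern (synthetic data, my own illustration):

# Growing an extra-trees ensemble incrementally via warm_start.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)

clf = ExtraTreesClassifier(n_estimators=0, warm_start=True, random_state=0)
for step in range(4):
    clf.n_estimators += 25   # request 25 more trees
    clf.fit(X, y)            # existing trees are kept, new ones appended
    print(step, len(clf.estimators_))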
Project: gcForest    Author: kingfengji    | Project source | File source
def __init__(self, name, kwargs):
        from sklearn.ensemble import ExtraTreesClassifier
        super(GCExtraTreesClassifier, self).__init__(name, ExtraTreesClassifier, kwargs)
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard    | Project source | File source
def et_opt1(df_cell_train_feats, y_train, df_cell_test_feats):
    logging.info("train et_opt1 model")
    clf = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_features="log2", min_samples_split=5, min_samples_leaf=1)
    clf.fit(df_cell_train_feats, y_train)
    y_test_pred = clf.predict_proba(df_cell_test_feats)
    return y_test_pred
Project: FLASH    Author: yuyuz    | Project source | File source
def get_data_preprocessor_balancing(params, y):
    d_balancing = params['layer_dict_list'][1]

    if params['balancing'] == str(d_balancing['None']) or params['balancing'] == 'None':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        params['sample_weight'] = None
    elif params['balancing'] == str(d_balancing['weighting']) or params['balancing'] == 'weighting':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = 'auto'
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        if len(y.shape) > 1:
            offsets = [2 ** i for i in range(y.shape[1])]
            y_ = np.sum(y * offsets, axis=1)
        else:
            y_ = y
        unique, counts = np.unique(y_, return_counts=True)
        cw = 1. / counts
        cw = cw / np.mean(cw)
        sample_weight = np.ones(y_.shape)
        for i, ue in enumerate(unique):
            mask = y_ == ue
            sample_weight[mask] *= cw[i]
        params['sample_weight'] = sample_weight

    return params
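
In the 'weighting' branch, the inverse-frequency class weights are normalized so their mean is 1. A small worked example of the arithmetic:

# Worked example of the weighting arithmetic above.
import numpy as np

y_ = np.array([0, 0, 0, 1])
unique, counts = np.unique(y_, return_counts=True)   # counts = [3, 1]
cw = 1. / counts                                     # [0.333..., 1.0]
cw = cw / np.mean(cw)                                # [0.5, 1.5]
sample_weight = np.ones(y_.shape)
for i, ue in enumerate(unique):
    sample_weight[y_ == ue] *= cw[i]
print(sample_weight)                                 # [0.5 0.5 0.5 1.5]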
Project: intelligentCampus    Author: Jackal007    | Project source | File source
def __init__(self):
        SingleClassifier.SingleClassifier.__init__(self)
        # weak classifier
        self.clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
Project: AnswerClassify    Author: kenluck2001    | Project source | File source
def makEnsemble( X, xlist, Y ):
    #naive bayes
    clf = MultinomialNB()
    clf.fit( xlist, Y )
    featureSelectModel.append (clf)

    #K nearest neighbours
    clf = KNeighborsClassifier()
    clf.fit( xlist, Y )
    featureSelectModel.append (clf)

    #Logistic regression
    clf = LogisticRegression(C=1)
    clf.fit( xlist, Y )
    featureSelectModel.append (clf)

    #random forest
    clf  = RandomForestClassifier(n_estimators = 400)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)

    #extra forest
    clf = ExtraTreesClassifier(n_estimators = 400)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)

    #decision forest
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)

    #gradient boosting
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
                  'learning_rate': 0.01}
    clf = GradientBoostingClassifier(**params)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)
Project: kaggle_airbnb    Author: svegapons    | Project source | File source
def clf_extra_trees(data, random_state, calibrated=False, ext_name=""):
    """
    Application of extra trees classifier. For details look at
    'clf_sklearn' function.
    """
    et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1,
                              max_depth=17,
                              max_features=0.2,
                              min_samples_split=80,
                              random_state=random_state, verbose=10)

    return clf_sklearn(et, data, random_state, calibrated, clf_name='ET',
                       ext_name=ext_name)
Project: ml-traffic    Author: Zepheus    | Project source | File source
def lda(directories):
    images = load(directories, True, permute=False)

    f = HaarFeature()
    x = []

    for idx, im in enumerate(images):
        print("%d/%d" % (idx, len(images)))
        x.append(np.array(f.process(im)))

    y_train = [im.label for im in images]
    classes = list(set(y_train))
    class_to_index = {key: index for index, key in enumerate(classes)}
    labels = np.concatenate(np.array([[class_to_index[name] for name in y_train]]))

    clf = ExtraTreesClassifier()
    clf = clf.fit(x, labels)
    w, h = f.size, f.size
    i = 0

    filtered = []
    for size in f.haarSizes:
        for x in range(w - size):
            for y in range(h - size):
                for haar_type in range(len(f.haars)):
                    score = clf.feature_importances_[i]
                    if score > 0.000001:
                        filtered.append((size, x, y, haar_type, score))
                    i += 1

    sorted_filtered = sorted(filtered, key=lambda tup: tup[4], reverse=True)
    text_file = open("haarImportance.txt", "w")

    for k in sorted_filtered:
        # print("[size=%d][x=%d][y=%d][type=%d] \t=> %f" % k)
        text_file.write("[size=%d][x=%d][y=%d][type=%d] \t=> %f\n" % k)

    text_file.close()
Project: qml    Author: quantum13    | Project source | File source
def _load_model(self, model_id):
        _, conn = get_engine()

        #todo
        models = {
            'QXgb': QXgb,
            'QXgb2': QXgb2,
            'Ridge': Ridge,
            'RidgeClassifier': RidgeClassifier,
            'KNeighborsClassifier': KNeighborsClassifier,
            'QAvg': QAvg,
            'QRankedAvg': QRankedAvg,
            'QRankedByLineAvg': QRankedByLineAvg,
            'QStackModel': QStackModel,
            'LogisticRegression': LogisticRegression,
            'DecisionTreeClassifier': DecisionTreeClassifier,
            'QPostProcessingModel': QPostProcessingModel,
            'RandomForestClassifier': RandomForestClassifier,
            'ExtraTreesClassifier': ExtraTreesClassifier,
            'QAvgOneModelData': QAvgOneModelData,
            'QNN1': QNN1,
            'QNN2': QNN2,
        }

        res = conn.execute(
            """
                select cls, params, descr, predict_fn
                from qml_models 
                where 
                    model_id='{}'
            """.format(model_id)
        ).fetchone()

        if not res:
            raise Exception('Missing {} model'.format(model_id))

        model = models[res['cls']](**json.loads(res['params']))
        self.add(model_id, model, res['descr'], res['predict_fn'])
        return model
Project: kaggle_bnp-paribas    Author: ArdalanM    | Project source | File source
def models():
    params = {'n_jobs':nthread,'random_state':seed,'class_weight':None}

    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000,max_features='auto',criterion= 'entropy',min_samples_split= 2, max_depth= None, min_samples_leaf= 1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000,max_features=60,criterion= 'gini',min_samples_split= 4, max_depth= 40, min_samples_leaf= 2, **params)

    # rf = ensemble.RandomForestClassifier(n_estimators=1000,max_features= 'auto',criterion= 'gini',min_samples_split= 2, max_depth= None, min_samples_leaf= 1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000,max_features=60,criterion= 'entropy',min_samples_split= 4, max_depth= 40, min_samples_leaf= 2, **params)

    # xgb_binlog = XGBClassifier(objective="binary:logistic" ,max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5,nthread=nthread, seed=seed)

    rf_params = {'n_estimators':850,'max_features':60,'criterion':'entropy','min_samples_split': 4,'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}

    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic" ,max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1,XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),
        # (D1,XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000,nthread=nthread, seed=seed)),

    ]
    for clf in clfs:
        yield clf
Project: yttresearch-machine-learning-algorithms-analysis    Author: gdemos01    | Project source | File source
def exportPresentationData(classifier,action):
        dir = input('Give Data Directory: ')

        if int(classifier)==1:
                clf = GradientBoostingClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 2:
                clf = LogisticRegression()
                classify(dir,clf,action)
        elif int(classifier) == 3:
                clf = KNeighborsClassifier(n_neighbors=5)
                classify(dir,clf,action)
        elif int(classifier) == 4:
                clf = DecisionTreeClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 5:
                clf = svm.LinearSVC()
                classify_type2(dir,clf,action)
        elif int(classifier) == 6:
                clf = RandomForestClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 7:
                clf = ExtraTreesClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 8:
                clf = IsolationForest()
                classify_type2(dir,clf,action)
        elif int(classifier) == 9:
                clf = AdaBoostClassifier(n_estimators=100)
                classify(dir,clf,action)
        elif int(classifier) == 10:
                clf = BaggingClassifier(DecisionTreeClassifier())
                classify(dir,clf,action)
        elif int(classifier) == 11:
                clf1 = GradientBoostingClassifier()
                clf2 = AdaBoostClassifier()
                clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
                classify(dir,clf,action)
Project: yttresearch-machine-learning-algorithms-analysis    Author: gdemos01    | Project source | File source
def exportPresentationData(classifier,action,dir):

        if int(classifier)==1:
                clf = GradientBoostingClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 2:
                clf = LogisticRegression()
                classify(dir,clf,action)
        elif int(classifier) == 3:
                clf = KNeighborsClassifier(n_neighbors=5)
                classify(dir,clf,action)
        elif int(classifier) == 4:
                clf = DecisionTreeClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 5:
                clf = svm.LinearSVC()
                classify_type2(dir,clf,action)
        elif int(classifier) == 6:
                clf = RandomForestClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 7:
                clf = ExtraTreesClassifier()
                classify(dir,clf,action)
        elif int(classifier) == 8:
                clf = IsolationForest()
                classify_type2(dir,clf,action)
        elif int(classifier) == 9:
                clf = AdaBoostClassifier(n_estimators=100)
                classify(dir,clf,action)
        elif int(classifier) == 10:
                clf = BaggingClassifier(DecisionTreeClassifier())
                classify(dir,clf,action)
        elif int(classifier) == 11:
                clf1 = GradientBoostingClassifier()
                clf2 = AdaBoostClassifier()
                clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
                classify(dir,clf,action)
Project: DataMiningCompetitionFirstPrize    Author: lzddzh    | Project source | File source
def learn(x, y, test_x):
    cw = {"0":variables.weight_0_rf, "1000":variables.weight_1000_rf, "1500":variables.weight_1500_rf, "2000":variables.weight_2000_rf}
    clf = ExtraTreesClassifier(n_jobs = -1,
                                     n_estimators=variables.n_estimators_et,
                                     max_depth=variables.max_depth_et, random_state=0,
                                     min_samples_split=variables.min_samples_split_et,
                                     min_samples_leaf=variables.min_samples_leaf_et,
                                     max_features=variables.max_feature_et,
                                     max_leaf_nodes=variables.max_leaf_nodes_et,
                                     criterion=variables.criterion_et,
                                     min_impurity_split=variables.min_impurity_split_et,
                                     class_weight=variables.cw_et).fit(x, y)

    print "n_estimators=", variables.n_estimators_et,
    print "max_depth=", variables.max_depth_et,
    print "min_samples_split=", variables.min_samples_split_et,
    print "min_samples_leaf=", variables.min_samples_leaf_et,
    print "max_features=",variables.max_feature_et,
    print "max_leaf_nodes=",variables.max_leaf_nodes_et,
    print "criterion=",variables.criterion_et,
    print "min_impurity_split=",variables.min_impurity_split_et,
    print "class_weight=", variables.cw_et

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)
    return prediction_list,prediction_list_prob
Project: fake-news-detection    Author: aldengolab    | Project source | File source
def define_clfs_params(self):
        '''
        Defines all relevant parameters and classes for classfier objects.
        Edit these if you wish to change parameters.
        '''
        # These are the classifiers
        self.clfs = {
            'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
            'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = 1), algorithm = "SAMME", n_estimators = 200),  # max_depth must be a scalar; the depth grid belongs in self.params
            'LR': LogisticRegression(penalty = 'l1', C = 1e5),
            'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
            'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
            'NB': GaussianNB(),
            'DT': DecisionTreeClassifier(),
            'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
            'KNN': KNeighborsClassifier(n_neighbors = 3)
            }
        # These are the parameters which will be run through
        self.params = {
             'RF':{'n_estimators': [1,10,100,1000], 'max_depth': [10, 15,20,30,40,50,60,70,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'LR': {'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'random_state': [1]},
             'SGD': {'loss': ['log'], 'penalty': ['l2','l1','elasticnet'], 'random_state': [1]},
             'ET': {'n_estimators': [1,10,100,1000], 'criterion' : ['gini', 'entropy'], 'max_depth': [1,3,5,10,15], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000], 'random_state': [1]},
             'GB': {'n_estimators': [1,10,100,1000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100], 'random_state': [1]},
             'NB': {},
             'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,2,15,20,30,40,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear'], 'random_state': [1]},
             'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
             }
Project: repo-classifier    Author: linkvt    | Project source | File source
def __init__(self):
        params = dict(clf__n_estimators=[80, 120, 150, 170], clf__min_samples_split=[2], clf__max_depth=[None, 40])
        class_weights = {'DEV': 1, 'WEB': 2, 'DATA': 4, 'DOCS': 4, 'EDU': 20, 'HW': 15, 'OTHER': 25}
        super().__init__(ensemble.ExtraTreesClassifier(class_weight=class_weights), params, 'ExtraTreesClassifier')
Project: gcforest    Author: w821881341    | Project source | File source
def __init__(self, name, kwargs):
        from sklearn.ensemble import ExtraTreesClassifier
        super(GCExtraTreesClassifier, self).__init__(name, ExtraTreesClassifier, kwargs)
Project: ZZZZ    Author: Phonicavi    | Project source | File source
def predictNext(self, stock, pred_date_count, train_batch_size=100, use_NN=True):
        trainX, trainY, trainD = self.getRawByCount(pred_date_count-train_batch_size, pred_date_count)
        testX, testY, testD = self.getSingleRaw(pred_date_count)
        testX = testX.reshape(1, -1)
        # print trainX[0]
        # print list(trainX)
        sc = StandardScaler()
        sc.fit(trainX)
        trainX = sc.transform(trainX)
        testX = sc.transform(testX)

        # trainX = np.delete(trainX,0,axis=1)
        # testX = np.delete(testX,0,axis=1)


        fs_method = 'RFC'
        pred_pro=[1,0]

        trainX,testX = featureSelection (trainX, trainY, testX, [], method=fs_method, testmode=False, n_features_to_select=None)

        if use_NN:
            from Power import NNet
            predY = NNet(TrainX=trainX, TrainY=trainY, TestX=testX)
            # print predY
            # pred_pro=[1,0]
        else:
            clf = ExtraTreesClassifier(criterion='gini', n_estimators=150, max_features='auto', n_jobs=4, class_weight='balanced')
            # clf =  DecisionTreeClassifier(class_weight='balanced')
            clf.fit(trainX, trainY)
            predY = clf.predict(testX)
            # pred_pro = (clf.predict_proba(testX) if hasattr(clf, "predict_proba") else clf.decision_function(testX))

        # note: clf is only defined on the non-NN branch, so clf.score fails when use_NN=True
        return predY[0], pred_pro[0], testY, testD, 1-clf.score(trainX, trainY)
Project: kdd99-scikit    Author: PENGZhaoqing    | Project source | File source
def tree_based_selection(self, data_set, data_target, feature_names):
        """

        :param data_set:
        :return:
        """

        clf = ExtraTreesClassifier()
        clf = clf.fit(data_set, data_target)
        print clf.feature_importances_

        model = SelectFromModel(clf, prefit=True)
        feature_set = model.transform(data_set)

        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[feature_names[i]] = data_set[0][i]
        print np.array(check)

        return feature_set, fea_index
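
The value-matching loop above recovers the selected column indices by comparing feature columns; SelectFromModel can report them directly via get_support(indices=True). A self-contained sketch (synthetic data, my own illustration):

# get_support(indices=True) yields the selected column indices directly.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

data_set = np.random.rand(100, 8)
data_target = np.random.randint(0, 2, 100)

clf = ExtraTreesClassifier(n_estimators=50).fit(data_set, data_target)
model = SelectFromModel(clf, prefit=True)
fea_index = model.get_support(indices=True)   # selected column indices
feature_set = data_set[:, fea_index]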
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 7
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=6,
                                               random_state=seed,
                                               n_estimators=500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
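
The et1 variants that follow all repeat this pattern: shuffle-split bagging over several seeds, accumulating out-of-fold predictions in v and averaged test predictions in z (pconvert is a transform defined elsewhere in the project). A condensed sketch of the core idea with the project-specific bookkeeping stripped out (my own simplification; note that with ShuffleSplit a row may be validated several times or never):

# Condensed sketch of the seed-averaged shuffle-split pattern in et1.
import numpy as np
from sklearn import ensemble, metrics, model_selection

def oof_extra_trees(train2, y, test2, n_splits=5, seeds=(13, 14, 15)):
    test_pred = np.zeros(len(test2))
    scores = []
    for seed in seeds:
        ss = model_selection.ShuffleSplit(n_splits=n_splits, random_state=seed)
        for itrain, ival in ss.split(train2, y):
            clf = ensemble.ExtraTreesClassifier(max_depth=6, n_estimators=500,
                                                random_state=seed, n_jobs=-2)
            clf.fit(train2[itrain], y[itrain])
            p = clf.predict_proba(train2[ival])[:, 1]
            scores.append(metrics.log_loss(y[ival], p))
            test_pred += clf.predict_proba(test2)[:, 1]
    return test_pred / (n_splits * len(seeds)), np.mean(scores)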
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 1
    num_splits = 3
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=7,
                                               random_state=seed,
                                               n_estimators=1500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                               random_state=seed,
                                               n_estimators=1500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 2
    num_splits = 7
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                               random_state=seed,
                                               n_estimators=2000,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                               random_state=seed,
                                               n_estimators=2500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += reg.predict_proba(test2)[:,1]

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                               random_state=seed,
                                               n_estimators=2500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                               random_state=seed,
                                               n_estimators=2500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                               random_state=seed,
                                               n_estimators=2500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += np.log1p(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Project: mlbootcamp_5    Author: ivan-filonov    | Project source | File source
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 17
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=7,
                                               random_state=seed,
                                               n_estimators=1500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds