The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.ensemble.ExtraTreesClassifier().
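Before the project snippets, here is a minimal, self-contained sketch of the typical fit/predict workflow (not taken from any of the projects below; the toy dataset and parameter values are illustrative assumptions):

    # Minimal illustrative usage of ExtraTreesClassifier on synthetic data (assumed parameters).
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=500, n_features=20, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    print("test accuracy:", clf.score(X_test, y_test))
    print("feature importances:", clf.feature_importances_[:5])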
def get_feature_importance(self, clf, model_name):
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}

    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return list(clf.coef_.tolist())
    else:
        return None
def __init__(self, data_block, predictors=[], cv_folds=10,
             scoring_metric='accuracy', additional_display_metrics=[]):
    base_classification.__init__(
        self, alg=ExtraTreesClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics)

    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"
    self.model_output['OOB_Score'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)
def define_model(self, model, parameters, n_cores=0):
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}

    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))

    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
def fit(self, X, Y, sample_weight=None):
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    num_features = X.shape[1]
    max_features = int(
        float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))

    preprocessor = ExtraTreesClassifier(
        n_estimators=self.n_estimators, criterion=self.criterion,
        max_depth=self.max_depth, min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
        max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
        random_state=self.random_state, class_weight=self.class_weight)
    preprocessor.fit(X, Y, sample_weight=sample_weight)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
def prec_ets(n_trees, X_train, y_train, X_test, y_test, random_state=None):
    """
    ExtraTrees
    """
    from sklearn.ensemble import ExtraTreesClassifier
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = ExtraTreesClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_ets{}={:.6f}%'.format(n_trees, prec * 100.0))
    return clf, y_pred
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print name[i]
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print name[i]
def analyseReasonWithTreeBaesd(anamolySample, normalSample):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    name = []
    for i in data.columns:
        name.append(i)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print name[i]
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print warnstr
    return warnstr
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None,
          split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
    clf = ensemble.ExtraTreesClassifier(
        n_estimators=n_est_val,
        max_depth=depth_val,
        min_samples_split=split_val,
        min_samples_leaf=leaf_val,
        max_features=feat_val,
        criterion='entropy',
        n_jobs=jobs_val,
        random_state=random_state_val)
    clf.fit(train_X, train_y)
    pred_train_y = clf.predict_proba(train_X)[:, 1]
    pred_test_y = clf.predict_proba(test_X)[:, 1]
    if validation:
        train_loss = log_loss(train_y, pred_train_y)
        loss = log_loss(test_y, pred_test_y)
        print "Train, Test loss : ", train_loss, loss
        return pred_test_y, loss
    else:
        return pred_test_y
def extratreescv(n_estimators,
                 min_samples_split,
                 min_samples_leaf,
                 max_features,
                 max_depth,
                 min_weight_fraction_leaf):
    clf = ExtraTreesClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               min_samples_leaf=int(min_samples_leaf),
                               max_features=int(max_features),
                               max_depth=int(max_depth),
                               min_weight_fraction_leaf=min_weight_fraction_leaf,
                               n_jobs=-1,
                               random_state=1234,
                               verbose=1)
    clf.fit(x0, y0)
    ll = -log_loss(y1, clf.predict_proba(x1)[:, 1])
    return ll
def try_params(n_iterations, params):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    clf = XT(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    return train_and_eval_sklearn_classifier(clf, data)
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Description of compare
    compare multiple classifier and display the best one
    """
    utils.print_success("Comparison of differents classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors": KNeighborsClassifier(3),
        # "GaussianProcess": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree": DecisionTreeClassifier(max_depth=5),
        # "MLP": MLPClassifier(),
        # "AdaBoost": AdaBoostClassifier(),
        # "GaussianNB": GaussianNB(),
        # "QDA": QuadraticDiscriminantAnalysis(),
        # "SVM": SVC(kernel="linear", C=0.025),
        # "GradientBoosting": GradientBoostingClassifier(),
        # "ExtraTrees": ExtraTreesClassifier(),
        # "LogisticRegression": LogisticRegression(),
        # "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
def __init__(self, X, label_words):
    self.le = preprocessing.LabelEncoder()
    self.clf = ExtraTreesClassifier(n_estimators=100, max_depth=16, random_state=0)

    y = self.encode_labels(label_words)
    self.clf.fit(np.asarray(X), y)
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
    from sklearn.ensemble import ExtraTreesClassifier as ETC

    if refit:
        self.estimator = None

    if self.estimator is None:
        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        self.estimator = ETC(
            n_estimators=0, criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight,
            warm_start=True
        )

    tmp = self.estimator  # TODO copy ?
    tmp.n_estimators += n_iter
    tmp.fit(X, y, sample_weight=sample_weight)
    self.estimator = tmp
    return self
def __init__(self, name, kwargs):
    from sklearn.ensemble import ExtraTreesClassifier
    super(GCExtraTreesClassifier, self).__init__(name, ExtraTreesClassifier, kwargs)
def et_opt1(df_cell_train_feats, y_train, df_cell_test_feats):
    logging.info("train et_opt1 model")
    clf = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_features="log2",
                               min_samples_split=5, min_samples_leaf=1)
    clf.fit(df_cell_train_feats, y_train)
    y_test_pred = clf.predict_proba(df_cell_test_feats)
    return y_test_pred
def get_data_preprocessor_balancing(params, y):
    d_balancing = params['layer_dict_list'][1]

    if params['balancing'] == str(d_balancing['None']) or params['balancing'] == 'None':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier',
        # 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        params['sample_weight'] = None
    elif params['balancing'] == str(d_balancing['weighting']) or params['balancing'] == 'weighting':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier',
        # 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = 'auto'
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        if len(y.shape) > 1:
            offsets = [2 ** i for i in range(y.shape[1])]
            y_ = np.sum(y * offsets, axis=1)
        else:
            y_ = y
        unique, counts = np.unique(y_, return_counts=True)
        cw = 1. / counts
        cw = cw / np.mean(cw)
        sample_weight = np.ones(y_.shape)
        for i, ue in enumerate(unique):
            mask = y_ == ue
            sample_weight[mask] *= cw[i]
        params['sample_weight'] = sample_weight

    return params
def __init__(self):
    SingleClassifier.SingleClassifier.__init__(self)
    # weak classifier
    self.clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,
                                    min_samples_split=2, random_state=0)
def makEnsemble(X, xlist, Y):
    # naive bayes
    clf = MultinomialNB()
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)

    # K nearest neighbours
    clf = KNeighborsClassifier()
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)

    # Logistic regression
    clf = LogisticRegression(C=1)
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)

    # random forest
    clf = RandomForestClassifier(n_estimators=400)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)

    # extra forest
    clf = ExtraTreesClassifier(n_estimators=400)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)

    # decision forest
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)

    # gradient boosting
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1, 'learning_rate': 0.01}
    clf = GradientBoostingClassifier(**params)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)
def clf_extra_trees(data, random_state, calibrated=False, ext_name=""):
    """
    Application of extra trees classifier.
    For details look at 'clf_sklearn' function.
    """
    et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_depth=17, max_features=0.2,
                              min_samples_split=80, random_state=random_state, verbose=10)
    return clf_sklearn(et, data, random_state, calibrated, clf_name='ET', ext_name=ext_name)
def lda(directories):
    images = load(directories, True, permute=False)
    f = HaarFeature()
    x = []
    for idx, im in enumerate(images):
        print("%d/%d" % (idx, len(images)))
        x.append(np.array(f.process(im)))
    y_train = [im.label for im in images]
    classes = list(set(y_train))
    class_to_index = {key: index for index, key in enumerate(classes)}
    labels = np.concatenate(np.array([[class_to_index[name] for name in y_train]]))

    clf = ExtraTreesClassifier()
    clf = clf.fit(x, labels)

    w, h = f.size, f.size
    i = 0
    filtered = []
    for size in f.haarSizes:
        for x in range(w - size):
            for y in range(h - size):
                for haar_type in range(len(f.haars)):
                    score = clf.feature_importances_[i]
                    if score > 0.000001:
                        filtered.append((size, x, y, haar_type, score))
                    i += 1
    sorted_filtered = sorted(filtered, key=lambda tup: tup[4], reverse=True)

    text_file = open("haarImportance.txt", "w")
    for k in sorted_filtered:
        # print("[size=%d][x=%d][y=%d][type=%d] \t=> %f" % k)
        text_file.write("[size=%d][x=%d][y=%d][type=%d] \t=> %f\n" % k)
    text_file.close()
def _load_model(self, model_id):
    _, conn = get_engine()

    # todo
    models = {
        'QXgb': QXgb,
        'QXgb2': QXgb2,
        'Ridge': Ridge,
        'RidgeClassifier': RidgeClassifier,
        'KNeighborsClassifier': KNeighborsClassifier,
        'QAvg': QAvg,
        'QRankedAvg': QRankedAvg,
        'QRankedByLineAvg': QRankedByLineAvg,
        'QStackModel': QStackModel,
        'LogisticRegression': LogisticRegression,
        'DecisionTreeClassifier': DecisionTreeClassifier,
        'QPostProcessingModel': QPostProcessingModel,
        'RandomForestClassifier': RandomForestClassifier,
        'ExtraTreesClassifier': ExtraTreesClassifier,
        'QAvgOneModelData': QAvgOneModelData,
        'QNN1': QNN1,
        'QNN2': QNN2,
    }

    res = conn.execute(
        """
        select cls, params, descr, predict_fn
        from qml_models
        where model_id='{}'
        """.format(model_id)
    ).fetchone()

    if not res:
        raise Exception('Missing {} model'.format(model_id))

    model = models[res['cls']](**json.loads(res['params']))
    self.add(model_id, model, res['descr'], res['predict_fn'])
    return model
def models():
    params = {'n_jobs': nthread, 'random_state': seed, 'class_weight': None}

    # extra = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features='auto', criterion='entropy', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # extra1 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=60, criterion='gini', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)
    # rf = ensemble.RandomForestClassifier(n_estimators=1000, max_features='auto', criterion='gini', min_samples_split=2, max_depth=None, min_samples_leaf=1, **params)
    # rf1 = ensemble.RandomForestClassifier(n_estimators=1000, max_features=60, criterion='entropy', min_samples_split=4, max_depth=40, min_samples_leaf=2, **params)
    # xgb_binlog = XGBClassifier(objective="binary:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglog = XGBClassifier(objective="reg:logistic", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_poi = XGBClassifier(objective="count:poisson", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)
    # xgb_reglin = XGBClassifier(objective="reg:linear", max_depth=10, learning_rate=0.01, n_estimators=5, nthread=nthread, seed=seed)

    rf_params = {'n_estimators': 850, 'max_features': 60, 'criterion': 'entropy',
                 'min_samples_split': 4, 'max_depth': 40, 'min_samples_leaf': 2, 'n_jobs': -1}

    clfs = [
        # (D1, XGBRegressor(objective="reg:linear", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        (D1, XGBClassifier(objective="binary:logistic", max_depth=6, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=5, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBRegressor(objective="reg:linear", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
        # (D1, XGBClassifier(objective="binary:logistic", max_depth=4, learning_rate=0.01, subsample=.8, n_estimators=2000, nthread=nthread, seed=seed)),
    ]
    for clf in clfs:
        yield clf
def exportPresentationData(classifier, action):
    dir = input('Give Data Directory: ')

    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def exportPresentationData(classifier, action, dir):
    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def learn(x, y, test_x):
    cw = {"0": variables.weight_0_rf,
          "1000": variables.weight_1000_rf,
          "1500": variables.weight_1500_rf,
          "2000": variables.weight_2000_rf}
    clf = ExtraTreesClassifier(n_jobs=-1,
                               n_estimators=variables.n_estimators_et,
                               max_depth=variables.max_depth_et,
                               random_state=0,
                               min_samples_split=variables.min_samples_split_et,
                               min_samples_leaf=variables.min_samples_leaf_et,
                               max_features=variables.max_feature_et,
                               max_leaf_nodes=variables.max_leaf_nodes_et,
                               criterion=variables.criterion_et,
                               min_impurity_split=variables.min_impurity_split_et,
                               class_weight=variables.cw_et).fit(x, y)

    print "n_estimators=", variables.n_estimators_et,
    print "max_depth=", variables.max_depth_et,
    print "min_samples_split=", variables.min_samples_split_et,
    print "min_samples_leaf=", variables.min_samples_leaf_et,
    print "max_features=", variables.max_feature_et,
    print "max_leaf_nodes=", variables.max_leaf_nodes_et,
    print "criterion=", variables.criterion_et,
    print "min_impurity_split=", variables.min_impurity_split_et,
    print "class_weight=", variables.cw_et

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)

    return prediction_list, prediction_list_prob
def define_clfs_params(self):
    '''
    Defines all relevant parameters and classes for classfier objects.
    Edit these if you wish to change parameters.
    '''
    # These are the classifiers
    self.clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=[1, 5, 10, 15]), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss='log', penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=3)
        }
    # These are the parameters which will be run through
    self.params = {
        'RF': {'n_estimators': [1,10,100,1000], 'max_depth': [10,15,20,30,40,50,60,70,100], 'max_features': ['sqrt','log2'], 'min_samples_split': [2,5,10], 'random_state': [1]},
        'LR': {'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'random_state': [1]},
        'SGD': {'loss': ['log'], 'penalty': ['l2','l1','elasticnet'], 'random_state': [1]},
        'ET': {'n_estimators': [1,10,100,1000], 'criterion': ['gini','entropy'], 'max_depth': [1,3,5,10,15], 'max_features': ['sqrt','log2'], 'min_samples_split': [2,5,10], 'random_state': [1]},
        'AB': {'algorithm': ['SAMME','SAMME.R'], 'n_estimators': [1,10,100,1000], 'random_state': [1]},
        'GB': {'n_estimators': [1,10,100,1000], 'learning_rate': [0.001,0.01,0.05,0.1,0.5], 'subsample': [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100], 'random_state': [1]},
        'NB': {},
        'DT': {'criterion': ['gini','entropy'], 'max_depth': [1,2,15,20,30,40,50], 'max_features': ['sqrt','log2'], 'min_samples_split': [2,5,10], 'random_state': [1]},
        'SVM': {'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'kernel': ['linear'], 'random_state': [1]},
        'KNN': {'n_neighbors': [1,5,10,25,50,100], 'weights': ['uniform','distance'], 'algorithm': ['auto','ball_tree','kd_tree']}
        }
def __init__(self):
    params = dict(clf__n_estimators=[80, 120, 150, 170],
                  clf__min_samples_split=[2],
                  clf__max_depth=[None, 40])
    class_weights = {'DEV': 1, 'WEB': 2, 'DATA': 4, 'DOCS': 4, 'EDU': 20, 'HW': 15, 'OTHER': 25}
    super().__init__(ensemble.ExtraTreesClassifier(class_weight=class_weights),
                     params, 'ExtraTreesClassifier')
def predictNext(self, stock, pred_date_count, train_batch_size=100, use_NN=True):
    trainX, trainY, trainD = self.getRawByCount(pred_date_count - train_batch_size, pred_date_count)
    testX, testY, testD = self.getSingleRaw(pred_date_count)
    testX = testX.reshape(1, -1)
    # print trainX[0]
    # print list(trainX)
    sc = StandardScaler()
    sc.fit(trainX)
    trainX = sc.transform(trainX)
    testX = sc.transform(testX)

    # trainX = np.delete(trainX,0,axis=1)
    # testX = np.delete(testX,0,axis=1)

    fs_method = 'RFC'
    pred_pro = [1, 0]
    trainX, testX = featureSelection(trainX, trainY, testX, [], method=fs_method,
                                     testmode=False, n_features_to_select=None)

    if use_NN:
        from Power import NNet
        predY = NNet(TrainX=trainX, TrainY=trainY, TestX=testX)
        # print predY
        # pred_pro=[1,0]
    else:
        clf = ExtraTreesClassifier(criterion='gini', n_estimators=150, max_features='auto',
                                   n_jobs=4, class_weight='balanced')
        # clf = DecisionTreeClassifier(class_weight='balanced')
        clf.fit(trainX, trainY)
        predY = clf.predict(testX)
        # pred_pro = (clf.predict_proba(testX) if hasattr(clf, "predict_proba") else clf.decision_function(testX))

    return predY[0], pred_pro[0], testY, testD, 1 - clf.score(trainX, trainY)
def tree_based_selection(self, data_set, data_target, feature_names):
    """
    :param data_set:
    :return:
    """
    clf = ExtraTreesClassifier()
    clf = clf.fit(data_set, data_target)
    print clf.feature_importances_

    model = SelectFromModel(clf, prefit=True)
    feature_set = model.transform(data_set)

    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)

    check = {}
    for i in fea_index:
        check[feature_names[i]] = data_set[0][i]
    print np.array(check)

    return feature_set, fea_index
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 7
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=6,
                                                random_state=seed,
                                                n_estimators=500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 1
    num_splits = 3
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=7,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 7
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 2
    num_splits = 7
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=11,
                                                random_state=seed,
                                                n_estimators=2000,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += reg.predict_proba(test2)[:, 1]
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += p
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += np.log1p(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 7
    num_splits = 17
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=7,
                                                random_state=seed,
                                                n_estimators=1500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds