我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.ensemble.RandomForestClassifier()。
def trained_models():
    """Fit a set of reference classifiers on the breast-cancer dataset.

    The data is split 70/30 with ``random_state=12345`` and every estimator
    is fitted on the training part only.

    Returns
    -------
    dict
        Short label -> fitted estimator.
    """
    cancer = datasets.load_breast_cancer()
    X_train, _X_test, y_train, _y_test = train_test_split(
        cancer.data, cancer.target, test_size=0.3, random_state=12345)

    # Estimators are fitted in exactly this order.
    to_fit = (
        ('RF', RandomForestClassifier()),
        ('LR', LogisticRegression()),
        ('SVC_w_linear_kernel', SVC(kernel='linear')),
        ('SVC_wo_linear_kernel', SVC()),
        ('Dummy', DummyClassifier()),
    )
    fitted = {}
    for label, estimator in to_fit:
        estimator.fit(X_train, y_train)
        fitted[label] = estimator

    # The returned mapping lists 'Dummy' before 'SVC_wo_linear_kernel'.
    key_order = ('RF', 'LR', 'SVC_w_linear_kernel', 'Dummy',
                 'SVC_wo_linear_kernel')
    return {label: fitted[label] for label in key_order}
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Look up a feature-selection object by estimator kind and selector name.

    Parameters
    ----------
    type_of_estimator : str
        Either ``'classifier'`` or ``'regressor'``.
    model_name : str
        One of ``'SelectFromModel'``, ``'RFECV'``,
        ``'GenericUnivariateSelect'`` or ``'KeepAll'``.

    Returns
    -------
    The freshly built selector, or the string ``'KeepAll'`` for that entry.

    Raises
    ------
    KeyError
        If either key is unknown.
    """
    def build_selectors(forest_cls, threshold):
        # Same recipe for both kinds; only the forest class and the
        # SelectFromModel threshold differ.
        return {
            'SelectFromModel': SelectFromModel(
                forest_cls(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold=threshold),
            'RFECV': RFECV(estimator=forest_cls(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll',
        }

    selectors_by_kind = {
        'classifier': build_selectors(RandomForestClassifier, '20*mean'),
        'regressor': build_selectors(RandomForestRegressor, '0.7*mean'),
    }
    return selectors_by_kind[type_of_estimator][model_name]
def get_feature_importance(self, clf, model_name):
    """Return the per-feature weights of a fitted model as a list.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``feature_importances_`` or ``coef_`` depending on
        ``model_name``.
    model_name : str
        Key identifying the model family.

    Returns
    -------
    list or None
        ``list(clf.feature_importances_)``, ``clf.coef_`` converted to nested
        lists, or ``None`` for families with no such attribute.

    Raises
    ------
    KeyError
        If ``model_name`` is not a known family.
    """
    attribute_by_model = {
        'RandomForestClassifier': 'feature_importances',
        'ExtraTreesClassifier': 'feature_importances',
        'AdaBoostClassifier': 'feature_importances',
        'LogisticRegression': 'coef',
        'svm.SVC': 'coef',
        'GradientBoostingClassifier': 'feature_importances',
        'GaussianNB': None,
        'DecisionTreeClassifier': 'feature_importances',
        'SGDClassifier': 'coef',
        'KNeighborsClassifier': None,
        'linear.SVC': 'coef',
    }
    source = attribute_by_model[model_name]
    if source is None:
        return None
    if source == 'feature_importances':
        return list(clf.feature_importances_)
    return list(clf.coef_.tolist())
def test_improvement(self):
    """HyperoptOptimizer over ``max_depth`` must beat the depth-5 baseline.

    Both scores are computed on the training data itself; every Hyperopt
    trial is also required to report status ``'ok'``.
    """
    np.random.seed(4)
    features, labels = make_classification(
        n_samples=100, n_features=45, n_informative=15, n_redundant=5,
        class_sep=1, n_clusters_per_class=4, flip_y=0.4)

    baseline = RandomForestClassifier(max_depth=5)
    baseline.fit(features, labels)
    baseline_score = clf_score(labels, baseline.predict(features))

    depth = Parameter('max_depth', 'integer', lower=1, upper=10)
    optimizer = HyperoptOptimizer(baseline, [depth], clf_score)
    _best_params, tuned = optimizer.fit(
        X_train=features, y_train=labels, n_iters=10)

    tuned.fit(features, labels)
    tuned_score = clf_score(labels, tuned.predict(features))
    self.assertTrue(tuned_score > baseline_score)

    for trial_status in optimizer.trials.statuses():
        self.assertEqual(trial_status, 'ok')
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Look up a feature-selection object by estimator kind and selector name.

    Parameters
    ----------
    type_of_estimator : str
        Either ``'classifier'`` or ``'regressor'``.
    model_name : str
        One of ``'SelectFromModel'``, ``'RFECV'``,
        ``'GenericUnivariateSelect'``, ``'RandomizedSparse'`` or
        ``'KeepAll'``.

    Returns
    -------
    The freshly built selector, or the string ``'KeepAll'`` for that entry.

    Raises
    ------
    KeyError
        If either key is unknown.
    """
    def build_selectors(forest_cls, threshold, randomized_sparse):
        # Same recipe for both kinds; only the forest class, the
        # SelectFromModel threshold and the randomized sparse model differ.
        return {
            'SelectFromModel': SelectFromModel(
                forest_cls(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold=threshold),
            'RFECV': RFECV(estimator=forest_cls(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': randomized_sparse,
            'KeepAll': 'KeepAll',
        }

    selectors_by_kind = {
        'classifier': build_selectors(
            RandomForestClassifier, '20*mean', RandomizedLogisticRegression()),
        'regressor': build_selectors(
            RandomForestRegressor, '0.7*mean', RandomizedLasso()),
    }
    return selectors_by_kind[type_of_estimator][model_name]
def get_classifier_class(class_name):
    """Map a short classifier name to its estimator class.

    Parameters
    ----------
    class_name : str
        One of the keys of the table below (e.g. ``'random_forest'``).

    Returns
    -------
    type
        The estimator class itself, not an instance.

    Raises
    ------
    ValueError
        If ``class_name`` is not in the table.
    """
    classes_by_name = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis,
    }
    found = classes_by_name.get(class_name)
    if found is None:
        raise ValueError('No such classifier')
    return found
def __create_classifiers(self):
    """Build the list of candidate classifiers.

    Returns
    -------
    list of dict
        Each entry has a ``"func"`` key holding an unfitted estimator and a
        ``"name"`` key holding its short label.
    """
    knn = neighbors.KNeighborsClassifier
    # NOTE: decision_tree, mlp and random_forest candidates were disabled
    # (commented out) in the original and are deliberately not listed here.
    return [
        {"func": linear_model.SGDClassifier(loss="log"), "name": "sgd"},
        {"func": knn(1, weights='distance'), "name": "knn1"},
        {"func": knn(3, weights='distance'), "name": "knn3"},
        {"func": knn(5, weights='distance'), "name": "knn5"},
        {"func": GaussianNB(), "name": "naive_bayes"},
    ]
def define_model(self, model, parameters, n_cores = 0):
    """Instantiate the named classifier and apply extra parameters to it.

    Parameters
    ----------
    model : str
        Key of the classifier to build.
    parameters : dict
        Keyword parameters forwarded to ``set_params`` on the chosen
        classifier.
    n_cores : int, optional
        NOTE(review): not read anywhere in this function; the ``n_jobs``
        values below are fixed -- confirm whether this is intended.

    Returns
    -------
    The configured, unfitted classifier.

    Raises
    ------
    ConfigError
        If ``model`` is not a supported key.
    """
    candidates = {
        'RandomForestClassifier': RandomForestClassifier(
            n_estimators=50, n_jobs=7),
        'ExtraTreesClassifier': ExtraTreesClassifier(
            n_estimators=10, n_jobs=7, criterion='entropy'),
        'AdaBoostClassifier': AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1), algorithm="SAMME",
            n_estimators=200),
        'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
        'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GradientBoostingClassifier': GradientBoostingClassifier(
            learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'GaussianNB': GaussianNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
        'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
        'linear.SVC': svm.LinearSVC(),
    }
    if model in candidates:
        chosen = candidates[model]
        chosen.set_params(**parameters)
        return chosen
    raise ConfigError("Unsupported model {}".format(model))
def do_ml(ticker):
    """Train a voting ensemble for ``ticker`` and report held-out accuracy.

    Features come from ``extract_featuresets``; 25% of the rows are held out
    for scoring. Accuracy and the predicted class counts are printed.

    Returns
    -------
    The score of the ensemble on the held-out split.
    """
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25)

    # A single neighbors.KNeighborsClassifier() was an earlier alternative.
    voters = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    ensemble_clf = VotingClassifier(voters)
    ensemble_clf.fit(X_train, y_train)

    confidence = ensemble_clf.score(X_test, y_test)
    print('accuracy:',confidence)
    class_counts = Counter(ensemble_clf.predict(X_test))
    print('predicted class counts:',class_counts)
    print()
    print()
    return confidence

# examples of running:
def run_forests():
    """Try five randomly parameterised forests and print the best setting.

    Each trial draws ``max_features`` and ``max_depth`` at random, fits on the
    module-level training data and prints train/test ``zero_one_score``.
    The setting with the smallest test value is reported as best.
    """
    print('random forest: \n')
    trials = []  # (setting, test score) per trial, in trial order
    for _ in range(5):
        max_features = np.random.randint(400,800)
        max_depth = np.random.choice([None, None, None, None, 30, 40, 60])
        forest_fit = RandomForestClassifier(
            n_estimators=50, max_features=max_features,
            max_depth=max_depth).fit(X_train, Y_train)
        pred = forest_fit.predict(X_test)

        print('\n params:', {'max_features': max_features, 'max_depth': max_depth})
        train_value = zero_one_score(Y_train, forest_fit.predict(X_train))
        test_value = zero_one_score(Y_test, pred)
        print('forest train: ',train_value, ' test: ', test_value)
        trials.append(((max_features, max_depth), test_value))

    winner = np.argmin([value for _setting, value in trials])
    print('best:', trials[winner][0])
def train_clf(x_train, y_train, best_depth):
    """
    Train classifier.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Training features.
    y_train : np.array [n_samples]
        Training labels
    best_depth : int
        Optimal max_depth parameter

    Returns
    -------
    clf : classifier
        Trained scikit-learn classifier
    """
    # 'balanced' is the current spelling of the old class_weight='auto'
    # option, which scikit-learn deprecated and then removed; passing 'auto'
    # raises on any recent release.
    clf = RFC(n_estimators=100, max_depth=best_depth, n_jobs=-1,
              class_weight='balanced', max_features=None)
    clf = clf.fit(x_train, y_train)
    return clf
def __init__(
        self, data_block, predictors=None, cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=None):
    """Set up a random-forest classification wrapper.

    Parameters
    ----------
    data_block
        Forwarded unchanged to ``base_classification.__init__``.
    predictors : list, optional
        Forwarded to the base class; defaults to a fresh empty list.
    cv_folds : int, optional
        Forwarded to the base class.
    scoring_metric : str, optional
        Forwarded to the base class.
    additional_display_metrics : list, optional
        Forwarded to the base class; defaults to a fresh empty list.
    """
    # A literal [] default would be created once and shared by every
    # instance; build a new list per call instead.
    if predictors is None:
        predictors = []
    if additional_display_metrics is None:
        additional_display_metrics = []

    base_classification.__init__(
        self, alg=RandomForestClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics
        )

    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"
    self.model_output['OOB_Score'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)
def test_improvement(self):
    """GridSearchOptimizer over ``max_depth`` must beat the depth-5 baseline.

    Both scores are computed on the training data itself.
    """
    np.random.seed(4)
    features, labels = make_classification(
        n_samples=100, n_features=45, n_informative=15, n_redundant=5,
        class_sep=1, n_clusters_per_class=4, flip_y=0.4)

    baseline = RandomForestClassifier(max_depth=5)
    baseline.fit(features, labels)
    baseline_score = clf_score(labels, baseline.predict(features))

    depth = Parameter('max_depth', 'integer', lower=1, upper=10)
    optimizer = GridSearchOptimizer(
        baseline, [depth], clf_score, {'max_depth': 5})
    _best_params, tuned = optimizer.fit(X_train=features, y_train=labels)

    tuned.fit(features, labels)
    tuned_score = clf_score(labels, tuned.predict(features))
    self.assertTrue(tuned_score > baseline_score)
def test_objective_function(self):
    """``objective`` must report a score of 1 on cleanly separated data.

    The forest is trained and evaluated on the same generated samples
    (``class_sep=100``, ``flip_y=0.0``).
    """
    np.random.seed(4)
    features, labels = make_classification(
        n_samples=100, n_features=10, n_informative=10, n_redundant=0,
        class_sep=100, n_clusters_per_class=1, flip_y=0.0)

    forest = RandomForestClassifier(max_depth=5)
    forest.fit(features, labels)

    # Same data serves as both the training and the evaluation set; the
    # model should fit the data perfectly.
    outcome = objective(forest, 'sklearn', clf_score,
                        features, labels, features, labels,
                        forest.get_params())
    self.assertEqual(outcome[0],1)
def test_expected_improvement_tractable(self):
    """BayesianOptimizer (expected improvement) must succeed and beat the baseline.

    Both scores are computed on the training data itself.
    """
    np.random.seed(5)
    features, labels = make_classification(
        n_samples=100, n_features=45, n_informative=15, n_redundant=5,
        class_sep=1, n_clusters_per_class=4, flip_y=0.4)

    baseline = RandomForestClassifier(max_depth=5)
    baseline.fit(features, labels)
    baseline_score = clf_score(labels, baseline.predict(features))

    depth = Parameter('max_depth', 'integer', lower=1, upper=10)
    optimizer = BayesianOptimizer(
        baseline, [depth], clf_score, method='expected_improvement')
    _best_params, tuned = optimizer.fit(
        X_train=features, y_train=labels, n_iters=10)
    self.assertTrue(optimizer.success)

    tuned.fit(features, labels)
    tuned_score = clf_score(labels, tuned.predict(features))
    self.assertTrue(tuned_score > baseline_score)
def test_probability_of_improvement_tractable(self):
    """BayesianOptimizer (probability of improvement) must succeed and beat the baseline.

    Both scores are computed on the training data itself.
    """
    np.random.seed(5)
    features, labels = make_classification(
        n_samples=100, n_features=45, n_informative=15, n_redundant=5,
        class_sep=1, n_clusters_per_class=4, flip_y=0.4)

    baseline = RandomForestClassifier(max_depth=5)
    baseline.fit(features, labels)
    baseline_score = clf_score(labels, baseline.predict(features))

    depth = Parameter('max_depth', 'integer', lower=1, upper=10)
    optimizer = BayesianOptimizer(
        baseline, [depth], clf_score, method='probability_of_improvement')
    _best_params, tuned = optimizer.fit(
        X_train=features, y_train=labels, n_iters=10)
    self.assertTrue(optimizer.success)

    tuned.fit(features, labels)
    tuned_score = clf_score(labels, tuned.predict(features))
    self.assertTrue(tuned_score > baseline_score)
def test_upper_confidence_bound_tractable(self):
    """BayesianOptimizer (upper confidence bound) must succeed and beat the baseline.

    Both scores are computed on the training data itself.
    """
    np.random.seed(5)
    features, labels = make_classification(
        n_samples=100, n_features=45, n_informative=15, n_redundant=5,
        class_sep=1, n_clusters_per_class=4, flip_y=0.4)

    baseline = RandomForestClassifier(max_depth=5)
    baseline.fit(features, labels)
    baseline_score = clf_score(labels, baseline.predict(features))

    depth = Parameter('max_depth', 'integer', lower=1, upper=10)
    optimizer = BayesianOptimizer(
        baseline, [depth], clf_score, method='upper_confidence_bound')
    _best_params, tuned = optimizer.fit(
        X_train=features, y_train=labels, n_iters=10)
    self.assertTrue(optimizer.success)

    tuned.fit(features, labels)
    tuned_score = clf_score(labels, tuned.predict(features))
    self.assertTrue(tuned_score > baseline_score)
def test_improvement(self):
    """RandomSearchOptimizer over ``max_depth`` must beat the depth-5 baseline.

    Both scores are computed on the training data itself.
    """
    np.random.seed(4)
    features, labels = make_classification(
        n_samples=100, n_features=45, n_informative=15, n_redundant=5,
        class_sep=1, n_clusters_per_class=4, flip_y=0.4)

    baseline = RandomForestClassifier(max_depth=5)
    baseline.fit(features, labels)
    baseline_score = clf_score(labels, baseline.predict(features))

    depth = Parameter('max_depth', 'integer', lower=1, upper=10)
    optimizer = RandomSearchOptimizer(baseline, [depth], clf_score)
    _best_params, tuned = optimizer.fit(
        X_train=features, y_train=labels, n_iters=10)

    tuned.fit(features, labels)
    tuned_score = clf_score(labels, tuned.predict(features))
    self.assertTrue(tuned_score > baseline_score)
def __init__(self, task: Task, scorer: Scorer, opt_logger: OptimizationLogger=VoidLogger(None)):
    """Pick the forest model and search space that match the task type.

    A task whose ``task`` attribute equals ``"classification"`` gets a
    ``RandomForestClassifier`` with the classification space; anything else
    gets a ``RandomForestRegressor`` with the regression space.
    """
    is_classification = task.task == "classification"
    spaces = RandomForestOptimizer.Params
    # Only the branch that applies is evaluated, as in an if/else statement.
    space = (spaces.classification_space if is_classification
             else spaces.regression_space)
    model = (ensemble.RandomForestClassifier() if is_classification
             else ensemble.RandomForestRegressor())
    super().__init__(model, task, space, scorer, opt_logger)
def learns(tests,trains,indep=lambda x: x[:-1], dep = lambda x: x[-1], rf = Abcd(), lg = Abcd(), dt = Abcd(), nb = Abcd()):
    """Train four learners and feed each prediction to its result logger.

    ``trainTest`` supplies the train/test features and labels. Each learner
    is fitted on the training part, then every test prediction is reported
    to the matching callable (``rf``, ``lg``, ``nb``, ``dt``) together with
    the actual label.
    """
    x1, y1, x2, y2 = trainTest(tests, trains, indep, dep)

    # (learner, logger) pairs, processed in this order.
    pairs = (
        (RandomForestClassifier(n_estimators = 50), rf),
        (linear_model.LogisticRegression(C=1e5), lg),
        (GaussianNB(), nb),
        (DecisionTreeClassifier(criterion="entropy", random_state=1), dt),
    )
    for learner, log in pairs:
        learner.fit(x1, y1)
        for position, got in enumerate(learner.predict(x2)):
            log(predicted = got, actual = y2[position])
def rforest(train, test, tunings=None, smoteit=True, duplicate=True):
    "RF "
    # Apply a random forest classifier and return its predictions for
    # ``test``.
    #
    # train, test : inputs accepted by ``formatData`` (and ``SMOTE`` for
    #     train).
    # tunings : optional sequence ``[n_estimators, max_features (percent),
    #     min_samples_leaf, min_samples_split]``; when falsy a 100-tree
    #     forest with ``random_state=1`` is used.
    # smoteit : when true, ``train`` is first resampled with ``SMOTE``.
    # duplicate : forwarded to ``SMOTE`` as ``resample``.
    if smoteit:
        train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)
    if not tunings:
        clf = RandomForestClassifier(n_estimators=100, random_state=1)
    else:
        clf = RandomForestClassifier(
            n_estimators=int(tunings[0]),
            # Divide by a float so an integer percentage is not truncated
            # to 0 by Python 2 integer division.
            max_features=tunings[1] / 100.0,
            min_samples_leaf=int(tunings[2]),
            min_samples_split=int(tunings[3])
        )
    train_DF = formatData(train)
    test_DF = formatData(test)
    # The last two columns are not features; the second-to-last one is the
    # class label.
    features = train_DF.columns[:-2]
    klass = train_DF[train_DF.columns[-2]]
    clf.fit(train_DF[features], klass)
    preds = clf.predict(test_DF[test_DF.columns[:-2]])
    return preds
def __init__(self, estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1, max_features=1., min_samples_leaf=5, max_depth=5), n_folds=2, stratify=True, random_state=1):
    """Store the configuration and reset the private fit state.

    Parameters
    ----------
    estimator : estimator, optional
        Stored as-is.
        NOTE(review): the default instance is created once when the
        function is defined, so objects relying on the default share it --
        confirm this is acceptable.
    n_folds : int, optional
    stratify : bool, optional
    random_state : int, optional
    """
    self.estimator, self.n_folds = estimator, n_folds
    self.stratify, self.random_state = stratify, random_state

    # Private state starts empty; nothing has been fitted yet.
    self.__cv = self.__pred = self.__target = None
    self.__fitOK = False
def test_stacked_classfier_extkfold(self):
    """StackedClassifier with an externally supplied fold object scores > 0.9 on iris.

    The score is computed on the same iris data the stack was fitted on.
    """
    iris = self.iris
    meta_learner = LogisticRegression(random_state=1)
    base_learners = [
        RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
        RidgeClassifier(random_state=1),
    ]
    folds = StratifiedKFold(iris.target, 3)
    stack = StackedClassifier(meta_learner,
                              base_learners,
                              n_folds=3,
                              verbose=0,
                              Kfold=folds,
                              stack_by_proba=False,
                              oob_score_flag=True,
                              oob_metrics=log_loss)
    stack.fit(iris.data, iris.target)
    score = stack.score(iris.data, iris.target)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def test_fwls_classfier(self):
    """FWLSClassifier with an all-ones feature function scores > 0.9 on iris.

    The score is computed on the same iris data the model was fitted on.
    """
    def all_ones(x):
        # Feature function returning an array of ones shaped like its input.
        return np.ones(x.shape)

    iris = self.iris
    meta_learner = LogisticRegression(random_state=1)
    base_learners = [
        RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
        RidgeClassifier(random_state=1),
    ]
    folds = StratifiedKFold(iris.target, 3)
    model = FWLSClassifier(meta_learner,
                           base_learners,
                           feature_func=all_ones,
                           n_folds=3,
                           verbose=0,
                           Kfold=folds,
                           stack_by_proba=False)
    model.fit(iris.data, iris.target)
    score = model.score(iris.data, iris.target)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def test_classifier(self):
    """JoblibedClassifier predicts iris well and returns the same labels on a second run.

    The first wrapper is fitted with an explicit index; the second wraps a
    differently configured forest under the same name ``"rf"`` and is fitted
    without one. Both prediction arrays must be identical.
    """
    iris = self.iris
    index = list(range(len(iris.data)))

    first = JoblibedClassifier(RandomForestClassifier(), "rf", cache_dir='')
    first.fit(iris.data, iris.target, index)
    prediction = first.predict(iris.data, index)
    score = accuracy_score(iris.target, prediction)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))

    second = JoblibedClassifier(
        RandomForestClassifier(n_estimators=20), "rf", cache_dir='')
    second.fit(iris.data, iris.target)
    index = list(range(len(iris.data)))
    prediction2 = second.predict(iris.data, index)
    self.assertTrue((prediction == prediction2).all())
def prec_rf(n_trees, X_train, y_train, X_test, y_test):
    """
    Fit a random forest and log the fraction of test labels predicted exactly.

    Dense inputs are flattened to two dimensions (one row per sample) before
    fitting; sparse inputs are passed through untouched.

    Returns
    -------
    (clf, y_pred) : the fitted forest and its predictions for ``X_test``.
    """
    from sklearn.ensemble import RandomForestClassifier

    def as_rows(matrix):
        # Sparse matrices are left alone; dense arrays become (n_samples, -1).
        if issparse(matrix):
            return matrix
        return matrix.reshape((matrix.shape[0], -1))

    X_train = as_rows(X_train)
    X_test = as_rows(X_test)
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))

    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    hits = np.sum(y_pred == y_test)
    prec = float(hits) / len(y_test)
    LOGGER.info('prec_rf{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred
def get_toy_config():
    """Return a small cascade configuration as a nested dict.

    The single top-level key ``"cascade"`` holds the run settings and a list
    of four estimator specifications. A new structure is built on every call.
    """
    return {
        "cascade": {
            "random_state": 0,
            "max_layers": 100,
            "early_stopping_rounds": 3,
            "n_classes": 10,
            "estimators": [
                {"n_folds": 5, "type": "XGBClassifier", "n_estimators": 10,
                 "max_depth": 5, "objective": "multi:softprob",
                 "silent": True, "nthread": -1, "learning_rate": 0.1},
                {"n_folds": 5, "type": "RandomForestClassifier",
                 "n_estimators": 10, "max_depth": None, "n_jobs": -1},
                {"n_folds": 5, "type": "ExtraTreesClassifier",
                 "n_estimators": 10, "max_depth": None, "n_jobs": -1},
                {"n_folds": 5, "type": "LogisticRegression"},
            ],
        },
    }
def parameterChoosing(self):
    """Grid-search random-forest parameters and print a report.

    Searches ``max_depth``, ``n_estimators`` and ``max_features`` with
    5-fold cross-validation scored by ``'precision_weighted'`` on
    ``self.X_train`` / ``self.y_train``, then prints the best parameters,
    the per-setting scores and a classification report for
    ``self.X_test`` / ``self.y_test``.

    Every print is a single-argument call, so the output is the same
    under Python 2 and the code is also valid Python 3.
    """
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20,60),
                         'n_estimators': range(10,40),
                         'max_features': ['sqrt', 'log2', None]
                         }
                        ]

    clf = GridSearchCV(RandomForestClassifier(n_estimators=30), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    # NOTE(review): grid_scores_ only exists on older scikit-learn releases;
    # confirm the targeted version before upgrading.
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
def get_classifier(self):
    """Build the classifier selected by ``self.algo``.

    Returns
    -------
    A new, unfitted estimator for a known algorithm code, or ``0`` when the
    code is not recognised.
    """
    # Factories are only invoked for the matching code, so nothing is
    # constructed for the others.
    factories = (
        ("GBT", lambda: GradientBoostingClassifier()),
        ("RF", lambda: RandomForestClassifier()),
        ("ADB", lambda: AdaBoostClassifier()),
        ("DT", lambda: DecisionTreeClassifier()),
        ("NB", lambda: BernoulliNB()),
        ("SGD", lambda: SGDClassifier()),
        ("SVC", lambda: LinearSVC()),
        ("MLPC", lambda: MLPClassifier(
            activation='logistic', batch_size='auto', early_stopping=True,
            hidden_layer_sizes=(100,), learning_rate='adaptive',
            learning_rate_init=0.1, max_iter=5000, random_state=1,
            solver='lbfgs', tol=0.0001, validation_fraction=0.1,
            verbose=False, warm_start=False)),
    )
    requested = self.algo
    for code, build in factories:
        if requested == code:
            return build()
    return 0
def performRFClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    Random Forest Binary Classification

    Fits a 100-tree forest on the training split and returns its score on
    the test split. ``fout`` and ``savemodel`` are not used: the code that
    pickled the model to a timestamped file is disabled.
    """
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    forest.fit(X_train, y_train)
    return forest.score(X_test, y_test)
def performRFClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    Random Forest Binary Classification

    Fits a 100-tree forest on the training split and prints its score on the
    test split as ``RF:  <score>``. Nothing is returned. ``fout`` and
    ``savemodel`` are not used: the code that pickled the model to a
    timestamped file is disabled.
    """
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)

    # Single pre-formatted argument: prints the same text as the Python 2
    # statement `print "RF: ", accuracy` and is also valid Python 3.
    print("RF:  %s" % (accuracy,))
def random_forest(self, sensors_set):
    """Train a random forest on the features of ``sensors_set`` and save the results.

    Prints progress information and the test accuracy, then writes a CSV
    with one row per feature (accuracy, feature name, importance), sorted by
    descending importance, into ``const.DIR_RESULTS``. The directory is
    created when missing.
    """
    dataset = self.dataset
    features = list(dataset.get_sensors_set_features(sensors_set))
    print("RANDOM FOREST.....")
    print("CLASSIFICATION BASED ON THESE SENSORS: ", dataset.get_remained_sensors(sensors_set))
    print("NUMBER OF FEATURES: ", len(features))

    train_features, train_classes, test_features, test_classes = self.__get_sets_for_classification(
        dataset.get_train, dataset.get_test, features)

    forest = RandomForestClassifier(n_estimators=const.PAR_RF_ESTIMATOR)
    forest.fit(train_features, train_classes)
    acc = accuracy_score(test_classes, forest.predict(test_features))

    # The scalar accuracy is repeated on every feature row.
    ranking = pd.DataFrame(
        {'accuracy': acc, 'featureName': features, 'importance': forest.feature_importances_})
    ranking = ranking.sort_values(by='importance', ascending=False)
    print("ACCURACY : " + str(acc))
    print("END RANDOM FOREST")

    if not os.path.exists(const.DIR_RESULTS):
        os.makedirs(const.DIR_RESULTS)
    target_file = const.DIR_RESULTS + "/" + str(sensors_set) + const.FILE_RANDOM_FOREST_RESULTS
    ranking.to_csv(target_file, index=False)

# neural network algorithm: trained on the whole train set and tested on the whole test set
def setUpClass(self):
    """
    Prepare the shared fixtures: load the Boston dataset, bin its target
    into classes and train a random forest on the result.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    boston = load_boston()
    values = boston.target
    # Turn the continuous target into class labels using the bin edges of
    # its histogram.
    _counts, bin_edges = np.histogram(values)
    binned = np.digitize(values, bin_edges) - 1

    forest = RandomForestClassifier(random_state = 1)
    forest.fit(boston.data, binned)

    # Keep the data, the labels and the model for the tests.
    self.scikit_data = boston
    self.target = binned
    self.scikit_model = forest
def test_random_forest_classifier(self):
    """Converted forest must match scikit-learn on one sample for every dtype.

    For each dtype the data is cast, the target is binarised around its
    mean, the model is converted, and the first sample's prediction is
    compared. A ``RuntimeError`` during the comparison is reported as an
    unsupported dtype instead of failing the test.
    """
    for dtype in self.number_data_type:
        forest = RandomForestClassifier(random_state=1)
        data = self.scikit_data['data'].astype(dtype)
        typed_target = self.scikit_data['target'].astype(dtype)
        target = typed_target > typed_target.mean()

        forest, spec = self._sklearn_setup(forest, dtype, data, target)
        sample = data[0].reshape(1, -1)
        self._check_tree_model(spec, 'multiArrayType', 'int64Type', 2)
        coreml_model = create_model(spec)

        try:
            expected = forest.predict(sample)[0]
            converted = bool(int(coreml_model.predict({'data': sample})['target']))
            self.assertEqual(
                expected, converted,
                msg="{} != {} for Dtype: {}".format(expected, converted, dtype))
        except RuntimeError:
            print("{} not supported. ".format(dtype))
def __init__(self, outputs, inputs, k=None, hypers=None, params=None,
        distargs=None, rng=None):
    """Initialise a single-output model backed by a random forest.

    Parameters
    ----------
    outputs : list
        Exactly one output index, which must not appear in ``inputs``.
    inputs : list
        At least one input index.
    k : int, optional
        Number of output categories; read from ``distargs['k']`` when None.
    hypers : optional
        Not used here.
    params : dict, optional
        May provide ``'alpha'`` (default ``.1``) and ``'forest'`` (a
        ready-made classifier; a new ``RandomForestClassifier`` is built
        when absent or None).
    distargs : dict
        Must contain ``distargs['inputs']['stattypes']`` with one entry per
        input. Despite the ``None`` default, it is always indexed.
    rng : optional
        Random state; ``gu.gen_rng()`` supplies one when None.
    """
    # Initialised once (the original assigned this twice, drawing a
    # throwaway generator when rng was None).
    self.rng = gu.gen_rng() if rng is None else rng
    self.outputs = outputs
    self.inputs = inputs
    assert len(self.outputs) == 1
    assert len(self.inputs) >= 1
    assert self.outputs[0] not in self.inputs
    assert len(distargs['inputs']['stattypes']) == len(self.inputs)
    self.stattypes = distargs['inputs']['stattypes']
    # Number of output categories and input dimension.
    # XXX WHATTA HACK. BayesDB passes in top-level kwargs, not in distargs.
    self.k = k if k is not None else int(distargs['k'])
    self.p = len(distargs['inputs']['stattypes'])
    # Sufficient statistics.
    self.N = 0
    self.data = Data(x=OrderedDict(), Y=OrderedDict())
    self.counts = [0] * self.k
    # Outlier and random forest parameters.
    if params is None:
        params = {}
    self.alpha = params.get('alpha', .1)
    self.regressor = params.get('forest', None)
    if self.regressor is None:
        self.regressor = RandomForestClassifier(random_state=self.rng)
def rf_categorize(email):
    """Predict which collection ``email`` belongs to.

    Every record in every collection of the local database is used as
    training data: the collection name is the label and the record's
    ``'Text'`` field is the document. Documents are TF-IDF vectorised and a
    random forest with ``int(sqrt(n_features)) + 1`` trees is fitted.

    Returns
    -------
    The predicted label for ``email``.
    """
    # get training corpus
    db = utils.get_local_db()
    labels = []
    documents = []
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            labels.append(collection)
            documents.append(record['Text'])

    # vectorize corpus (densified) and the input email
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents).toarray()
    email_vector = vectorizer.transform([email])

    # create random forest and return prediction
    n_trees = int(sqrt(len(X[0]))) + 1
    forest = RandomForestClassifier(n_estimators = n_trees)
    forest.fit(X, labels)
    return forest.predict(email_vector)[0]
def get_classifier(method='logistic_regression'):
    """Build an unfitted classifier for the given method name.

    Parameters
    ----------
    method : str
        ``'logistic_regression'``, ``'random_forest'`` or
        ``'gradient_boosting'``.

    Returns
    -------
    The configured estimator, or ``None`` when ``method`` is none of the
    names above.
    """
    classifier = None
    if method == 'logistic_regression':
        classifier = LogisticRegression(C=1e3,
                                        tol=0.01,
                                        multi_class='ovr',
                                        solver='liblinear',
                                        n_jobs=-1,
                                        random_state=123)
    elif method == 'random_forest':
        classifier = RandomForestClassifier(n_estimators=250,
                                            bootstrap=False,
                                            n_jobs=-1,
                                            random_state=123)
    elif method == 'gradient_boosting':
        classifier = xgb.XGBClassifier(max_depth=10,
                                       subsample=0.7,
                                       n_estimators=500,
                                       min_child_weight=0.05,
                                       colsample_bytree=0.3,
                                       learning_rate=0.1)
    return classifier
def applyRandomForestClassifier(self, train, test):
    """Fit a random forest on ``train`` and add predictions to ``test``.

    Parameters
    ----------
    train : DataFrame
        Must contain a ``"Survived"`` column (the target); every column
        except ``"PassengerId"`` and ``"Survived"`` is used as a feature.
    test : DataFrame
        Every column except ``"PassengerId"`` is used as a feature. A
        ``"Survived"`` column holding the predictions is written into this
        frame in place.

    Returns
    -------
    The same ``test`` frame, now including ``"Survived"``.
    """
    # init algorithm
    RFC = RandomForestClassifier()

    # training target as a 1-D column: passing the single-column frame
    # train[["Survived"]] hands scikit-learn a column vector where a 1-D
    # array is expected.
    y_train = train["Survived"]
    x_train = train[train.columns.difference(["PassengerId","Survived"])]

    # fitting
    RFC.fit(x_train, y_train)
    result = RFC.predict(test[test.columns.difference(["PassengerId"])])

    self.writeMessage("current training score")
    # Single-argument call: same output under Python 2, valid Python 3.
    print(RFC.score(x_train, y_train))

    test["Survived"] = result
    return test
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data

    Parameters
    ----------
    dataset : DataFrame
        All columns but the last are features; the ``'UpDown'`` column is
        the target.
    method : str
        One of ``'RNN'``, ``'RF'``, ``'KNN'``, ``'SVM'``, ``'ADA'``.
    parameters : sequence
        Only read for ``'SVM'``: ``parameters[0]`` is ``C`` and
        ``parameters[1]`` is ``gamma``.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. (The original fell
        through to an unbound local and failed with UnboundLocalError.)
    """
    supported = ('RNN', 'RF', 'KNN', 'SVM', 'ADA')
    if method not in supported:
        raise ValueError(
            "Unsupported method {!r}; expected one of: {}".format(
                method, ', '.join(supported)))

    features = dataset.columns[0:-1]

    if method == 'RNN':
        # This helper returns an already trained model.
        return performRNNlass(dataset[features], dataset['UpDown'])

    if method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()
    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)
    else:  # 'ADA'
        clf = AdaBoostClassifier()

    return clf.fit(dataset[features], dataset['UpDown'])
def build_model(self, X_train, y_train):
    """Return a previously saved model or configure a new random forest.

    When ``self.paras.load`` equals ``True`` and
    ``self.load_training_model`` yields a model, that model is returned
    immediately. Otherwise ``self.best_window`` is asked for the tree count
    and feature count, and a new, unfitted ``RandomForestClassifier`` built
    from them is returned.
    """
    # Explicit comparison kept on purpose: only True (or a value equal to
    # it) enables loading.
    if self.paras.load == True:
        model = self.load_training_model(self.paras.window_len)
        # Identity check instead of `!= None`, which would invoke the
        # model's own comparison operators.
        if model is not None:
            return model

    print('build Random Forrest model...')

    # NOTE(review): `index` is not defined in this function; presumably a
    # module-level name -- confirm against the rest of the file.
    # range of number of trees : 5*(1 -> 10) = 5,10,...,50 trees
    t_min = self.paras.tree_min[index]
    t_max = self.paras.tree_max[index]
    # range of max of features : 1 -> 10 features
    f_min = self.paras.feature_min[index]
    f_max = self.paras.feature_max[index]
    # range of window : 1 -> 70 days
    w_min = self.paras.window_min
    w_max = self.paras.window_max

    # The first returned value (window) is not needed to build the model.
    _w_opt, n_opt, m_opt = self.best_window(
        X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max)
    model = RandomForestClassifier(n_estimators=n_opt, max_features=m_opt,
                                   n_jobs=8, verbose=self.paras.verbose)
    return model
def __init__(self, info, verbose=True, debug_mode=False):
    """Choose a model and prediction method from the dataset description.

    Parameters
    ----------
    info : dict
        Must provide ``'label_num'``, ``'target_num'``, ``'task'`` and
        ``'metric'``; ``'is_sparse'`` is read for regression tasks, and
        ``'has_categorical'`` (then possibly ``'is_sparse'``) otherwise.
    verbose : bool or int, optional
        Forwarded to the scikit-learn estimators.
    debug_mode : int, optional
        A value >= 2 selects ``RandomPredictor`` and skips model selection.
    """
    self.label_num=info['label_num']
    self.target_num=info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    # To calibrate proba (balance=True was an earlier alternative).
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)

    if debug_mode>=2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return

    if info['task']=='regression':
        if info['is_sparse']==True:
            self.name = "BaggingRidgeRegressor"
            # unfortunately, no warm start...
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
        self.predict_method = self.model.predict
    else:
        if info['has_categorical']:
            # Out of laziness, we do not convert categorical variables...
            self.name = "RandomForestClassifier"
            # unfortunately, no warm start...
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            # unfortunately, no warm start...
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingClassifier"
            # Direct constructor call; the original assembled this same
            # call as source text and ran it through eval().
            self.model = GradientBoostingClassifier(n_estimators=1, verbose=verbose, random_state=1, warm_start=True)
        if info['task']=='multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        # Classification always predicts probabilities.
        self.predict_method = self.model.predict_proba
def try_params( n_iterations, params ):
    """Train and evaluate a forest sized by the iteration budget.

    Parameters
    ----------
    n_iterations : number
        Budget; multiplied by the module-level ``trees_per_iteration`` and
        rounded to get the number of trees.
    params : dict
        Extra keyword arguments for the ``RF`` constructor.

    Returns
    -------
    Whatever ``train_and_eval_sklearn_classifier`` returns for the
    classifier and the module-level ``data``.
    """
    n_estimators = int( round( n_iterations * trees_per_iteration ))
    # Single pre-formatted argument: prints the same text as the Python 2
    # statement `print "n_estimators:", n_estimators` and is valid Python 3.
    print( "n_estimators: %s" % ( n_estimators, ))
    pprint( params )

    clf = RF( n_estimators = n_estimators, verbose = 0, n_jobs = -1, **params )

    return train_and_eval_sklearn_classifier( clf, data )
def run_predict_random_forest(X_train,Y_train,X_test,Y_test, n_estimators=30, max_features=500, show_mistakes=False):
    """Fit a wrapped random forest, print its scores and metrics, return the wrapper.

    NOTE(review): ``n_estimators`` and ``max_features`` are accepted but not
    read; the forest is always built with 10 trees, 20 features and depth
    10 -- confirm whether that is intended.

    Returns
    -------
    The ``SKClassifier`` wrapper around the forest.
    """
    clf = SKClassifier(
        RandomForestClassifier(n_estimators=10, max_features=20, max_depth=10))
    forest_fit = clf.fit(X_train, Y_train)
    pred = forest_fit.predict(X_test)

    train_value = zero_one_score(Y_train, forest_fit.predict(X_train))
    test_value = zero_one_score(Y_test, pred)
    print('\n Random forest 0-1 error. \n Train: ',train_value, '\n Test: ', test_value)

    met = clf.metrics(X_test,Y_test)
    if show_mistakes:
        # Return value is not used afterwards; the call itself is kept.
        clf.show_mistakes(X_test,Y_test,10)
    print('Metrics:', met)
    return clf