我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.ensemble.GradientBoostingClassifier()。
def get_feature_importance(self, clf, model_name):
    """Return the fitted model's feature weights as a plain Python list.

    Args:
        clf: Fitted estimator; must expose ``feature_importances_`` or
            ``coef_`` when ``model_name`` maps to that attribute.
        model_name: Key identifying the estimator type, e.g.
            ``'RandomForestClassifier'`` or ``'svm.SVC'``.

    Returns:
        ``list(clf.feature_importances_)`` for the tree/boosting keys,
        ``clf.coef_`` converted to nested lists for the linear keys, and
        ``None`` for ``'GaussianNB'`` and ``'KNeighborsClassifier'``.

    Raises:
        KeyError: If ``model_name`` is not one of the known keys.
    """
    importance_keys = (
        'RandomForestClassifier',
        'ExtraTreesClassifier',
        'AdaBoostClassifier',
        'GradientBoostingClassifier',
        'DecisionTreeClassifier',
    )
    coef_keys = ('LogisticRegression', 'svm.SVC', 'SGDClassifier', 'linear.SVC')
    no_weight_keys = ('GaussianNB', 'KNeighborsClassifier')

    attribute_for = dict.fromkeys(importance_keys, 'feature_importances')
    attribute_for.update(dict.fromkeys(coef_keys, 'coef'))
    attribute_for.update(dict.fromkeys(no_weight_keys))

    # Plain indexing on purpose: unknown model names raise KeyError.
    attribute = attribute_for[model_name]
    if attribute is None:
        return None
    if attribute == 'coef':
        return list(clf.coef_.tolist())
    return list(clf.feature_importances_)
def define_model(self, model, parameters, n_cores=0):
    """Build the estimator registered under ``model`` and apply ``parameters``.

    Args:
        model: Key of the estimator to build (e.g. 'svm.SVC').
        parameters: Mapping of hyper-parameters forwarded to ``set_params``.
        n_cores: Accepted but not used by this method.

    Returns:
        The configured, unfitted estimator.

    Raises:
        ConfigError: If ``model`` is not a supported key.
    """
    catalogue = {
        'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
        'ExtraTreesClassifier': ExtraTreesClassifier(
            n_estimators=10, n_jobs=7, criterion='entropy'),
        'AdaBoostClassifier': AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
        'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GradientBoostingClassifier': GradientBoostingClassifier(
            learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'GaussianNB': GaussianNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
        'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
        'linear.SVC': svm.LinearSVC(),
    }

    estimator = catalogue.get(model)
    if estimator is None:
        raise ConfigError("Unsupported model {}".format(model))

    # Caller-supplied values override the defaults chosen above.
    estimator.set_params(**parameters)
    return estimator
def learn(x, y, test_x):
    """Fit a sample-weighted gradient boosting classifier and predict ``test_x``.

    Each training sample is weighted according to its label; the weights
    and all hyper-parameters are read from the ``variables`` module.

    Args:
        x: Training features.
        y: Training labels; each must be one of the strings
            "0", "1000", "1500" or "2000".
        test_x: Features to predict.

    Returns:
        The predictions of the fitted classifier for ``test_x``.

    Raises:
        ValueError: If ``y`` contains a label with no configured weight.
    """
    class_weights = {
        "0": variables.weight_0_gdbt_b,
        "1000": variables.weight_1000_gdbt_b,
        "1500": variables.weight_1500_gdbt_b,
        "2000": variables.weight_2000_gdbt_b,
    }

    # One weight per sample. Skipping an unknown label would leave the weight
    # list shorter than y and misaligned, so fail loudly instead.
    weight_list = []
    for label in y:
        if label not in class_weights:
            raise ValueError(
                "No sample weight configured for label {!r}".format(label))
        weight_list.append(class_weights[label])

    clf = GradientBoostingClassifier(
        loss='deviance',
        n_estimators=variables.n_estimators_gdbt_b,
        learning_rate=variables.learning_rate_gdbt_b,
        max_depth=variables.max_depth_gdbt_b,
        random_state=0,
        min_samples_split=variables.min_samples_split_gdbt_b,
        min_samples_leaf=variables.min_samples_leaf_gdbt_b,
        subsample=variables.subsample_gdbt_b,
    ).fit(x, y, weight_list)

    return clf.predict(test_x)
def __init__(
        self, data_block, predictors=None, cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=None):
    """Set up a gradient boosting classification wrapper.

    Args:
        data_block: Data container forwarded to ``base_classification``.
        predictors: Predictor names; defaults to a new empty list.
        cv_folds: Number of cross-validation folds.
        scoring_metric: Name of the primary scoring metric.
        additional_display_metrics: Extra metrics to report; defaults to a
            new empty list.
    """
    # A literal [] default is created once at definition time and shared by
    # every call; build a fresh list per instance instead.
    if predictors is None:
        predictors = []
    if additional_display_metrics is None:
        additional_display_metrics = []

    base_classification.__init__(
        self, alg=GradientBoostingClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics
    )

    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)
def createPipeline(self):
    """Assign ``self.pipeline`` a one-step Pipeline holding the classifier.

    The single step is named 'model' and is a GradientBoostingClassifier
    whose hyper-parameters are all read from ``self.conf``.
    """
    self.pipeline = Pipeline([
        ('model', GradientBoostingClassifier(
            loss=self.conf.loss,
            learning_rate=self.conf.learning_rate,
            n_estimators=self.conf.n_estimators,
            criterion=self.conf.criterion,
            max_depth=self.conf.max_depth,
            min_samples_split=self.conf.min_samples_split,
            min_samples_leaf=self.conf.min_samples_leaf,
            min_weight_fraction_leaf=self.conf.min_weight_fraction_leaf,
            subsample=self.conf.subsample,
            max_features=self.conf.max_features,
            max_leaf_nodes=self.conf.max_leaf_nodes,
            # The configured value is an impurity *decrease* threshold, so it
            # must go to the parameter of the same name; handing it to
            # min_impurity_split applied it with a different meaning.
            min_impurity_decrease=self.conf.min_impurity_decrease,
            presort=self.conf.presort))])
def get_classifier(self):
    """Return a new, unfitted classifier for the code stored in ``self.algo``.

    Recognised codes: "GBT", "RF", "ADB", "DT", "NB", "SGD", "SVC", "MLPC".

    Returns:
        A new estimator instance, or ``0`` when the code is not recognised.
    """
    # Factories are wrapped in lambdas so only the selected estimator is
    # ever looked up and constructed.
    factories = (
        ("GBT", lambda: GradientBoostingClassifier()),
        ("RF", lambda: RandomForestClassifier()),
        ("ADB", lambda: AdaBoostClassifier()),
        ("DT", lambda: DecisionTreeClassifier()),
        ("NB", lambda: BernoulliNB()),
        ("SGD", lambda: SGDClassifier()),
        ("SVC", lambda: LinearSVC()),
        ("MLPC", lambda: MLPClassifier(
            activation='logistic', batch_size='auto', early_stopping=True,
            hidden_layer_sizes=(100,), learning_rate='adaptive',
            learning_rate_init=0.1, max_iter=5000, random_state=1,
            solver='lbfgs', tol=0.0001, validation_fraction=0.1,
            verbose=False, warm_start=False)),
    )

    requested = self.algo
    for code, build in factories:
        if requested == code:
            return build()
    return 0
def _train_convert_evaluate(self, **scikit_params):
    """
    Train a scikit-learn model, convert it and then evaluate it with CoreML

    Args:
        **scikit_params: Extra keyword arguments for
            GradientBoostingClassifier (``random_state`` is fixed to 1).

    Returns:
        The metrics returned by ``evaluate_classifier``.
    """
    model = GradientBoostingClassifier(random_state=1, **scikit_params)
    model.fit(self.X, self.target)

    # Converted spec of the fitted model.
    coreml_spec = skl_converter.convert(
        model, self.feature_names, self.output_name)

    # Reference frame: the input features plus scikit-learn's own predictions.
    frame = pd.DataFrame(self.X, columns=self.feature_names)
    frame['prediction'] = model.predict(self.X)

    return evaluate_classifier(coreml_spec, frame)
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.

    The continuous Boston target is turned into class labels by binning it
    with the edges returned by ``np.histogram``.
    """
    from sklearn.datasets import load_boston
    import numpy as np

    boston = load_boston()
    prices = boston.target
    bin_edges = np.histogram(prices)[1]
    binned_target = np.digitize(prices, bin_edges) - 1

    model = GradientBoostingClassifier(random_state=1)
    model.fit(boston.data, binned_target)

    # Save the data and the model
    self.target = binned_target
    self.scikit_data = boston
    self.scikit_model = model
def GradientBoostingDecisionTree_Export(action):
    """Run the YouTube/Twitter/combined export with a gradient boosting model.

    Prompts for the data directory on stdin, then dispatches on ``action``:
    1 runs the popularity exports, 2 the virality exports, and any other
    value the combined virality-and-popularity exports.

    Args:
        action: Value convertible with ``int()`` selecting the export set.
    """
    # Setting our classifier to Gradient Boosting
    clf = GradientBoostingClassifier()
    data_dir = input('Give Data Directory: ')
    mode = int(action)

    print('Loading Data')
    if mode == 1:
        PopularityClassifier.loadData(data_dir)
        PopularityClassifier.youtubePopular(data_dir, clf, 2)
        PopularityClassifier.twitterPopular(data_dir, clf, 2)
        PopularityClassifier.bothPopular(data_dir, clf, 2)
        return
    if mode == 2:
        ViralityClassifier.loadData(data_dir)
        ViralityClassifier.youtubeViral(data_dir, clf, 2)
        ViralityClassifier.twitterViral(data_dir, clf, 2)
        ViralityClassifier.bothViral(data_dir, clf, 2)
        return

    ViralityAndPopularityClassifier.loadData(data_dir)
    ViralityAndPopularityClassifier.youtubeViralAndPopular(data_dir, clf, 2)
    ViralityAndPopularityClassifier.twitterViralAndPopular(data_dir, clf, 2)
    ViralityAndPopularityClassifier.bothViralAndPopular(data_dir, clf, 2)
def classify():
    """Fit a gradient boosting model and print its accuracy in percent.

    Works on module-level data: trains on ``X``/``YP``, predicts ``videos``
    and compares the predictions element-wise with ``valPop``. Prints the
    shape of ``valVir``, the predictions and the accuracy.
    """
    # Predict Popularity
    gbdt = GradientBoostingClassifier()
    gbdt.fit(X, YP)
    print(valVir.shape)

    prediction = gbdt.predict(videos)
    print(prediction)

    same = sum(1 for i in range(valPop.size) if valPop[i] == prediction[i])
    # float() keeps this a true division on Python 2 as well, where
    # int / int would truncate the ratio to 0 or 1.
    accuracy = float(same) / valPop.size * 100
    print(accuracy)
def classify_user_item(train_data_new, test_data_new, result9):
    """Train a random forest and write class probabilities for the test set.

    Args:
        train_data_new: Path readable by ``np.loadtxt``; the last column is
            the label, all preceding columns are features.
        test_data_new: Path readable by ``np.loadtxt`` holding features only.
        result9: Output path; one line per test row containing ``str()`` of
            that row's probability vector.
    """
    data = np.loadtxt(train_data_new)
    X = data[:, :-1]  # every column except the last
    y = data[:, -1]   # last column
    print(X)
    print(y)

    print('start train')
    # Other estimators tried here: GradientBoostingClassifier(),
    # LogisticRegression().
    clf2 = RandomForestClassifier(n_estimators=100)
    clf2.fit(X, y)
    print(clf2.classes_)

    data1 = np.loadtxt(test_data_new)
    X_test = data1[:, :]
    print('testing data is ok')

    result = clf2.predict_proba(X_test)
    print('output result')
    print(result)

    # The context manager guarantees the file is flushed and closed.
    with open(result9, 'w') as f_result:
        for row in result:
            f_result.write(str(row) + '\n')
def GradientBoostingClassifier(X_train, y_train, X_test):
    """Train a gradient boosting model and save probabilities to a CSV.

    Prints timestamps at start, after training and after prediction, and
    writes column index 1 of ``predict_proba(X_test)`` to
    'GBC_result_all.csv' under the header 'GBC_predictions'.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to score.
    """
    # Local import: within this function the name refers to the sklearn
    # class rather than to this same-named function.
    from sklearn.ensemble import GradientBoostingClassifier

    now = datetime.datetime.now()
    print("GradientBoostingClassifier start in " + now.strftime('%Y-%m-%d %H:%M:%S'))

    # n_jobs is not a GradientBoostingClassifier parameter; passing it
    # raises TypeError in the constructor, so it is not supplied here.
    GBC = GradientBoostingClassifier(max_features='sqrt',
                                     n_estimators=300,
                                     learning_rate=0.02,
                                     max_depth=8,
                                     subsample=0.8)
    GBC.fit(X_train, y_train)
    now = datetime.datetime.now()
    print("GradientBoostingClassifier train done in " + now.strftime('%Y-%m-%d %H:%M:%S'))

    y_pred_GBC = GBC.predict_proba(X_test)
    y_pred_GBC = pd.DataFrame(y_pred_GBC[:, 1:2], columns=['GBC_predictions'])
    y_pred_GBC.to_csv('GBC_result_all.csv', index=False)
    now = datetime.datetime.now()
    print("GradientBoostingClassifier predict done in " + now.strftime('%Y-%m-%d %H:%M:%S'))
def GradientBoostingClassifier(X_train, y_train, X_test):
    """Train a gradient boosting model and save probabilities to a CSV.

    Prints timestamps at start, after training and after prediction, and
    writes column index 1 of ``predict_proba(X_test)`` to
    'GBC_result_1.csv' under the header 'GBC_predictions'.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to score.
    """
    # Local import: within this function the name refers to the sklearn
    # class rather than to this same-named function.
    from sklearn.ensemble import GradientBoostingClassifier

    def timestamp():
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    print("GradientBoostingClassifier start in " + timestamp())

    hyper_params = dict(
        max_features='sqrt',
        n_estimators=300,
        learning_rate=0.02,
        max_depth=8,
        subsample=0.8,
    )
    booster = GradientBoostingClassifier(**hyper_params)
    booster.fit(X_train, y_train)
    print("GradientBoostingClassifier train done in " + timestamp())

    probabilities = booster.predict_proba(X_test)
    frame = pd.DataFrame(probabilities[:, 1:2], columns=['GBC_predictions'])
    frame.to_csv('GBC_result_1.csv', index=False)
    print("GradientBoostingClassifier predict done in " + timestamp())
def on_startup(app):
    """Wire up the bot's shared services when the application starts.

    Creates the HTTP session and the Telegram bot, fits the image model from
    ``app['config'].sample_df``, registers the dependency-injection bindings,
    sets up logging and schedules ``bot.set_hook()`` on the app's loop.

    :param app: application object providing ``app['config']`` and ``app.loop``
    """
    # At most 5 simultaneous connections; DNS lookups are cached.
    connector = aiohttp.TCPConnector(limit=5, use_dns_cache=True, loop=app.loop)
    session = aiohttp.ClientSession(connector=connector, raise_for_status=True)
    bot = TelegramBot(app['config'].token, session)
    image_model = fit_model(app['config'].sample_df)

    def config_injections(binder):
        # injection bindings
        binder.bind(Config, app['config'])
        binder.bind(TelegramBot, bot)
        # The fitted model is registered under the estimator class as its key.
        binder.bind(GradientBoostingClassifier, image_model)
        binder.bind_to_constructor(AsyncIOMotorDatabase, init_database)

    try:
        inject.configure(config_injections)
    except inject.InjectorException:
        # Logged rather than re-raised, so startup continues with whatever
        # injector configuration is already in place.
        log.error("Injector already configured", exc_info=True)

    setup_logging(log)

    # Scheduled as a task on the loop; not awaited here.
    app.loop.create_task(bot.set_hook())
def test_GradientBoostingClassifier_num(*data):
    '''
    Plot training and testing accuracy against the number of estimators.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    estimator_counts = np.arange(1, 100, step=2)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for count in estimator_counts:
        model = ensemble.GradientBoostingClassifier(n_estimators=count)
        model.fit(X_train, y_train)
        train_curve.append(model.score(X_train, y_train))
        test_curve.append(model.score(X_test, y_test))

    curves = ((train_curve, "Training Score"), (test_curve, "Testing Score"))
    for curve, label in curves:
        axes.plot(estimator_counts, curve, label=label)
    axes.set_xlabel("estimator num")
    axes.set_ylabel("score")
    axes.legend(loc="lower right")
    axes.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
def test_GradientBoostingClassifier_maxdepth(*data):
    '''
    Plot training and testing accuracy against max_depth (1 to 19).

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, 20)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for depth in depths:
        model = ensemble.GradientBoostingClassifier(
            max_depth=depth, max_leaf_nodes=None)
        model.fit(X_train, y_train)
        train_curve.append(model.score(X_train, y_train))
        test_curve.append(model.score(X_test, y_test))

    curves = ((train_curve, "Training Score"), (test_curve, "Testing Score"))
    for curve, label in curves:
        axes.plot(depths, curve, label=label)
    axes.set_xlabel("max_depth")
    axes.set_ylabel("score")
    axes.legend(loc="lower right")
    axes.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
def test_GradientBoostingClassifier_learning(*data):
    '''
    Plot training and testing accuracy against the learning rate.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    rates = np.linspace(0.01, 1.0)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for rate in rates:
        model = ensemble.GradientBoostingClassifier(learning_rate=rate)
        model.fit(X_train, y_train)
        train_curve.append(model.score(X_train, y_train))
        test_curve.append(model.score(X_test, y_test))

    curves = ((train_curve, "Training Score"), (test_curve, "Testing Score"))
    for curve, label in curves:
        axes.plot(rates, curve, label=label)
    axes.set_xlabel("learning_rate")
    axes.set_ylabel("score")
    axes.legend(loc="lower right")
    axes.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
def test_GradientBoostingClassifier_subsample(*data):
    '''
    Plot training and testing accuracy against the subsample fraction.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    subsamples = np.linspace(0.01, 1.0)
    testing_scores = []
    training_scores = []
    for subsample in subsamples:
        clf = ensemble.GradientBoostingClassifier(subsample=subsample)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(subsamples, training_scores, label="Training Score")
    # The held-out curve gets its own legend entry; it used to be labelled
    # "Training Score" as well, making the two curves indistinguishable.
    ax.plot(subsamples, testing_scores, label="Testing Score")
    ax.set_xlabel("subsample")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
def test_friedman_mse_in_graphviz():
    """Check that exported tree nodes are labelled with 'friedman_mse'.

    Exports a DecisionTreeRegressor fitted with criterion="friedman_mse" and
    the first tree of each boosting stage of a GradientBoostingClassifier
    into one DOT buffer, then inspects every bracketed attribute list that
    mentions "samples".
    """
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    # Raw string: "\[" in a plain literal is an invalid escape sequence.
    for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())
def check_classification_toy(presort, loss):
    """Check classification on a toy dataset.

    Args:
        presort: Forwarded to GradientBoostingClassifier(presort=...).
        loss: Forwarded to GradientBoostingClassifier(loss=...).
    """
    n_trees = 10
    model = GradientBoostingClassifier(loss=loss, n_estimators=n_trees,
                                       random_state=1, presort=presort)

    # Predicting before fit must raise.
    assert_raises(ValueError, model.predict, T)

    model.fit(X, y)
    assert_array_equal(model.predict(T), true_result)
    assert_equal(n_trees, len(model.estimators_))

    # At least one boosting step must not increase the training score.
    scores = model.train_score_
    deviance_decrease = scores[:-1] - scores[1:]
    assert_true(np.any(deviance_decrease >= 0.0))

    leaf_indices = model.apply(X)
    assert_equal(leaf_indices.shape, (6, n_trees, 1))
def test_probability_log():
    """Predict probabilities with the default loss and check consistency."""
    model = GradientBoostingClassifier(n_estimators=100, random_state=1)

    # predict_proba before fit must raise.
    assert_raises(ValueError, model.predict_proba, T)

    model.fit(X, y)
    assert_array_equal(model.predict(T), true_result)

    # check if probabilities are in [0, 1].
    proba = model.predict_proba(T)
    assert_true(np.all(proba >= 0.0))
    assert_true(np.all(proba <= 1.0))

    # derive predictions from probabilities
    most_likely = proba.argmax(axis=1)
    derived = model.classes_.take(most_likely, axis=0)
    assert_array_equal(derived, true_result)
def test_check_inputs_predict():
    """predict must raise ValueError for inputs of the wrong shape.

    The same three malformed inputs are tried against a fitted classifier
    and then against a fitted regressor.
    """
    # X has wrong shape
    malformed_inputs = [
        np.array([1.0, 2.0])[:, np.newaxis],
        np.array([[]]),
        np.array([1.0, 2.0, 3.0])[:, np.newaxis],
    ]

    classifier = GradientBoostingClassifier(n_estimators=100, random_state=1)
    classifier.fit(X, y)
    for bad in malformed_inputs:
        assert_raises(ValueError, classifier.predict, bad)

    regressor = GradientBoostingRegressor(n_estimators=100, random_state=1)
    regressor.fit(X, rng.rand(len(X)))
    for bad in malformed_inputs:
        assert_raises(ValueError, regressor.predict, bad)
def test_staged_functions_defensive():
    """Test that the staged_* functions make defensive copies.

    Overwriting one yielded stage in place must leave the others intact.
    """
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(10, 3))
    # Builtin int: np.int was only an alias for it, deprecated in NumPy 1.20
    # and removed in 1.24.
    y = (4 * X[:, 0]).astype(int) + 1  # don't predict zeros
    for estimator in [GradientBoostingRegressor(),
                      GradientBoostingClassifier()]:
        estimator.fit(X, y)
        for func in ['predict', 'decision_function', 'predict_proba']:
            staged_func = getattr(estimator, "staged_" + func, None)
            if staged_func is None:
                # regressor has no staged_predict_proba
                continue
            with warnings.catch_warnings(record=True):
                staged_result = list(staged_func(X))
            # Zero the second stage; the first must remain non-zero.
            staged_result[1][:] = 0
            assert_true(np.all(staged_result[0] != 0))
def test_serialization():
    """Check that a fitted model survives a pickle round trip."""
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    n_trees = 100

    def verify(model):
        # Same predictions and ensemble size before and after the round trip.
        assert_array_equal(model.predict(T), true_result)
        assert_equal(n_trees, len(model.estimators_))

    fitted = GradientBoostingClassifier(n_estimators=n_trees, random_state=1)
    fitted.fit(X, y)
    verify(fitted)

    payload = pickle.dumps(fitted, protocol=pickle.HIGHEST_PROTOCOL)
    fitted = None
    restored = pickle.loads(payload)
    verify(restored)
def test_more_verbose_output():
    """Check that verbose=2 does not cause an error and logs every iteration.

    stdout is captured while fitting; the captured text must start with the
    expected header and contain one further line per estimator.
    """
    from sklearn.externals.six.moves import cStringIO as StringIO
    import sys

    old_stdout = sys.stdout
    verbose_output = StringIO()
    sys.stdout = verbose_output
    try:
        clf = GradientBoostingClassifier(n_estimators=100, random_state=1,
                                         verbose=2)
        clf.fit(X, y)
    finally:
        # Always restore stdout, otherwise a failing fit would leave the
        # rest of the test run printing into the buffer.
        sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # no OOB
    true_header = ' '.join(['%10s'] + ['%16s'] * 2) % (
        'Iter', 'Train Loss', 'Remaining Time')
    assert_equal(true_header, header)

    n_lines = sum(1 for l in verbose_output.readlines())
    # 100 lines for n_estimators==100
    assert_equal(100, n_lines)
def test_warm_start_oob():
    """Test that warm-start OOB improvements equal those of a single fit.

    A model fitted with 200 estimators in one go is compared with one
    fitted with 100 estimators and then grown to 200 via warm_start.
    """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    shared = dict(max_depth=1, subsample=0.5, random_state=1)

    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        one_shot = Cls(n_estimators=200, **shared)
        one_shot.fit(X, y)

        resumed = Cls(n_estimators=100, warm_start=True, **shared)
        resumed.fit(X, y)
        resumed.set_params(n_estimators=200)
        resumed.fit(X, y)

        assert_array_almost_equal(resumed.oob_improvement_[:100],
                                  one_shot.oob_improvement_[:100])
def test_probability_exponential():
    """Predict probabilities with the exponential loss and check consistency."""
    model = GradientBoostingClassifier(loss='exponential',
                                       n_estimators=100, random_state=1)

    # predict_proba before fit must raise.
    assert_raises(ValueError, model.predict_proba, T)

    model.fit(X, y)
    assert_array_equal(model.predict(T), true_result)

    # check if probabilities are in [0, 1].
    proba = model.predict_proba(T)
    assert_true(np.all(proba >= 0.0))
    assert_true(np.all(proba <= 1.0))

    # Positive-class probability must equal sigmoid(2 * decision score).
    raw_score = model.decision_function(T).ravel()
    expected_positive = 1.0 / (1.0 + np.exp(-2 * raw_score))
    assert_array_almost_equal(proba[:, 1], expected_positive)

    # derive predictions from probabilities
    most_likely = proba.argmax(axis=1)
    derived = model.classes_.take(most_likely, axis=0)
    assert_array_equal(derived, true_result)
def test_partial_dependence_classifier():
    """Test partial dependence for a classifier, with and without a grid."""
    model = GradientBoostingClassifier(n_estimators=10, random_state=1)
    model.fit(X, y)

    pdp, axes = partial_dependence(model, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
    assert pdp.shape == (1, 4)
    assert axes[0].shape[0] == 4

    # now with our own grid
    first_column = np.asarray(X)[:, 0]
    own_grid = np.unique(first_column)
    pdp_2, axes = partial_dependence(model, [0], grid=own_grid)

    assert axes is None
    assert_array_equal(pdp, pdp_2)
def __init__(self, info, verbose=True, debug_mode=False):
    """Pick and configure a model from the dataset description ``info``.

    Sets ``self.name``, ``self.model`` and ``self.predict_method``:

    * ``debug_mode >= 2``: RandomPredictor
    * regression: BaggingRegressor over Ridge when the data is sparse,
      otherwise a warm-started GradientBoostingRegressor
    * classification: RandomForestClassifier when categorical variables
      are present, BaggingClassifier over BernoulliNB when the data is
      sparse, otherwise a warm-started GradientBoostingClassifier; wrapped
      in MultiLabelEnsemble for 'multilabel.classification'

    Args:
        info: Mapping with the keys 'label_num', 'target_num', 'task',
            'metric', 'is_sparse' and (for classification) 'has_categorical'.
        verbose: Verbosity forwarded to the underlying estimator.
        debug_mode: Values >= 2 select the random predictor.
    """
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    # To calibrate proba (balance=True is the other variant tried here).
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)

    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return

    if info['task'] == 'regression':
        if info['is_sparse'] == True:
            self.name = "BaggingRidgeRegressor"
            # unfortunately, no warm start...
            self.model = BaggingRegressor(
                base_estimator=Ridge(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(
                n_estimators=1, max_depth=4, min_samples_split=14,
                verbose=verbose, warm_start=True)
        # Regressors expose plain predictions only.
        self.predict_method = self.model.predict
    else:
        if info['has_categorical']:
            # Categorical variables are not converted here.
            self.name = "RandomForestClassifier"
            # unfortunately, no warm start...
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            # unfortunately, no warm start...
            self.model = BaggingClassifier(
                base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingClassifier"
            # Direct construction instead of eval() on a string assembled
            # from self.name and str(verbose): same estimator, no dynamic
            # code evaluation.
            self.model = GradientBoostingClassifier(
                n_estimators=1, verbose=verbose, random_state=1,
                warm_start=True)
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        self.predict_method = self.model.predict_proba
def try_params(n_iterations, params):
    """Train and evaluate one configuration at the given budget.

    The number of trees is ``n_iterations * trees_per_iteration`` rounded to
    the nearest integer.

    Args:
        n_iterations: Budget multiplier for the number of trees.
        params: Keyword arguments forwarded to the ``GB`` estimator.

    Returns:
        Whatever ``train_and_eval_sklearn_classifier`` returns.
    """
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators: " + str(n_estimators))
    pprint(params)

    candidate = GB(n_estimators=n_estimators, verbose=0, **params)
    return train_and_eval_sklearn_classifier(candidate, data)
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Fit the configured classifier(s) and predict the test set.

    Features and ground truths come from ``data`` when it is given,
    otherwise they are read from the ``train`` and ``test`` files.

    Args:
        train: Path of the training file (used when ``data`` is None).
        test: Path of the test file (used when ``data`` is None).
        data: Optional dict with the keys "train_features",
            "train_groundtruths", "test_features" and "test_groundtruths".
        res_dir: Result directory; created if needed.
        disp: Not used by this function.
        outfilename: Not used by this function.

    Returns:
        The predictions of the last classifier evaluated.
    """
    utils.print_success("Comparison of differents classifiers")

    if data is None:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    else:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]

    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)

    # Only the random forest is enabled. Disabled candidates:
    #   RandomForestClassifier(n_estimators=5), KNeighborsClassifier(3),
    #   GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    #   DecisionTreeClassifier(max_depth=5), MLPClassifier(),
    #   AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis(),
    #   SVC(kernel="linear", C=0.025), GradientBoostingClassifier(),
    #   ExtraTreesClassifier(), LogisticRegression(),
    #   LinearDiscriminantAnalysis()
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1),
    }

    for label, model in classifiers.items():
        utils.print_success(label)
        utils.print_info("\tFit")
        model.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = model.predict(test_features)
    return predictions
def constructModel(corpus, classList, features, modelOutput):
    """
    Trains a bagging classifier on a 70/30 train/test split of the corpus.

    Args:
        corpus: A list of lists, containing the GC content, coverage, and
            class number. Sorted in place.
        classList: A list of class names.
        features: List of variables used by each contig.
        modelOutput: Location to save model as GraphViz DOT, or False to
            save no model.
    Returns:
        classifier: The fitted ensemble.BaggingClassifier.
    """
    corpus.sort()  # just in case

    # The class number is the last item of every row; the rest are features.
    X = [row[:-1] for row in corpus]
    Y = [row[-1] for row in corpus]

    X_train, X_test, Y_train, Y_test = mscv.train_test_split(
        X, Y, test_size=0.3, random_state=0)

    # TODO: implement classifier testing and comparison, now only
    # baggingClassifier is used as per paper. Candidates still to compare:
    # tree.DecisionTreeClassifier(),
    # ensemble.RandomForestClassifier(n_estimators=10),
    # ensemble.AdaBoostClassifier(n_estimators=100),
    # ensemble.GradientBoostingClassifier(n_estimators=100).
    bagger = ensemble.BaggingClassifier()
    bagger = bagger.fit(X_train, Y_train)
    accuracy = bagger.score(X_test, Y_test)
    click.echo("Bagging classifier built, score is %s out of 1.00" % accuracy)

    if modelOutput:
        # NOTE(review): export_graphviz is documented for decision tree
        # estimators; confirm it accepts a BaggingClassifier.
        with open(modelOutput, 'w') as dotfile:
            tree.export_graphviz(bagger, out_file=dotfile,
                                 feature_names=features,
                                 class_names=classList,
                                 filled=True, rounded=True,
                                 special_characters=True)
    return bagger
def score(self, estimator, X, y, advanced_scoring=False):
    """Score ``estimator`` on ``(X, y)`` and return the negated score.

    Rows with missing y values are dropped first. If the scoring function
    raises ValueError, rows whose y value is listed in
    ``bad_vals_as_strings`` are removed and scoring is retried; if it fails
    again, the probabilities are passed through ``self.clean_probas``.

    Args:
        estimator: Fitted model exposing ``predict_proba``.
        X: Feature matrix.
        y: True labels.
        advanced_scoring: When true, the predictions are returned as well.

    Returns:
        ``-1 * score``, or the tuple ``(-1 * score, predictions)`` when
        ``advanced_scoring`` is true.
    """
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    # NOTE(review): presumably X is a scipy sparse matrix that this
    # estimator cannot consume directly - confirm against callers.
    if isinstance(estimator, GradientBoostingClassifier):
        X = X.toarray()

    predictions = estimator.predict_proba(X)

    if self.scoring_method == 'brier_score_loss':
        # Some estimators (the original note names Microsoft's LightGBM) can
        # return probabilities outside [0, 1], which breaks some scoring
        # functions. Keep only column 1 of each row, clamped into [0, 1].
        probas = [max(min(row[1], 1), 0) for row in predictions]
        predictions = probas

    try:
        score = self.scoring_func(y, predictions)
    except ValueError as e:
        # Collect the positions whose y value is one of the known bad values.
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        # Drop those positions from predictions and y alike.
        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
        try:
            score = self.scoring_func(y, predictions)
        except ValueError:
            # Sometimes, particularly for a badly fit model using either too
            # little data, or a really bad set of hyperparameters during a
            # grid search, we can predict probas that are > 1 or < 0. Those
            # are cleaned here before one last scoring attempt.
            predictions = self.clean_probas(predictions)
            score = self.scoring_func(y, predictions)

    # The sign of the score is flipped before it is returned.
    if advanced_scoring:
        return (-1 * score, predictions)
    else:
        return -1 * score
def get_classification():
    """Return a new, unfitted gradient boosting classifier."""
    # No SVC is built first any more: that instance was overwritten before
    # it could ever be used or returned.
    return ensemble.GradientBoostingClassifier()
def __init__(self, nr_events, case_id_col, label_col, encoder_kwargs, cls_kwargs, cls_method="rf"):
    """Create the sequence encoder and the classifier.

    Args:
        nr_events: Forwarded to SequenceEncoder.
        case_id_col: Name of the case identifier column.
        label_col: Name of the label column.
        encoder_kwargs: Extra keyword arguments for SequenceEncoder.
        cls_kwargs: Keyword arguments for the classifier constructor.
        cls_method: "gbm" for gradient boosting or "rf" for random forest.
            Any other value only prints a message and leaves ``self.cls``
            unset.
    """
    self.case_id_col = case_id_col
    self.label_col = label_col
    self.encoder = SequenceEncoder(
        nr_events=nr_events,
        case_id_col=case_id_col,
        label_col=label_col,
        **encoder_kwargs)

    known_methods = (
        ("gbm", GradientBoostingClassifier),
        ("rf", RandomForestClassifier),
    )
    for method, estimator_cls in known_methods:
        if cls_method == method:
            self.cls = estimator_cls(**cls_kwargs)
            break
    else:
        print("Classifier method not known")
def GBDT_classify(train_dataSet_path, test_dataSet_path, train_one_and_two_result_as_proba_path):
    """Train a gradient boosting model and write test probabilities to a file.

    In both CSV files the last column is the label and the columns from
    index 2 up to (not including) the last are the features.

    Args:
        train_dataSet_path: Path of the training CSV.
        test_dataSet_path: Path of the test CSV.
        train_one_and_two_result_as_proba_path: Not used by this function.

    Returns:
        The fitted GradientBoostingClassifier.
    """
    # .values is available in every pandas version; as_matrix() was removed.
    train_data = pd.read_csv(train_dataSet_path).values
    X_train = train_data[:, 2:-1]  # columns 2 .. second-to-last
    y_train = train_data[:, -1]    # last column

    test_data = pd.read_csv(test_dataSet_path).values
    X_test = test_data[:, 2:-1]    # columns 2 .. second-to-last
    y_test = test_data[:, -1]      # last column

    clf = GradientBoostingClassifier(n_estimators=200)
    clf.fit(X_train, y_train)

    pre_y_test = clf.predict_proba(X_test)
    print(pre_y_test)

    # Precision/recall/F-score are defined on hard labels; feeding the
    # probability matrix makes the metric raise ValueError.
    pre_labels = clf.predict(X_test)
    print("GBDT Metrics : {0}".format(precision_recall_fscore_support(y_test, pre_labels)))
    print(u'????.....')

    # NOTE(review): test_dataSet_prob_path is not a parameter and must exist
    # at module level; the unused third parameter may have been the intended
    # output path - confirm.
    with open(test_dataSet_prob_path, 'w') as f_result:
        last_index = len(pre_y_test) - 1
        for i, row in enumerate(pre_y_test):
            # Echo the first and last written values as a quick sanity check.
            if i == 0:
                print(str(row[0]))
            if i == last_index:
                print(str(row[0]))
            f_result.write(str(row[0]) + '\n')
    return clf
def performGTBClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    Gradient Tree Boosting binary Classification

    Fits a 100-estimator model on the training data and prints its score on
    the test data. ``fout`` and ``savemodel`` are not used: pickling the
    model to '<fout>-<timestamp>.pickle' is disabled.
    """
    model = GradientBoostingClassifier(n_estimators=100)
    model.fit(X_train, y_train)

    test_accuracy = model.score(X_test, y_test)
    print("GTBClass:  " + str(test_accuracy))
def get_data_preprocessor_balancing(params, y):
    """Fill in the class/sample weighting entries of ``params``.

    ``params['balancing']`` selects the strategy, either by name ('None' /
    'weighting') or by the string form of the id stored for that name in
    ``params['layer_dict_list'][1]``.

    * 'None': ``class_weight`` and ``sample_weight`` are set to None.
    * 'weighting': ``class_weight`` is set to 'auto' and ``sample_weight``
      to one weight per sample, inversely proportional to the frequency of
      the sample's class and scaled so the class weights average to 1.
      Rows of a 2-D ``y`` are first collapsed into one integer code each.
    * anything else: ``params`` is returned unchanged.

    Args:
        params: Configuration dict; modified in place.
        y: NumPy array of targets, 1-D or 2-D.

    Returns:
        The same ``params`` dict.
    """
    strategy_ids = params['layer_dict_list'][1]
    chosen = params['balancing']

    if chosen in (str(strategy_ids['None']), 'None'):
        # class_weight is read by estimators that accept it at construction,
        # sample_weight by those weighted at fit time (e.g. AdaBoost,
        # GradientBoosting).
        params['class_weight'] = None
        params['sample_weight'] = None
        return params

    if chosen not in (str(strategy_ids['weighting']), 'weighting'):
        return params

    params['class_weight'] = 'auto'

    if len(y.shape) > 1:
        # Encode every row as a single integer: column i contributes 2 ** i.
        powers = [2 ** i for i in range(y.shape[1])]
        labels = np.sum(y * powers, axis=1)
    else:
        labels = y

    classes, counts = np.unique(labels, return_counts=True)
    class_weights = 1. / counts
    class_weights = class_weights / np.mean(class_weights)

    sample_weight = np.ones(labels.shape)
    for value, weight in zip(classes, class_weights):
        sample_weight[labels == value] *= weight
    params['sample_weight'] = sample_weight
    return params
def model_fitting(train_set, train_labels, classifier_name, n_jobs=cpu_count()):
    """
    Fit one of the preconfigured classifiers on the training data.

    :param train_set: numpy array, required
    :param train_labels: list, required
    :param classifier_name: string, required - one of "svm_linear",
        "svm_poly", "svm_rbf", "linear_svc", "knn", "random_forests",
        "logistic_regression", "decision_trees", "sgd", "neural_network",
        "GBC", "XGB"
    :param n_jobs: integer, required - parallelism passed to the estimators
        that accept it
    :return: object - the result of calling ``fit`` on the selected classifier
    """
    # Four sigmoid layers of decreasing width followed by a softmax output.
    network_layers = [Layer("Sigmoid", units=width) for width in (14, 13, 12, 10)]
    network_layers.append(Layer("Softmax"))

    candidates = {
        "svm_linear": SVC(probability=True, kernel='linear', C=1.0),
        "svm_poly": SVC(probability=True, kernel='poly', C=1.0),
        "svm_rbf": SVC(probability=True, kernel='rbf', C=1.0, gamma=0.01),
        "linear_svc": LinearSVC(
            penalty='l2', loss='squared_hinge', dual=True, tol=0.1, C=1.0,
            multi_class='ovr', fit_intercept=True, intercept_scaling=1,
            random_state=None, max_iter=3000),
        "knn": KNeighborsClassifier(
            n_neighbors=100, weights='distance', leaf_size=30, n_jobs=n_jobs),
        "random_forests": RandomForestClassifier(
            n_estimators=350, criterion='entropy', min_samples_split=2,
            min_samples_leaf=1, max_leaf_nodes=600, n_jobs=n_jobs),
        "logistic_regression": LogisticRegression(
            penalty='l2', dual=False, tol=0.0001, C=2.4, fit_intercept=True,
            intercept_scaling=1, random_state=None, solver='liblinear',
            max_iter=1000, multi_class='ovr', warm_start=False,
            n_jobs=n_jobs),
        "decision_trees": DecisionTreeClassifier(
            criterion='gini', splitter='best', max_depth=None,
            min_samples_split=2, min_samples_leaf=100,
            min_weight_fraction_leaf=0.0, max_features=None,
            random_state=None, max_leaf_nodes=None, presort=False),
        "sgd": SGDClassifier(
            alpha=.0001, n_iter=500, penalty="elasticnet", n_jobs=n_jobs),
        "neural_network": Classifier(
            layers=network_layers, learning_rate=0.01, n_iter=200,
            batch_size=10, regularize='L1', n_stable=50, dropout_rate=0,
            verbose=True),
        "GBC": GradientBoostingClassifier(
            max_depth=10, max_leaf_nodes=850, min_samples_leaf=15,
            learning_rate=0.1),
        "XGB": XGBClassifier(
            base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
            learning_rate=0.1, max_delta_step=0, max_depth=10,
            min_child_weight=2, missing=None, n_estimators=100,
            nthread=n_jobs, reg_alpha=0, objective='binary:logistic',
            reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
            subsample=1),
    }

    selected = candidates[classifier_name]
    return selected.fit(train_set, train_labels)
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.

    The Boston target is binarised: a sample is positive when its target is
    above the mean target.
    """
    from sklearn.datasets import load_boston

    boston = load_boston()
    prices = boston['target']
    above_mean = prices > prices.mean()

    model = GradientBoostingClassifier(random_state=1)
    model.fit(boston['data'], above_mean)

    # Save the data and the model
    self.scikit_data = boston
    self.scikit_model = model
def test_conversion_bad_inputs(self):
    """The converter must raise for models it cannot convert.

    Two cases: an untrained GradientBoostingClassifier, and an estimator of
    an unexpected class (OneHotEncoder).
    """
    from sklearn.preprocessing import OneHotEncoder

    for rejected_cls in (GradientBoostingClassifier, OneHotEncoder):
        with self.assertRaises(Exception):
            skl_converter.convert(rejected_cls(), 'data', 'out')
def fitAndPredict(self):
    """Fit an SVC on the training set and predict a label per test user.

    Returns:
        Dict mapping each key of ``self.testDict`` to the classifier's
        prediction for that user's vector in ``self.model.docvecs``.
    """
    # Alternatives evaluated here in the past (each reported through
    # classification_report): LogisticRegression,
    # GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    # max_depth=1, random_state=0), AdaBoostClassifier(n_estimators=100),
    # RandomForestClassifier(n_estimators=10).
    svc = SVC()
    svc.fit(self.trainingSet, self.trainingLabel)

    # One single-sample predict call per user.
    return {
        user: svc.predict([self.model.docvecs[user]])
        for user in self.testDict
    }
def makEnsemble(X, xlist, Y):
    """Fit the ensemble members and append them to the module-level lists.

    Three models are trained on ``xlist`` and appended to
    ``featureSelectModel``; four are trained on ``X`` and appended to
    ``wholeFeatureModel``.

    Args:
        X: Feature matrix used by the tree-based models.
        xlist: Feature matrix used by the other three models.
        Y: Training labels.
    """
    def fit_and_store(estimator, features, bucket):
        estimator.fit(features, Y)
        bucket.append(estimator)

    # naive bayes
    fit_and_store(MultinomialNB(), xlist, featureSelectModel)
    # K nearest neighbours
    fit_and_store(KNeighborsClassifier(), xlist, featureSelectModel)
    # Logistic regression
    fit_and_store(LogisticRegression(C=1), xlist, featureSelectModel)

    # random forest
    fit_and_store(RandomForestClassifier(n_estimators=400), X, wholeFeatureModel)
    # extra trees
    fit_and_store(ExtraTreesClassifier(n_estimators=400), X, wholeFeatureModel)
    # single decision tree
    fit_and_store(
        DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0),
        X, wholeFeatureModel)

    # gradient boosting
    boosting_params = {'n_estimators': 500, 'max_depth': 4,
                       'min_samples_split': 1, 'learning_rate': 0.01}
    fit_and_store(GradientBoostingClassifier(**boosting_params), X, wholeFeatureModel)
def gradient_boosting_classifier(train_x, train_y):
    """Return a 200-estimator gradient boosting classifier fitted on the data.

    Args:
        train_x: Training features.
        train_y: Training labels.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    fitted = GradientBoostingClassifier(n_estimators=200)
    fitted.fit(train_x, train_y)
    return fitted
def test_create(self):
    """SigOptSearchCV can be constructed when a client token is supplied."""
    settings = dict(
        estimator=GradientBoostingClassifier,
        param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
        client_token='client_token',
    )
    SigOptSearchCV(**settings)
def test_no_token(self):
    """Constructing SigOptSearchCV without a client token raises ValueError."""
    settings = dict(
        estimator=GradientBoostingClassifier,
        param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
    )
    with pytest.raises(ValueError):
        SigOptSearchCV(**settings)
def test_search(self):
    """Fit SigOptSearchCV on iris and check the calls made to SigOpt.

    NOTE(review): ``sigopt.Connection`` is presumably patched with a mock
    elsewhere (the assertions read ``mock_calls`` / ``call_args``) - confirm.
    """
    conn = sigopt.Connection()

    n_iter = 5
    folds = 3
    cv = SigOptSearchCV(
        estimator=GradientBoostingClassifier(),
        param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
        client_token='client_token',
        n_iter=n_iter,
        cv=folds
    )

    # Constructing the search object must not call the API yet.
    assert len(conn.experiments().create.mock_calls) == 0
    assert len(conn.experiments().fetch.mock_calls) == 0
    assert len(conn.experiments().suggestions.create.mock_calls) == 0
    assert len(conn.experiments().observations.create.mock_calls) == 0

    data = sklearn.datasets.load_iris()
    cv.fit(data['data'], data['target'])

    # Exactly one experiment is created, matching the expected definition.
    assert len(conn.experiments().create.mock_calls) == 1
    create_definition = conn.experiments().create.call_args[1]
    assert create_definition['name'] == GradientBoostingClassifier_EXPERIMENT_DEF['name']

    assert len(create_definition['parameters']) == len(GradientBoostingClassifier_EXPERIMENT_DEF['parameters'])
    for p in GradientBoostingClassifier_EXPERIMENT_DEF['parameters']:
        assert p in create_definition['parameters']

    # One suggestion and one observation per (iteration, fold) pair, and a
    # single fetch of the best assignments.
    assert len(conn.experiments().best_assignments().fetch.mock_calls) == 1
    assert len(conn.experiments().suggestions().create.mock_calls) == n_iter * folds
    assert len(conn.experiments().observations().create.mock_calls) == n_iter * folds

    assert cv.best_params_ == zero_corner(GradientBoostingClassifier_EXPERIMENT_DEF)