The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.ensemble.GradientBoostingRegressor().
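Before the project-specific examples, a minimal, self-contained sketch of the basic workflow may help orient readers: fit the regressor on training data, then evaluate the R² score on held-out data. The dataset and every parameter value below are illustrative assumptions, not recommendations taken from any of the projects quoted here.

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Synthetic regression data (illustrative only)
X, y = make_friedman1(n_samples=1000, noise=1.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a gradient-boosted tree ensemble and report R^2 on the held-out split
reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=3, random_state=0)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))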
def cross_validate_best_known():
    '''
    Import and clean the tractor data, then do a cross validation on each of
    the three models we are training here: a RandomForest, a GradientBoost,
    and an AdaBoost backed by a DecisionTree. Print the scores.

    The parameters we're using here are the "best" that we've found so far
    using a grid search.
    '''
    tractor_data = pd.read_csv('data/train.csv')
    tractor_data = cln.clean_all(tractor_data)
    X = tractor_data
    y = tractor_data.pop('SalePrice')

    rf = RandomForestRegressor(max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
    gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50,
                                   max_features='log2', min_samples_split=2, max_depth=1)
    ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random',
                                             min_samples_split=4, max_depth=3)
    ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)

    validate.cross_v_scores([rf, gb, ab], X, y)
    # RandomForestRegressor -- RMLSE: -0.596797712098, R2: 0.0272065373946
    # GradientBoostingRegressor -- RMLSE: -0.996134592541, R2: -2.37202164829
    # AdaBoostRegressor -- RMLSE: -0.706385708459, R2: -0.103966980393
def model_gradient_boosting_tree(Xtrain, Xtest, ytrain):
    X_train = Xtrain
    y_train = ytrain

    gbr = GradientBoostingRegressor(random_state=0)
    param_grid = {
        'n_estimators': [800, 1500],
        'max_features': [20, 15],
        'max_depth': [8, 10],
        'learning_rate': [0.1],
        'subsample': [1]
    }
    model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)

    print('Gradient boosted tree regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_


# read data, build model and do prediction
def unscaled_pipelines():
    # Random forest parameters
    random_forest_kwargs = {
        'n_estimators': 10,
        'criterion': 'mse',
        'random_state': _RANDOM_STATE,
        'n_jobs': cpu_count(),
        'verbose': True,
    }
    # Gradient boosting parameters
    gradient_boost_kwargs = {
        'random_state': _RANDOM_STATE,
        'verbose': 1,
    }
    models = [
        DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
        # RandomForestRegressor(**random_forest_kwargs),
        # GradientBoostingRegressor(**gradient_boost_kwargs),
    ]

    pipelines = []
    for m in models:
        # Steps
        pipelines.append(make_pipeline(m))

    return pipelines
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)

    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling': [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling': [SVR()],
           'No Preprocessing': [RandomForestRegressor(random_state=SEED),
                                GradientBoostingRegressor()]}

    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
def test_gbrt_base_estimator():
    rng = np.random.RandomState(1)
    N = 10000
    X = np.ones((N, 1))
    y = rng.normal(size=N)

    base = RandomForestRegressor()
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    assert_raise_message(ValueError, 'type GradientBoostingRegressor',
                         rgr.fit, X, y)

    base = GradientBoostingRegressor()
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    assert_raise_message(ValueError, 'quantile loss', rgr.fit, X, y)

    base = GradientBoostingRegressor(loss='quantile', n_estimators=20)
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    rgr.fit(X, y)

    estimates = rgr.predict(X, return_quantiles=True)
    assert_almost_equal(stats.norm.ppf(rgr.quantiles),
                        np.mean(estimates, axis=0),
                        decimal=2)
def fs_boruta(df):
    # do feature selection using boruta
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']
    model = GradientBoostingRegressor()
    feat_selector = boruta_py.BorutaPy(model, n_estimators=100, verbose=12)
    # find all relevant features
    feat_selector.fit_transform(X.as_matrix(), y.as_matrix())
    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    #print(result)
    # check ranking of features
    features_rank = feat_selector.ranking_
    #print(features_rank)
    rank = features_rank[features_bool]
    #print(rank)
    return result
def model_cross_valid(X, Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)

    def bulid_model(model_name):
        model = model_name()
        return model

    scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt
    for model_name in [LinearRegression, ElasticNet]:
    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
        model = bulid_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def __init__(self, nr_events, case_id_col, encoder_kwargs, cls_kwargs, cls_method="rf"):
    self.case_id_col = case_id_col
    self.nr_events = nr_events

    self.encoder = SequenceEncoder(nr_events=nr_events, case_id_col=case_id_col, **encoder_kwargs)

    if cls_method == "gbm":
        self.cls = GradientBoostingRegressor(**cls_kwargs)
    elif cls_method == "rf":
        self.cls = RandomForestRegressor(**cls_kwargs)
    else:
        print("Classifier method not known")
def grid_search(X, y, split, learn=[.01], samples_leaf=[250, 350, 500], depth=[10, 15]):
    '''
    Runs a grid search for GBM on split data
    '''
    for l in learn:
        for s in samples_leaf:
            for d in depth:
                model = GradientBoostingRegressor(n_estimators=250,
                                                  learning_rate=l,
                                                  min_samples_leaf=s,
                                                  max_depth=d,
                                                  random_state=42)
                model.fit(X.values[:split], y.values[:split])
                in_score = model.score(X.values[:split], y.values[:split])
                out_score = model.score(X.values[split:], y.values[split:])

                print 'learning_rate: {}, min_samples_leaf: {}, max_depth: {}'.\
                    format(l, s, d)
                print 'in-sample score:', in_score
                print 'out-sample score:', out_score
                print ''
def __init__(self, q1=.16, q2=.84, **params):
    """
    Gradient boosted trees as surrogate model for Bayesian Optimization.
    Uses quantile regression for an estimate of the 'posterior' variance.
    In practice, the std is computed as (`q2` - `q1`) / 2.
    Relies on `sklearn.ensemble.GradientBoostingRegressor`

    Parameters
    ----------
    q1: float
        First quantile.
    q2: float
        Second quantile.
    params: tuple
        Extra parameters to pass to `GradientBoostingRegressor`.
    """
    self.params = params
    self.q1 = q1
    self.q2 = q2
    self.eps = 1e-1
def fit(self, X, y):
    """
    Fit a GBM model to data `X` and targets `y`.

    Parameters
    ----------
    X : array-like
        Input values.
    y: array-like
        Target values.
    """
    self.X = X
    self.y = y
    self.n = self.X.shape[0]

    self.modq1 = GradientBoostingRegressor(loss='quantile', alpha=self.q1, **self.params)
    self.modq2 = GradientBoostingRegressor(loss='quantile', alpha=self.q2, **self.params)
    self.mod = GradientBoostingRegressor(loss='ls', **self.params)

    self.modq1.fit(self.X, self.y)
    self.modq2.fit(self.X, self.y)
    self.mod.fit(self.X, self.y)
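The surrogate's own predict method is not included in this listing. The following is only a sketch of how the mean and the 'posterior' std described in the docstring above could be recovered from the three fitted models; the method name, signature, and clipping behaviour are assumptions, not the original code.

def predict(self, X_new):
    # Sketch only (assumes numpy imported as np): mean from the squared-error
    # model, std estimated as (q2 prediction - q1 prediction) / 2,
    # floored at self.eps to avoid a degenerate zero variance.
    mean = self.mod.predict(X_new)
    std = (self.modq2.predict(X_new) - self.modq1.predict(X_new)) / 2.0
    std = np.clip(std, self.eps, None)
    return mean, std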
def test_boston_OHE_plus_trees(self):
    data = load_boston()

    pl = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Trees", GradientBoostingRegressor(random_state=1))])

    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, 'target')

    # Get predictions
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['prediction'] = pl.predict(data.data)

    # Evaluate it
    result = evaluate_regressor(spec, df, 'target', verbose=False)

    assert result["max_error"] < 0.0001
def gbdt_select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [100, 120], 'max_depth': [4, 5, 6]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def test():
    iris = load_iris()
    #print iris
    #print iris['target'].shape
    gbdt = GradientBoostingRegressor(n_estimators=1000, max_depth=4)
    gbdt.fit(iris.data[:120], iris.target[:120])

    # Save GBDT Model
    joblib.dump(gbdt, 'GBDT.model')

    predict = gbdt.predict(iris.data[:120])
    total_err = 0
    for i in range(len(predict)):
        print predict[i], iris.target[i]
        err = predict[i] - iris.target[i]
        total_err += err * err
    print 'Training Error: %f' % (total_err / len(predict))

    pred = gbdt.predict(iris.data[120:])
    error = 0
    for i in range(len(pred)):
        print pred[i], iris.target[i + 120]
        err = pred[i] - iris.target[i + 120]
        error += err * err
    print 'Test Error: %f' % (error / len(pred))
def select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Select Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [10000, 12000], 'max_depth': [16, 15, 14]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def generate_GBDT_model(file_name):
    train_df = read_from_file(file_name)
    # feature 18
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.as_matrix()
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print 'Train Gradient Boosting Regression Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor(n_estimators=120, max_depth=10)  #, class_weight='balanced')
    gbdt.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: '
    print (end_time - start_time).seconds

    print 'Save Model...'
    joblib.dump(gbdt, 'GBDT.model')
    return gbdt
def GDBT_regression(X=train_df_munged, Y=label_df['SalePrice']):
    est = GradientBoostingRegressor(n_estimators=50, max_depth=3, learning_rate=0.1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    est.fit(X_train, Y_train)
    y_train_pred = est.predict(X_test)

    plt.scatter(y_train_pred, y_train_pred - Y_test, c='blue', marker='s', label='error on training data')
    plt.title("Linear regression with GDBT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(Y_test, y_train_pred, c="blue", marker="s", label="Training data")
    plt.title("Linear regression with GDBT")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    print('rmse value:', rmse(Y_test, y_train_pred))
    return est
def train_model(self, train_file_path, model_path):
    print("==> Load the data ...")
    X_train, Y_train = self.load_file(train_file_path)
    print(train_file_path, shape(X_train))

    print("==> Train the model ...")
    min_max_scaler = preprocessing.MaxAbsScaler()
    X_train_minmax = min_max_scaler.fit_transform(X_train)
    clf = GradientBoostingRegressor(n_estimators=self.n_estimators)
    clf.fit(X_train_minmax.toarray(), Y_train)

    print("==> Save the model ...")
    pickle.dump(clf, open(model_path, 'wb'))

    scaler_path = model_path.replace('.pkl', '.scaler.pkl')
    pickle.dump(min_max_scaler, open(scaler_path, 'wb'))
    return clf
def test_GradientBoostingRegressor_num(*data):
    '''
    test the performance with different n_estimators

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    nums = np.arange(1, 200, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    testing_scores = []
    training_scores = []
    for num in nums:
        regr = ensemble.GradientBoostingRegressor(n_estimators=num)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))
    ax.plot(nums, training_scores, label="Training Score")
    ax.plot(nums, testing_scores, label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
def test_GradientBoostingRegressor_maxdepth(*data):
    '''
    test the performance with different max_depth

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    maxdepths = np.arange(1, 20)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    testing_scores = []
    training_scores = []
    for maxdepth in maxdepths:
        regr = ensemble.GradientBoostingRegressor(max_depth=maxdepth, max_leaf_nodes=None)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))
    ax.plot(maxdepths, training_scores, label="Training Score")
    ax.plot(maxdepths, testing_scores, label="Testing Score")
    ax.set_xlabel("max_depth")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1, 1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
def test_GradientBoostingRegressor_learning(*data):
    '''
    test the performance with different learning rate

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    learnings = np.linspace(0.01, 1.0)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    testing_scores = []
    training_scores = []
    for learning in learnings:
        regr = ensemble.GradientBoostingRegressor(learning_rate=learning)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))
    ax.plot(learnings, training_scores, label="Training Score")
    ax.plot(learnings, testing_scores, label="Testing Score")
    ax.set_xlabel("learning_rate")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1, 1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
def test_GradientBoostingRegressor_subsample(*data):
    '''
    test the performance with different subsample

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    subsamples = np.linspace(0.01, 1.0, num=20)
    testing_scores = []
    training_scores = []
    for subsample in subsamples:
        regr = ensemble.GradientBoostingRegressor(subsample=subsample)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))
    ax.plot(subsamples, training_scores, label="Training Score")
    ax.plot(subsamples, testing_scores, label="Testing Score")
    ax.set_xlabel("subsample")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1, 1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
def test_GradientBoostingRegressor_max_features(*data):
    '''
    test the performance with different max_features

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    max_features = np.linspace(0.01, 1.0)
    testing_scores = []
    training_scores = []
    for features in max_features:
        regr = ensemble.GradientBoostingRegressor(max_features=features)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))
    ax.plot(max_features, training_scores, label="Training Score")
    ax.plot(max_features, testing_scores, label="Testing Score")
    ax.set_xlabel("max_features")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
def test_feature_importances():
    X = np.array(boston.data, dtype=np.float32)
    y = np.array(boston.target, dtype=np.float32)

    for presort in True, False:
        clf = GradientBoostingRegressor(n_estimators=100, max_depth=5,
                                        min_samples_split=2, random_state=1,
                                        presort=presort)
        clf.fit(X, y)
        assert_true(hasattr(clf, 'feature_importances_'))

        # XXX: Remove this test in 0.19 after transform support to estimators
        # is removed.
        X_new = assert_warns(
            DeprecationWarning, clf.transform, X, threshold="mean")
        assert_less(X_new.shape[1], X.shape[1])
        feature_mask = (
            clf.feature_importances_ > clf.feature_importances_.mean())
        assert_array_almost_equal(X_new, X[:, feature_mask])
def test_staged_predict():
    # Test whether staged decision function eventually gives
    # the same prediction.
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()
    # test raise ValueError if not fitted
    assert_raises(ValueError, lambda X: np.fromiter(
        clf.staged_predict(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert_equal(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)
def test_staged_functions_defensive():
    # test that staged_functions make defensive copies
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(10, 3))
    y = (4 * X[:, 0]).astype(np.int) + 1  # don't predict zeros
    for estimator in [GradientBoostingRegressor(),
                      GradientBoostingClassifier()]:
        estimator.fit(X, y)
        for func in ['predict', 'decision_function', 'predict_proba']:
            staged_func = getattr(estimator, "staged_" + func, None)
            if staged_func is None:
                # regressor has no staged_predict_proba
                continue
            with warnings.catch_warnings(record=True):
                staged_result = list(staged_func(X))
            staged_result[1][:] = 0
            assert_true(np.all(staged_result[0] != 0))
def test_warm_start_oob():
    # Test if warm start OOB equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=200, max_depth=1, subsample=0.5,
                  random_state=1)
        est.fit(X, y)

        est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                     random_state=1, warm_start=True)
        est_ws.fit(X, y)
        est_ws.set_params(n_estimators=200)
        est_ws.fit(X, y)

        assert_array_almost_equal(est_ws.oob_improvement_[:100],
                                  est.oob_improvement_[:100])
def test_multi_target_sample_weights():
    # weighted regressor
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3.141, 2.718], [2.718, 3.141]]
    w = [2., 1.]
    rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
def __init__(self, info, verbose=True, debug_mode=False):
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    self.postprocessor = None
    #self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True)  # To calibrate proba
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)  # To calibrate proba
    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return
    if info['task'] == 'regression':
        if info['is_sparse'] == True:
            self.name = "BaggingRidgeRegressor"
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
        self.predict_method = self.model.predict  # Always predict probabilities
    else:
        if info['has_categorical']:  # Out of laziness, we do not convert categorical variables...
            self.name = "RandomForestClassifier"
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingClassifier"
            self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        self.predict_method = self.model.predict_proba
def try_params(n_iterations, params):

    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    clf = GB(n_estimators=n_estimators, verbose=0, **params)
    return train_and_eval_sklearn_regressor(clf, data)
def add_new_weak_learner(self):
    '''
    Summary:
        Adds a new function, h, to self.weak_learners by solving for Eq. 1
        using multiple additive regression trees:

        [Eq. 1] h = argmin_h (sum_i Q_A(s_i,a_i) + h(s_i, a_i) - (r_i + max_b Q_A(s'_i, b)))
    '''
    if len(self.most_recent_episode) == 0:
        # If this episode contains no data, don't do anything.
        return

    # Build up data sets of features and loss terms
    data = np.zeros((len(self.most_recent_episode), self.max_state_features + 1))
    total_loss = np.zeros(len(self.most_recent_episode))

    for i, experience in enumerate(self.most_recent_episode):
        # Grab the experience.
        s, a, r, s_prime = experience

        # Pad in case the state features are too short (as in Atari sometimes).
        features = self._pad_features_with_zeros(s, a)
        loss = (r + self.gamma * self.get_max_q_value(s_prime) - self.get_q_value(s, a))

        # Add to relevant lists.
        data[i] = features
        total_loss[i] = loss

    # Compute new regressor and add it to the weak learners.
    estimator = GradientBoostingRegressor(loss='ls', n_estimators=1, max_depth=self.max_depth)
    estimator.fit(data, total_loss)
    self.weak_learners.append(estimator)
def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    if isinstance(estimator, GradientBoostingRegressor):
        X = X.toarray()

    predictions = estimator.predict(X)

    if took_log_of_y:
        for idx, val in enumerate(predictions):
            predictions[idx] = math.exp(val)

    try:
        score = self.scoring_func(y, predictions)
    except ValueError:
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
        score = self.scoring_func(y, predictions)

    if advanced_scoring == True:
        if hasattr(estimator, 'name'):
            print(estimator.name)
        advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)

    return -1 * score
def setClf(self):
    self.clf = GradientBoostingRegressor(n_estimators=100, verbose=100)
    # self.clf = GradientBoostingRegressor(loss='ls', verbose=300, n_estimators=70, learning_rate=0.1, subsample=1.0, max_features=1.0)
    return
def run_batch(batch):
    for num_iters, params in batch:
        max_depth = params['max_depth']
        learning_rate = params['learning_rate']
        num_iters = int(num_iters)
        reg = GradientBoostingRegressor(
            learning_rate=learning_rate,
            max_depth=max_depth,
            n_estimators=num_iters)
        reg.fit(X_train, y_train)
        mse = ((reg.predict(X_test) - y_test) ** 2).mean()
        yield mse
def greedy_elim(df):
    # do feature selection using recursive feature elimination (RFE)
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']
    #model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    # 150 features seems to be the best at the moment. Why this is the case is unclear.
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)
    # find all relevant features
    feat_selector.fit_transform(X.as_matrix(), y.as_matrix())
    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    #print(result)
    # check ranking of features
    features_rank = feat_selector.ranking_
    #print(features_rank)
    rank = features_rank[features_bool]
    #print(rank)
    return result
def convert(model, input_features, output_features):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    decision_tree : GradientBoostingRegressor
        A trained scikit-learn tree model.

    input_feature: [str]
        Name of the input columns.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.GradientBoostingRegressor)

    def is_gbr_model(m):
        if len(m.estimators_) == 0:
            return False
        if hasattr(m, 'estimators_') and m.estimators_ is not None:
            for t in m.estimators_.flatten():
                if not hasattr(t, 'tree_') or t.tree_ is None:
                    return False
            return True
        else:
            return False

    _sklearn_util.check_fitted(model, is_gbr_model)

    base_prediction = model.init_.mean

    return _MLModel(_convert_tree_ensemble(model, input_features, output_features,
                                           base_prediction=base_prediction))
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    def bulid_model(model_name):
        model = model_name()
        return model

    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = bulid_model(model_name)
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        #print resid
        print("Residual sum of squares: %f" % np.mean(resid ** 2))
        #print model.predict(TestX)
        #print TestY
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        #plt.xlim([1, 50])
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1, X2, X3, X4)))
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breushpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)

        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        #print "Lagrange Multiplier Statistics:", ljung_box[0]
        print "Test Autocorrelation P-values:", ljung_box[1]
        if any(ljung_box[1] < 0.05):
            print "The residuals are autocorrelated."
        else:
            print "The residuals are not autocorrelated."
def fit_boosting(X, y, window=100000, estimators=250, learning=.01,
                 samples_leaf=500, depth=20, validate=False):
    '''
    Fits Gradient Boosting
    '''
    model = GradientBoostingRegressor(n_estimators=estimators,
                                      learning_rate=learning,
                                      min_samples_leaf=samples_leaf,
                                      max_depth=depth,
                                      random_state=42)
    if validate:
        return cross_validate(X, y, model, window)
    return model.fit(X, y)
def __init__(self, **kwargs):
    #print("kwargs=", kwargs)
    self.is_boxcox = kwargs.get("is_boxcox", False)
    self.boxcox_lambda = kwargs.get("boxcox_lambda", 0.0)
    self.Model = kwargs.get("model", GradientBoostingRegressor)
    if "is_boxcox" in kwargs:
        kwargs.pop("is_boxcox")
    if "boxcox_lambda" in kwargs:
        kwargs.pop("boxcox_lambda")
    if "model" in kwargs:
        kwargs.pop("model")
    self.clf = self.Model(**kwargs)
def gradient_boost_grid_search():
    gradient_boost_grid = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [.0001, .001, .01, .1, 1],
        'n_estimators': [50, 100, 1000, 10000],
        'max_depth': [1, 3],
        'min_samples_split': [2, 4, 10],
        'max_features': ['sqrt', 'log2'],
    }
    gb = GradientBoostingRegressor()
    return gradient_boost_grid, gb
def setUpClass(cls):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return

    scikit_data = load_boston()
    scikit_model = GradientBoostingRegressor(random_state=1)
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    cls.scikit_data = scikit_data
    cls.scikit_model = scikit_model
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = GradientBoostingRegressor()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = GradientBoostingRegressor()
        spec = xgb_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = xgb_converter.convert(model, 'data', 'out')
def test_model_select_by_param():
    iris = load_iris()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [1000, 5000], 'max_depth': [3, 4]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(iris.data[:150], iris.target[:150])
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    if isinstance(estimator, GradientBoostingRegressor):
        X = X.toarray()

    predictions = estimator.predict(X)

    if took_log_of_y:
        for idx, val in enumerate(predictions):
            predictions[idx] = math.exp(val)

    try:
        score = self.scoring_func(y, predictions)
    except ValueError:
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings or str(predictions[idx]) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the predicted or y values. We will ignore these, and report the score on the rest of the dataset')
        score = self.scoring_func(y, predictions)

    if advanced_scoring == True:
        if hasattr(estimator, 'name'):
            print(estimator.name)
        advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)

    return -1 * score
def regression_with_GBR(X_train, y_train, X_test, y_test, parmsFromNormalization,
                        params={'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
                                'learning_rate': 0.01, 'loss': 'ls'}):
    # GradientBoostingRegressor
    gfr = GradientBoostingRegressor(**params)
    gfr.fit(X_train, y_train)
    y_pred_gbr = gfr.predict(X_test)
    print_regression_model_summary("GBR", y_test, y_pred_gbr, parmsFromNormalization)
    print_feature_importance(X_test, y_test, gfr.feature_importances_)

    # cross validation (not sure this makes sense for regression)
    # http://scikit-learn.org/stable/modules/cross_validation.html
    #gfr = GradientBoostingRegressor(**params)
    #scores = cross_validation.cross_val_score(gfr, X_train, y_train, cv=5)
    #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return y_pred_gbr
def GDBT_regression(X=train_split, Y=y):
    est = GradientBoostingRegressor(n_estimators=75, max_depth=3, learning_rate=0.1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    est.fit(X_train, Y_train)
    y_train_pred = est.predict(X_test)

    plt.scatter(y_train_pred, y_train_pred - Y_test, c='blue', marker='s', label='error on training data')
    plt.title("Linear regression with GDBT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(Y_test, y_train_pred, c="blue", marker="s", label="Training data")
    plt.title("Linear regression with GDBT")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    print('rmse value:', rmsle(Y_test, y_train_pred))
    return est


# linear_regression()
# ridge_regression()
# Lasso_regression()
#model = Elasticnet_regression()

# '''
# predict final result
# '''
#
#
# coefs,lasso = Lasso_regression()
# selected_features = coefs[coefs['value'] != 0].index.values
# train_new = train_split[selected_features]
def gbr(X, y):
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, random_state=0)
    sklearn_boost = GradientBoostingRegressor(random_state=1)
    sklearn_boost.fit(X_train, y_train.ravel())

    print 'training error:', 1.0 - sklearn_boost.score(X_train, y_train)
    print 'validation error:', 1.0 - sklearn_boost.score(X_validation, y_validation)
    time_fit(sklearn_boost, X_train, y_train.ravel())