The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.linear_model.Lasso().
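Before the extracted examples, here is a minimal sketch of the basic Lasso workflow they all build on. The synthetic data and the alpha value are illustrative assumptions only, not taken from any of the projects below:

# Minimal, illustrative Lasso usage (data and alpha are made up for demonstration).
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X = rng.randn(100, 10)                      # 100 samples, 10 features
y = 3.0 * X[:, 0] + 0.1 * rng.randn(100)    # only the first feature is informative

model = Lasso(alpha=0.1)                    # alpha controls the strength of the L1 penalty
model.fit(X, y)

print(model.coef_)                          # the L1 penalty drives most coefficients to exactly 0
print(model.predict(X[:5]))                 # predictions for the first five samples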
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling': [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling': [SVR()],
           'No Preprocessing': [RandomForestRegressor(random_state=SEED),
                                GradientBoostingRegressor()]}

    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
def _cv_r0(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def model_cross_valid(X, Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)

    def build_model(model_name):
        model = model_name()
        return model

    scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt
    for model_name in [LinearRegression, ElasticNet]:
    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def fit_regression(X, y, regression_class=LinearRegression, regularization_const=.001):
    '''
    Given a dataset and some solutions (X, y), a regression class (from scikit-learn),
    and a lambda, which is required if the regression class is Lasso or Ridge.

    X (pandas DataFrame): The data.
    y (pandas DataFrame or Series): The answers.
    regression_class (class): One of sklearn.linear_model.[LinearRegression, Ridge, Lasso]
    regularization_const: the regularization_const value (regularization parameter) for Ridge or Lasso.
                          Called alpha by scikit-learn for interface reasons.

    Return: tuple, (the_fitted_regressor, mean(cross_val_score)).
    '''
    if regression_class is LinearRegression:
        predictor = regression_class()
    else:
        predictor = regression_class(alpha=regularization_const, normalize=True)

    predictor.fit(X, y)

    cross_scores = cross_val_score(predictor, X, y=y, scoring='neg_mean_squared_error')
    cross_scores_corrected = np.sqrt(-1 * cross_scores)  # scikit-learn returns negative values and we need the root

    return (predictor, np.mean(cross_scores_corrected))
def run(self):
    """Run the Lasso prediction workflow."""
    # Fetch training data and fit the model
    X, Y = self._fetch_data()
    clf = self.get_classifier(X, Y)

    # Evaluate on the test data
    X, Y = self._fetch_test_data()
    res = []
    for item in range(11):
        hit_ratio = self.predict(clf, X, Y, item * 0.1)
        res.append([item * 0.1 * 100, hit_ratio * 100])

    # Plot the hit ratio against the tolerance level
    arr = np.array(res)
    plt.plot(arr[:, 0], arr[:, 1])        # line
    plt.plot(arr[:, 0], arr[:, 1], 'ro')  # points
    plt.xlabel('Tolerance (%)')
    plt.ylabel('Hit ratio (%)')
    plt.title('Hit ratio of the Lasso model at different tolerance levels')
    plt.show()
def lasso_regularization(matrix_a, vector_y, lambda_parameter=0):
    """
    Lasso algorithm that solves min ||y - Ax||_2^2 + lambda ||x||_1
    :param matrix_a:
    :param vector_y:
    :param lambda_parameter:
    :return: estimated x
    """
    # convert regularization parameter (sklearn considers a 1/(2m) factor)
    reg_parameter = lambda_parameter / (2 * len(vector_y))

    # initialize model
    clf = linear_model.Lasso(reg_parameter, fit_intercept=False, normalize=False)

    # fit it
    clf.fit(matrix_a, vector_y)

    # return estimate
    x = clf.coef_

    return x
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def mlr_val_vseq_lasso(RM, yE, v_seq, alpha=.5, disp=True, graph=True):
    """
    Validation is performed using the values indexed by v_seq.
    """
    org_seq = list(range(len(yE)))
    t_seq = [x for x in org_seq if x not in v_seq]

    RMt, yEt = RM[t_seq, :], yE[t_seq, 0]
    RMv, yEv = RM[v_seq, :], yE[v_seq, 0]

    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(RMt, yEt)

    if disp:
        print('Training result')
    mlr_show(clf, RMt, yEt, disp=disp, graph=graph)

    if disp:
        print('Validation result')
    r_sqr, RMSE = mlr_show(clf, RMv, yEv, disp=disp, graph=graph)

    #if r_sqr < 0:
    #    print 'v_seq:', v_seq, '--> r_sqr = ', r_sqr

    return r_sqr, RMSE
def cv_SVR(xM, yV, svr_params, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cvLOO(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    n_splits = xM.shape[0]

    # print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = model_selection.KFold(n_splits=n_splits)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def pd_gscv(pdr, method, xM, yV, alphas_log, colname='Predicted-RP',
            fname='sheet/rafa36795_cxcalc_prp1000.csv'):
    """
    This runs a grid search, performs cross-validation for plotting,
    and saves the predicted values.
    """
    print("1. Searching the best hyper-parameter by a grid method.")
    gr = jgrid.gs(method, xM, yV, alphas_log)
    print(gr.grid_scores_)
    print("Best alpha:", gr.best_params_['alpha'])

    print("2. Predicting the property using the best hyper-parameter and show a x-y plot")
    yV_pred = jgrid.cv('Lasso', xM, yV, alpha=gr.best_params_['alpha'], grid_std=gr_beststd(gr))

    print("3. Saving the predicted results in crossvalidation into", fname)
    pdw = pdr.copy()
    pdw[colname] = yV_pred.tolist()
    pdw.to_csv(fname, index=False)

    print("4. Saving the best estimator as a pkl file")
    print(gr.best_estimator_)
    externals.joblib.dump(gr.best_estimator_, fname[:-3] + "pkl")
def cv_SVR(xM, yV, svr_params, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV.A1, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def _cv_r0(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules

    Return
    --------
    yV_pred
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def _cv_LOO_r0(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    n_folds = xM.shape[0]

    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    # print("Note - shuffling is not applied because of LOO.")
    kf_n_c = model_selection.KFold(n_splits=n_folds)
    kf_n = kf_n_c.split(xM)
    yV_pred = model_selection.cross_val_predict(
        clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=shuffle)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def _cv_LOO_r0(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """
    n_folds = xM.shape[0]

    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def test_Lasso(*data):
    '''
    test the relationship between alpha and the sparsity of the solution
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X, y = data
    alphas = np.logspace(-2, 2)
    zeros = []
    for alpha in alphas:
        regr = Lasso(alpha=alpha)
        regr.fit(X, y)
        num = 0
        for ele in regr.coef_:
            if abs(ele) < 1e-5:
                num += 1
        zeros.append(num)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, zeros)
    ax.set_xlabel(r"$\alpha$")
    ax.set_xscale("log")
    ax.set_ylim(0, X.shape[1] + 1)
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In Lasso")
    plt.show()
def test_Lasso_alpha(*data):
    '''
    test the score with different alpha
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    scores = []
    for i, alpha in enumerate(alphas):
        regr = linear_model.Lasso(alpha=alpha)
        regr.fit(X_train, y_train)
        scores.append(regr.score(X_test, y_test))

    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, scores)
    ax.set_xlabel(r"$\alpha$")
    ax.set_ylabel(r"score")
    ax.set_xscale('log')
    ax.set_title("Lasso")
    plt.show()
def test_rank_deficient_design():
    # consistency test that checks that LARS Lasso is handling rank
    # deficient input data (with n_features < rank) in the same way
    # as coordinate descent Lasso
    y = [5, 0, 5]
    for X in ([[5, 0], [0, 5], [10, 10]],
              [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]],
              ):
        # To be able to use the coefs to compute the objective function,
        # we need to turn off normalization
        lars = linear_model.LassoLars(.1, normalize=False)
        coef_lars_ = lars.fit(X, y).coef_
        obj_lars = (1. / (2. * 3.) * linalg.norm(y - np.dot(X, coef_lars_)) ** 2
                    + .1 * linalg.norm(coef_lars_, 1))
        coord_descent = linear_model.Lasso(.1, tol=1e-6, normalize=False)
        coef_cd_ = coord_descent.fit(X, y).coef_
        obj_cd = ((1. / (2. * 3.)) * linalg.norm(y - np.dot(X, coef_cd_)) ** 2
                  + .1 * linalg.norm(coef_cd_, 1))
        assert_less(obj_lars, obj_cd * (1. + 1e-8))
def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when early stopping is used.
    # (test : before, in the middle, and in the last part of the path)
    alphas_min = [10, 0.9, 1e-4]
    for alphas_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                       alpha_min=0.9)
        lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)

    alphas_min = [10, 0.9, 1e-4]
    # same test, with normalization
    for alphas_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
                                                       alpha_min=0.9)
        lasso_cd = linear_model.Lasso(fit_intercept=True, normalize=True,
                                      tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)
def train(self):
    """"""
    start = time.time()

    extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')

    print('size before truncated outliers is %d ' % len(self.TrainData))
    self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) &
                                    (self.TrainData['logerror'] < self._up)]
    #self.TrainData = self.TrainData.join(extra_tr, on='parcelid', how='left')
    self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis=1)], axis=1)
    print('size after truncated outliers is %d ' % len(self.TrainData))

    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)

    lr = Lasso(alpha=self._lr_alpha, max_iter=self._lr_iter, tol=1e-4,
               random_state=2017, selection=self._lr_sel)
    self._model = lr.fit(X, Y)
    end = time.time()

    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    o_file.close()

    #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
    #                           ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def define_model(self):
    #if self.modeltype == "AR":
    #    return statsmodels.tsa.ar_model.AR(max_order=self.parameters['max_order'])
    if self.modeltype == "RandomForest":
        return ensemble.RandomForestRegressor(n_estimators=self.parameters['n_estimators'])
        #return ensemble.RandomForestClassifier(
        #    n_estimators=self.parameters['n_estimators'])
    elif self.modeltype == "LinearRegression":
        return linear_model.LinearRegression()
    elif self.modeltype == "Lasso":
        return linear_model.Lasso(
            alpha=self.parameters['alpha'])
    elif self.modeltype == "ElasticNet":
        return linear_model.ElasticNet(
            alpha=self.parameters['alpha'],
            l1_ratio=self.parameters['l1_ratio'])
    elif self.modeltype == "SVR":
        return SVR(
            C=self.parameters['C'],
            epsilon=self.parameters['epsilon'],
            kernel=self.parameters['kernel'])
    #elif self.modeltype == 'StaticModel':
    #    return StaticModel(
    #        parameters=self.parameters
    #    )
    #elif self.modeltype == 'AdvancedStaticModel':
    #    return AdvancedStaticModel(
    #        parameters=self.parameters
    #    )
    # elif self.modeltype == 'SGDRegressor':
    #     print(self.parameters)
    #     return linear_model.SGDRegressor(
    #         loss=self.parameters['loss'],
    #         penalty=self.parameters['penalty'],
    #         l1_ratio=self.parameters['l1_ratio'])
    else:
        raise ConfigError("Unsupported model {0}".format(self.modeltype))
def lasso_regression_model(parameter_array):
    alpha_value = parameter_array[0]  # alpha value index is first index
    return linear_model.Lasso(alpha=alpha_value, fit_intercept=True, normalize=True,
                              precompute=False, copy_X=True, max_iter=1000, tol=0.0001,
                              warm_start=False, positive=False, random_state=None,
                              selection='cyclic')

#Returns the SVR Linear Kernel model
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    def build_model(model_name):
        model = model_name()
        return model

    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        #print resid
        print("Residual sum of squares: %f" % np.mean(resid ** 2))
        #print model.predict(TestX)
        #print TestY
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        #plt.xlim([1, 50])
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1, X2, X3, X4)))
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breushpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)

        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        #print("Lagrange Multiplier Statistics:", ljung_box[0])
        print("Test Autocorrelation P-values:", ljung_box[1])
        if any(ljung_box[1] < 0.05):
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def main(dataset_size, test_proportion):
    diabetes = load_diabetes()
    X = diabetes.data[:dataset_size]
    y = diabetes.target[:dataset_size]
    fig, ax_list = plt.subplots(3, 1, figsize=(8, 6))
    plot_errors_by_lambda(X, y, test_proportion=test_proportion, regression_class=Ridge, ax=ax_list[0])
    plot_errors_by_lambda(X, y, test_proportion=test_proportion, regression_class=Lasso, ax=ax_list[1])
    plot_errors_by_lambda(X, y, test_proportion=test_proportion, regression_class=LinearRegression, ax=ax_list[2])
    plt.tight_layout()
    plt.show()
def vote_with_lr(conf, forecasts, best_model_index, y_actual):
    start = time.time()
    best_forecast = forecasts[:, best_model_index]
    forecasts = np.sort(np.delete(forecasts, best_model_index, axis=1), axis=1)
    forecasts = np.where(forecasts <= 0, 0.1, forecasts)

    data_train = []

    for i in range(forecasts.shape[0]):
        f_row = forecasts[i, ]
        min_diff_to_best = np.min([cal_rmsle(best_forecast[i], f) for f in f_row])
        comb = list(itertools.combinations(f_row, 2))
        avg_error = scipy.stats.hmean([cal_rmsle(x, y) for (x, y) in comb])
        data_train.append([min_diff_to_best, avg_error, scipy.stats.hmean(f_row),
                           np.median(f_row), np.std(f_row)])

    X_all = np.column_stack([np.row_stack(data_train), best_forecast])
    if conf.target_as_log:
        y_actual = transfrom_to_log(y_actual)
    # we use 10% full data to train the ensamble and 30% for evalaution
    no_of_training_instances = int(round(len(y_actual) * 0.25))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual[no_of_training_instances:]

    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    print_time_took(start, "vote_with_lr")
    return lr_forcast_revered
def get_models4ensamble(conf):
    models = []
    #models = [RFRModel(conf), DLModel(conf), LRModel(conf)]
    #models = [LRModel(conf)]
    # see http://scikit-learn.org/stable/modules/linear_model.html

    #0 was too big to run with depth set to 1, and 1 was overfitting a bit
    if conf.command == 1:
        xgb_params = {"objective": "reg:linear", "booster": "gbtree", "max_depth": 3,
                      "eta": 0.1, "min_child_weight": 5, "subsample": 0.5, "nthread": 4,
                      "colsample_bytree": 0.5, "num_parallel_tree": 1, 'gamma': 0}
    else:
        xgb_params = {"objective": "reg:linear", "booster": "gbtree", "max_depth": 10,
                      "eta": 0.1, "min_child_weight": 8, "subsample": 0.5, "nthread": 4,
                      "colsample_bytree": 0.5, "num_parallel_tree": 1, 'gamma': 0}

    #xgb_params = {"objective": "reg:linear", "booster": "gbtree", "max_depth": 10, "eta": 0.1, "min_child_weight": 8,
    #              "subsample": 0.5, "nthread": 4, "colsample_bytree": 0.5, "num_parallel_tree": 1, 'gamma': 0}

    models = [
        #DLModel(conf),
        #LRModel(conf, model=linear_model.BayesianRidge()),
        #LRModel(conf, model=linear_model.LassoLars(alpha=.1)),
        #LRModel(conf, model=linear_model.Lasso(alpha=0.1)),
        #LRModel(conf, model=Pipeline([('poly', PolynomialFeatures(degree=3)),
        #LRModel(conf, model=linear_model.Ridge(alpha=.5))
        #    ('linear', LinearRegression(fit_intercept=False))])),
        XGBoostModel(conf, xgb_params, use_cv=True),
        LRModel(conf, model=linear_model.Lasso(alpha=0.3)),
        RFRModel(conf, RandomForestRegressor(oob_score=True, n_jobs=4)),
        #LRModel(conf, model=linear_model.Lasso(alpha=0.2)),
        ETRModel(conf, model=ExtraTreesRegressor(n_jobs=4)),
        #AdaBoostRModel(conf, model=AdaBoostRegressor(loss='square'))
    ]
    return models
    #return [XGBoostModel(conf, xgb_params, use_cv=True)]
def lasso_regression(self, scoring_metric='neg_mean_squared_error', hyperparameter_grid=None,
                     randomized_search=True, number_iteration_samples=2):
    """
    A light wrapper for Sklearn's lasso regression that performs randomized search over an overridable
    default hyperparameter grid.

    Args:
        scoring_metric (str): Any sklearn scoring metric appropriate for regression
        hyperparameter_grid (dict): hyperparameters by name
        randomized_search (bool): True for randomized search (default)
        number_iteration_samples (int): Number of models to train during the randomized search for exploring the
            hyperparameter space. More may lead to a better model, but will take longer.

    Returns:
        TrainedSupervisedModel:
    """
    self.validate_regression('Lasso Regression')
    if hyperparameter_grid is None:
        hyperparameter_grid = {"fit_intercept": [True, False]}
        number_iteration_samples = 2

    algorithm = get_algorithm(Lasso,
                              scoring_metric,
                              hyperparameter_grid,
                              randomized_search,
                              number_iteration_samples=number_iteration_samples)

    trained_supervised_model = self._create_trained_supervised_model(algorithm)

    return trained_supervised_model
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)

    est = [ElasticNet(copy_X=False), Lasso(copy_X=False)]

    ens.add(est)
    ens.add(KNeighborsRegressor())

    return ens
def lasso():
    """Fit Lasso."""
    print("Fitting LAS...", end=" ", flush=True)
    time.sleep(SLEEP)
    t0 = time.time()
    ls = Lasso()
    ls.fit(X, y)
    print_time(t0, "Done", end="")
def elasticnet():
    """Fit Elastic Net."""
    print("Fitting ELN...", end=" ", flush=True)
    time.sleep(SLEEP)
    t0 = time.time()
    ls = Lasso()  # note: despite the ELN label, this snippet fits a Lasso estimator
    ls.fit(X, y)
    print_time(t0, "Done", end="")
def build_ensemble(kls, **kwargs):
    """Generate ensemble of class kls."""
    ens = kls(**kwargs)
    ens.add([SVR(), RandomForestRegressor(),
             GradientBoostingRegressor(), Lasso(copy_X=False),
             MLPRegressor(shuffle=False, alpha=0.001)])
    ens.add_meta(Lasso(copy_X=False))
    return ens
def spot_check(X, y):
    if type == 'regression':
        models = [
            (LinearRegression(), 'Ordinary Least Squares'),
            (Ridge(alpha=0.1), 'Ridge (alpha 0.1)'),
            (Ridge(), 'Ridge (alpha 1.0)'),
            (Lasso(alpha=0.1), 'Lasso (alpha 0.1)'),
            (Lasso(), 'Lasso (alpha 1.0)'),
            (ElasticNet(alpha=0.1), 'ElasticNet (alpha 0.1)'),
            (ElasticNet(), 'ElasticNet (alpha 1.0)'),
            (DecisionTreeRegressor(), 'Decision Tree'),
            (KNeighborsRegressor(), 'K-Nearest Neighbors'),
            # (RandomForestRegressor(), 'Random Forest Regressor'),
            # (BaggingRegressor(), 'Bagging Regressor'),
            # (GradientBoostingRegressor(), 'Gradient Bosted Regression'),
            # (SVR(), 'Support Vector Regression')
        ]

        splits = 5
        scores = []

        for model, model_name in models:
            score = check_model(model, splits, X, y)
            # get average score
            scores.append(score)

        model_names = map(lambda x: x[1], models)
        for name, score in zip(model_names, scores):
            print('%s: %f' % (name, score))
def get_classifier(self, X, Y):
    """
    Train a Lasso model.
    :param X: training features
    :param Y: training target values
    :return: the fitted model
    """
    clf = Lasso()
    clf.fit(X, Y)
    return clf
def train_lasso_model(_train_x, train_y, _predict_x):
    print_title("Lasso Regressor")

    train_x, predict_x = \
        standarize_feature(_train_x, _predict_x)

    reg = linear_model.LassoCV(
        precompute=True, cv=5, verbose=1, n_jobs=4)
    reg.fit(train_x, train_y)
    print("alphas: %s" % reg.alphas_)
    print("mse path: %s" % np.mean(reg.mse_path_, axis=1))
    itemindex = np.where(reg.alphas_ == reg.alpha_)
    print("itemindex: %s" % itemindex)
    _mse = np.mean(reg.mse_path_[itemindex[0], :])
    print("Best alpha using built-in LassoCV: %f (mse: %f)" % (reg.alpha_, _mse))

    alpha = reg.alpha_
    reg = linear_model.Lasso(alpha=alpha)
    reg.fit(train_x, train_y)
    n_nonzeros = (reg.coef_ != 0).sum()
    print("Non-zero coefs: %d" % n_nonzeros)

    predict_y = reg.predict(predict_x)
    train_y_pred = reg.predict(train_x)

    return {"y": predict_y, "train_y": train_y_pred, "coef": reg.coef_}
def lasso(train, test, label, alpha=0.00099, max_iteration=50000):
    lasso = Lasso(alpha=alpha, max_iter=max_iteration)
    lasso.fit(train, label)

    # prediction on training data
    y_prediction = lasso.predict(train)
    y_test = label
    print("Lasso score on training set: ", rmse(y_test, y_prediction))

    y_prediction = lasso.predict(test)
    y_prediction = np.exp(y_prediction)
    return y_prediction
def test_regressor_cv(self):
    """
    Ensure only "CV" regressors are allowed
    """
    for model in (SVR, Ridge, Lasso, LassoLars, ElasticNet):
        with self.assertRaises(YellowbrickTypeError):
            alphas = AlphaSelection(model())

    for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV):
        try:
            alphas = AlphaSelection(model())
        except YellowbrickTypeError:
            self.fail("could not instantiate RegressorCV on alpha selection")
def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0

    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)

    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")
        for symptom in symptoms:
            print(symptom)
            s_Y = Y[symptom]  # target column for the current symptom
            pipeline = Pipeline([
                ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
                #('standard_scale', StandardScaler()),
                ('estimator', Lasso()),
            ])
            param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
            model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4, verbose=2)
            model.fit(X, s_Y.values)

            print("dumping...")
            data_dir = 'data'
            cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
            c_length = {k: v for k, v in zip(cycles0.user_id.values, cycles0.expected_cycle_length)}
            dump(symptom, model, X_all, c_length, data['users'].user_id)
def estimate(self, a, y, initial_x=None):
    """
    :param a: MxN matrix A in the y=Ax equation
    :type a: numpy.ndarray
    :param y: M vector y in the y=Ax equation
    :type y: numpy.ndarray
    :param initial_x: N vector of an initial solution
    :type initial_x: numpy.ndarray

    :return: best estimation of the N vector x in the y=Ax equation
    :rtype: numpy.ndarray

    :Example:

    >>> import numpy as np
    >>> import linvpy as lp

    >>> a = np.matrix([[1, 2], [3, 4], [5, 6]])
    >>> y = np.array([1, 2, 3])

    >>> m = lp.MEstimator()
    >>> m.estimate(a,y)
    array([ -2.95552481e-16,   5.00000000e-01])

    >>> m_ = lp.MEstimator(loss_function=lp.Bisquare, clipping=2.23, \
    regularization=lp.Lasso(), lamb=3)
    >>> initial_solution = np.array([1, 2])
    >>> m_.estimate(a, y, initial_x=initial_solution)
    array([ 0.,  0.])
    """
    return self.irls(a, y, initial_x)
def build_model(train_file, attr_file, model_out, algorithm='ridge'):
    classifiers = ['ridge', 'linear', 'lasso', 'rf', 'en']
    if algorithm not in classifiers:
        raise NotImplementedError("only implemented algorithms: " + str(classifiers))

    train_data = pd.read_pickle(train_file)
    attrs = read_attrs(attr_file)
    target_attr = attrs[0]
    usable_attrs = attrs[1:]

    if algorithm == 'ridge':
        clf = Ridge()
    elif algorithm == 'linear':
        clf = LinearRegression()
    elif algorithm == 'lasso':
        clf = Lasso()
    elif algorithm == 'en':
        clf = ElasticNet()
    else:
        clf = RandomForestRegressor()

    logger.debug("Modeling '%s'", target_attr)
    logger.debug("  train set (%d): %s", len(train_data), train_file)
    logger.debug("  Algorithm: %s", algorithm)

    clf.fit(train_data[usable_attrs], train_data[target_attr])

    # coefficients are only available after fitting
    if hasattr(clf, 'coef_'):
        logger.debug('Coefficients:')
        for i, c in enumerate(clf.coef_):
            logger.debug('  %-20s: %20.4f' % (usable_attrs[i], c))

    pickle.dump(clf, open(model_out, 'wb'))
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_splits=5, n_jobs=-1):
    print(xM.shape, yV.shape)

    clf = linear_model.Lasso()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    #kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf5_c, n_jobs=n_jobs)
    gs.fit(xM, yV)

    return gs
def gs_Lasso_norm(xM, yV, alphas_log=(-1, 1, 9)):
    print(xM.shape, yV.shape)

    clf = linear_model.Lasso(normalize=True)
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=5, shuffle=True)
    #kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(clf, parmas, scoring='r2', cv=kf5_c, n_jobs=-1)
    gs.fit(xM, yV)

    return gs