def train_lassolars_model(train_x, train_y, predict_x):
    """Pick a LassoLars penalty by 10-fold CV, refit with it, and predict.

    Diagnostics (alpha grid, CV error of the selected alpha, number of
    non-zero coefficients) are printed along the way.

    Parameters
    ----------
    train_x, train_y
        Training design matrix and targets, passed straight to ``fit``.
    predict_x
        Samples to predict with the refitted model.

    Returns
    -------
    dict
        ``{'y': predictions for predict_x, 'coef': fitted coefficients}``.
    """
    print_title("LassoLars Regressor")

    reg = linear_model.LassoLarsCV(
        cv=10, n_jobs=3, max_iter=2000, normalize=False)
    reg.fit(train_x, train_y)

    print("alphas and cv_alphas: {0} and {1}".format(
        reg.alphas_.shape, reg.cv_alphas_.shape))
    print("alphas[%d]: %s" % (len(reg.cv_alphas_), reg.cv_alphas_))
    print("mse shape: {0}".format(reg.cv_mse_path_.shape))

    best_alpha = reg.alpha_
    # Locate the CV row of the selected alpha by nearest match rather than
    # exact float equality, so the lookup can never come back empty.
    best_row = int(np.argmin(np.abs(reg.cv_alphas_ - best_alpha)))
    _mse_v = np.mean(reg.cv_mse_path_[best_row, :])
    print("mse value: %f" % _mse_v)
    print("best alpha: %f" % best_alpha)

    # Refit with the same normalisation / iteration budget that was used
    # while selecting alpha; the penalty scale is only meaningful under
    # the feature scaling it was tuned with.
    reg = linear_model.LassoLars(
        alpha=best_alpha, max_iter=2000, normalize=False)
    reg.fit(train_x, train_y)

    n_nonzeros = (reg.coef_ != 0).sum()
    print("Non-zeros coef: %d" % n_nonzeros)

    predict_y = reg.predict(predict_x)
    return {'y': predict_y, "coef": reg.coef_}
def test_rank_deficient_design():
    # Consistency check: on rank deficient designs the LARS Lasso must
    # reach an objective value at least as good as coordinate descent.
    y = [5, 0, 5]
    designs = (
        [[5, 0], [0, 5], [10, 10]],
        [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]],
    )

    def lasso_objective(design, coef):
        # Lasso objective for 3 samples and a penalty of 0.1.
        residual_norm = linalg.norm(y - np.dot(design, coef))
        return (1. / (2. * 3.) * residual_norm ** 2
                + .1 * linalg.norm(coef, 1))

    for X in designs:
        # Normalization is switched off so the fitted coefficients can be
        # plugged directly into the objective function.
        lars = linear_model.LassoLars(alpha=.1, normalize=False)
        obj_lars = lasso_objective(X, lars.fit(X, y).coef_)

        coord_descent = linear_model.Lasso(alpha=.1, tol=1e-6,
                                           normalize=False)
        obj_cd = lasso_objective(X, coord_descent.fit(X, y).coef_)

        assert_less(obj_lars, obj_cd * (1. + 1e-8))
def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when early stopping is used.
    # (test : before, in the middle, and in the last part of the path)
    #
    # ``verbose`` is unused; it is kept so the signature stays unchanged.
    alphas_min = [10, 0.9, 1e-4]

    # Each stopping point must actually be passed to lars_path; the loop
    # variable must not shadow the list it iterates over.
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(
            X, y, method='lasso', alpha_min=alpha_min)
        lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)

    # same test, with normalization
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(
            X, y, method='lasso', alpha_min=alpha_min)
        lasso_cd = linear_model.Lasso(fit_intercept=True, normalize=True,
                                      tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)
def test_multitarget():
    # Estimators given a 2-D target must behave like one independent fit
    # per target column.
    X = diabetes.data
    Y = np.vstack([diabetes.target, diabetes.target ** 2]).T
    n_targets = Y.shape[1]
    compared_attrs = ('alphas_', 'active_', 'coef_', 'coef_path_')

    for estimator in (linear_model.LassoLars(), linear_model.Lars()):
        estimator.fit(X, Y)
        Y_pred = estimator.predict(X)
        Y_dec = assert_warns(DeprecationWarning,
                             estimator.decision_function, X)
        assert_array_almost_equal(Y_pred, Y_dec)

        # Snapshot the joint-fit results before the estimator is refitted
        # on the individual columns below.
        joint_fit = [getattr(estimator, attr) for attr in compared_attrs]

        for k in range(n_targets):
            estimator.fit(X, Y[:, k])
            y_pred = estimator.predict(X)
            for attr, joint_value in zip(compared_attrs, joint_fit):
                assert_array_almost_equal(joint_value[k],
                                          getattr(estimator, attr))
            assert_array_almost_equal(Y_pred[:, k], y_pred)
def train(self):
    """Fit a LassoLars model on the outlier-truncated training frame.

    Rows whose ``logerror`` lies outside the open interval
    ``(self._low, self._up)`` are dropped, the longitude/latitude columns
    are offset by fixed constants, the columns in ``self._l_drop_cols``
    are removed, and the rest is fitted as float32 features against
    ``logerror``.

    Sets ``self._l_train_columns``, ``self._model`` and
    ``self._f_eval_train_model``; ``self.TrainData`` is left untouched.
    """
    start = time.time()

    print('size before truncated outliers is %d ' % len(self.TrainData))
    inlier_mask = ((self.TrainData['logerror'] > self._low)
                   & (self.TrainData['logerror'] < self._up))
    # Work on an explicit copy so the in-place offsets below neither write
    # through to self.TrainData nor trigger pandas' chained-assignment
    # warning.
    TrainData = self.TrainData[inlier_mask].copy()
    print('size after truncated outliers is %d ' % len(TrainData))

    # Offset the coordinates before the float32 cast below.
    # NOTE(review): presumably this keeps the large raw coordinate values
    # within float32 precision; whatever frame is later fed to the model
    # must receive the same offsets - confirm against the predict path.
    TrainData['longitude'] -= -118600000
    TrainData['latitude'] -= 34220000

    # Train on the filtered/offset frame; the unfiltered self.TrainData
    # would make the outlier truncation above a no-op.
    X = TrainData.drop(self._l_drop_cols, axis=1)
    Y = TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)

    lr = LassoLars(alpha=self._lr_alpha, max_iter=self._lr_iter,
                   verbose=True)
    self._model = lr.fit(X, Y)
    end = time.time()

    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

    # Only the target path is recorded here; nothing is written to disk.
    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(
        self.OutputDir, self.__class__.__name__,
        datetime.now().strftime('%Y%m%d-%H:%M:%S'))

    return
def get_models4ensamble(conf):
    """Build the list of model wrappers that make up the ensemble.

    ``conf.command == 1`` selects a shallower XGBoost configuration
    (max_depth 3, min_child_weight 5); any other value selects the deeper
    one (max_depth 10, min_child_weight 8). All other XGBoost settings are
    shared.

    Returns, in order: XGBoost, Lasso (alpha 0.3), random forest and
    extra-trees wrappers.
    """
    # see http://scikit-learn.org/stable/modules/linear_model.html
    # Original author's note: "0 was too big to run with depth set to 1,
    # and 1 was overfitting a bit".
    if conf.command == 1:
        max_depth, min_child_weight = 3, 5
    else:
        max_depth, min_child_weight = 10, 8

    xgb_params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "max_depth": max_depth,
        "eta": 0.1,
        "min_child_weight": min_child_weight,
        "subsample": 0.5,
        "nthread": 4,
        "colsample_bytree": 0.5,
        "num_parallel_tree": 1,
        'gamma': 0,
    }

    # Candidates that were tried and are currently disabled: DLModel,
    # BayesianRidge, LassoLars(alpha=.1), Lasso(alpha=0.1 / 0.2),
    # Ridge(alpha=.5), a degree-3 polynomial + LinearRegression pipeline,
    # and AdaBoostRegressor(loss='square').
    return [
        XGBoostModel(conf, xgb_params, use_cv=True),
        LRModel(conf, model=linear_model.Lasso(alpha=0.3)),
        RFRModel(conf, RandomForestRegressor(oob_score=True, n_jobs=4)),
        ETRModel(conf, model=ExtraTreesRegressor(n_jobs=4)),
    ]
def test_regressor_cv(self):
    """
    Ensure only "CV" regressors are allowed
    """
    plain_regressors = (SVR, Ridge, Lasso, LassoLars, ElasticNet)
    cv_regressors = (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV)

    # Estimators without built-in cross-validation must be rejected.
    for model in plain_regressors:
        with self.assertRaises(YellowbrickTypeError):
            AlphaSelection(model())

    # Their "CV" counterparts must be accepted.
    for model in cv_regressors:
        try:
            AlphaSelection(model())
        except YellowbrickTypeError:
            self.fail("could not instantiate RegressorCV on alpha selection")
def getModels():
    """Return the names of all supported models as a new list.

    The regressors come first, followed by the classifiers. The spelling
    "Rigid" is kept exactly as-is because it is the identifier this list
    has always exposed.
    """
    return [
        # regressors
        "LinearRegression",
        "BayesianRidge",
        "ARDRegression",
        "ElasticNet",
        "HuberRegressor",
        "Lasso",
        "LassoLars",
        "Rigid",
        "SGDRegressor",
        "SVR",
        # classifiers
        "MLPClassifier",
        "KNeighborsClassifier",
        "SVC",
        "GaussianProcessClassifier",
        "DecisionTreeClassifier",
        "RandomForestClassifier",
        "AdaBoostClassifier",
        "GaussianNB",
        "LogisticRegression",
        "QuadraticDiscriminantAnalysis",
    ]
def compute_bench(alpha, n_samples, n_features, precompute):
    """Time Lasso (coordinate descent) against LassoLars on synthetic data.

    One regression problem is generated for every combination of
    ``n_samples`` x ``n_features``; both estimators are fitted on it and
    the wall-clock fit times are recorded.

    Parameters
    ----------
    alpha
        Penalty passed to both estimators.
    n_samples, n_features
        Sized iterables of sample counts / feature counts to benchmark.
    precompute
        Forwarded to both estimators' ``precompute`` argument.

    Returns
    -------
    (lasso_results, lars_lasso_results)
        Two lists of fit durations in seconds, in iteration order.
    """
    lasso_results = []
    lars_lasso_results = []

    # The nested loops visit every (ns, nf) pair, so the progress total is
    # the product of the two lengths (equal to the max only when one of
    # them has a single element).
    n_iterations = len(n_samples) * len(n_features)

    it = 0
    for ns in n_samples:
        for nf in n_features:
            it += 1
            print('==================')
            print('Iteration %s of %s' % (it, n_iterations))
            print('==================')

            n_informative = nf // 10
            X, Y, coef_ = make_regression(n_samples=ns, n_features=nf,
                                          n_informative=n_informative,
                                          noise=0.1, coef=True)

            X /= np.sqrt(np.sum(X ** 2, axis=0))  # Normalize data

            # Collect garbage before each timed section so a GC pause is
            # less likely to land inside the measurement.
            gc.collect()
            print("- benchmarking Lasso")
            clf = Lasso(alpha=alpha, fit_intercept=False,
                        precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lasso_results.append(time() - tstart)

            gc.collect()
            print("- benchmarking LassoLars")
            clf = LassoLars(alpha=alpha, fit_intercept=False,
                            normalize=False, precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lars_lasso_results.append(time() - tstart)

    return lasso_results, lars_lasso_results
def test_lars_lstsq():
    # With a zero penalty the end of the LARS path must coincide with the
    # ordinary least squares solution.
    X1 = 3 * diabetes.data  # use un-normalized dataset

    lars_coef = linear_model.LassoLars(alpha=0.).fit(X1, y).coef_
    lstsq_solution = np.linalg.lstsq(X1, y)
    coef_lstsq = lstsq_solution[0]

    assert_array_almost_equal(lars_coef, coef_lstsq)
def test_lasso_lars_vs_lasso_cd(verbose=False):
    # LassoLars and coordinate descent Lasso must agree along the whole
    # regularisation path. ``verbose`` is accepted but not used.

    def check_path_against_cd(design, lasso_cd):
        # Refit coordinate descent at every non-zero alpha of the LARS
        # path and compare the coefficient vectors.
        alphas, _, lasso_path = linear_model.lars_path(design, y,
                                                       method='lasso')
        steps = [(c, a) for c, a in zip(lasso_path.T, alphas) if a != 0]
        for path_coef, path_alpha in steps:
            lasso_cd.alpha = path_alpha
            lasso_cd.fit(design, y)
            error = linalg.norm(path_coef - lasso_cd.coef_)
            assert_less(error, 0.01)

    # un-normalized data
    scaled_X = 3 * diabetes.data
    check_path_against_cd(
        scaled_X, linear_model.Lasso(fit_intercept=False, tol=1e-8))

    # similar test, with the classifiers
    for alpha in np.linspace(1e-2, 1 - 1e-2, 20):
        lars_fit = linear_model.LassoLars(
            alpha=alpha, normalize=False).fit(scaled_X, y)
        cd_fit = linear_model.Lasso(
            alpha=alpha, tol=1e-8, normalize=False).fit(scaled_X, y)
        err = linalg.norm(lars_fit.coef_ - cd_fit.coef_)
        assert_less(err, 1e-3)

    # same test, with normalized data
    check_path_against_cd(
        diabetes.data,
        linear_model.Lasso(fit_intercept=False, normalize=True, tol=1e-8))
def test_lasso_lars_vs_lasso_cd_ill_conditioned2():
    # Create an ill-conditioned situation in which the LARS has to go
    # far in the path to converge, and check that LARS and coordinate
    # descent give the same answers
    # Note it used to be the case that Lars had to use the drop for good
    # strategy for this but this is no longer the case with the
    # equality_tolerance checks
    X = [[1e20, 1e20, 0],
         [-1e-32, 0, 0],
         [1, 1, 1]]
    y = [10, 10, 1]
    alpha = .0001

    def objective_function(coef):
        # Lasso objective for the design/target defined above.
        data_fit = linalg.norm(y - np.dot(X, coef)) ** 2
        return (1. / (2. * len(X)) * data_fit
                + alpha * linalg.norm(coef, 1))

    # LARS is expected to emit a ConvergenceWarning on this problem.
    lars = linear_model.LassoLars(alpha=alpha, normalize=False)
    assert_warns(ConvergenceWarning, lars.fit, X, y)
    lars_obj = objective_function(lars.coef_)

    coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-4,
                                       normalize=False)
    coord_descent.fit(X, y)
    cd_obj = objective_function(coord_descent.coef_)

    assert_less(lars_obj, cd_obj * (1. + 1e-8))
def getSKLearnModel(modelName):
    """Instantiate the estimator registered under ``modelName``.

    Every estimator is created with its default constructor arguments.

    Parameters
    ----------
    modelName : str
        One of the supported model names. ``'Rigid'`` is the legacy
        (misspelled) key for ridge regression and keeps working;
        ``'Ridge'`` is accepted as an alias.

    Returns
    -------
    A fresh, unfitted estimator instance.

    Raises
    ------
    ValueError
        If ``modelName`` is not a supported name (previously this surfaced
        as an ``UnboundLocalError`` on the return statement).
    """
    # Factories are lazy so that only the requested estimator class is
    # looked up and instantiated.
    factories = {
        'LinearRegression': lambda: linear_model.LinearRegression(),
        'BayesianRidge': lambda: linear_model.BayesianRidge(),
        'ARDRegression': lambda: linear_model.ARDRegression(),
        'ElasticNet': lambda: linear_model.ElasticNet(),
        'HuberRegressor': lambda: linear_model.HuberRegressor(),
        'Lasso': lambda: linear_model.Lasso(),
        'LassoLars': lambda: linear_model.LassoLars(),
        'Rigid': lambda: linear_model.Ridge(),
        'Ridge': lambda: linear_model.Ridge(),
        'SGDRegressor': lambda: linear_model.SGDRegressor(),
        'SVR': lambda: SVR(),
        'MLPClassifier': lambda: MLPClassifier(),
        'KNeighborsClassifier': lambda: KNeighborsClassifier(),
        'SVC': lambda: SVC(),
        'GaussianProcessClassifier': lambda: GaussianProcessClassifier(),
        'DecisionTreeClassifier': lambda: DecisionTreeClassifier(),
        'RandomForestClassifier': lambda: RandomForestClassifier(),
        'AdaBoostClassifier': lambda: AdaBoostClassifier(),
        'GaussianNB': lambda: GaussianNB(),
        'LogisticRegression': lambda: linear_model.LogisticRegression(),
        'QuadraticDiscriminantAnalysis':
            lambda: QuadraticDiscriminantAnalysis(),
    }

    factory = factories.get(modelName)
    if factory is None:
        raise ValueError("Unknown model name: %r" % (modelName,))
    return factory()
def test_lasso_lars_vs_lasso_cd_positive(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when using the positive option
    #
    # This mirrors the test without the positive option, except that the
    # middle part (coefficient comparison over a range of alphas) had to
    # be adapted; see below. ``verbose`` is accepted but not used.

    def assert_cd_matches(design, cd_model, path_coef, path_alpha):
        # Refit coordinate descent at one alpha of the LARS path and
        # compare it with the path's coefficients.
        cd_model.alpha = path_alpha
        cd_model.fit(design, y)
        error = linalg.norm(path_coef - cd_model.coef_)
        assert_less(error, 0.01)

    # not normalized data
    scaled_X = 3 * diabetes.data
    alphas, _, lasso_path = linear_model.lars_path(
        scaled_X, y, method='lasso', positive=True)
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8,
                                  positive=True)
    for c, a in zip(lasso_path.T, alphas):
        if a != 0:
            assert_cd_matches(scaled_X, lasso_cd, c, a)

    # The range of alphas chosen for coefficient comparison here is
    # restricted as compared with the test without the positive option.
    # This is due to the circumstance that the Lars-Lasso algorithm does
    # not converge to the least-squares-solution for small alphas, see
    # 'Least Angle Regression' by Efron et al 2004. The coefficients are
    # typically in congruence up to the smallest alpha reached by the
    # Lars-Lasso algorithm and start to diverge thereafter. See
    # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff
    for alpha in np.linspace(6e-1, 1 - 1e-2, 20):
        lars_fit = linear_model.LassoLars(
            fit_intercept=False, alpha=alpha, normalize=False,
            positive=True).fit(scaled_X, y)
        cd_fit = linear_model.Lasso(
            fit_intercept=False, alpha=alpha, tol=1e-8, normalize=False,
            positive=True).fit(scaled_X, y)
        err = linalg.norm(lars_fit.coef_ - cd_fit.coef_)
        assert_less(err, 1e-3)

    # normalized data
    raw_X = diabetes.data
    alphas, _, lasso_path = linear_model.lars_path(
        raw_X, y, method='lasso', positive=True)
    lasso_cd = linear_model.Lasso(fit_intercept=False, normalize=True,
                                  tol=1e-8, positive=True)
    # The last step of the path (alpha=0) is left out.
    for c, a in zip(lasso_path.T[:-1], alphas[:-1]):
        assert_cd_matches(raw_X, lasso_cd, c, a)
def get_model_list(task_name):
    """Return the regression models to evaluate together with their labels.

    Parameters
    ----------
    task_name
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    (model_list, name_list)
        Two lists of equal length; ``name_list[i]`` is the short label of
        ``model_list[i]``.
    """
    # Each label is declared next to its estimator so that disabling an
    # entry can never leave a dangling name (as the half-disabled SGD
    # entry could) and shift every later label by one.
    candidates = [
        ('LR', linear_model.LinearRegression()),
        # ('LR_SGD', linear_model.SGDRegressor()),  # disabled
        ('Lasso', linear_model.Lasso(alpha=1.0)),
        ('Ridge', linear_model.Ridge(alpha=1.0)),
        ('LassoLars', linear_model.LassoLars(alpha=.1)),
        ('BayesianRidge', linear_model.BayesianRidge()),
        ('KernelRidge', KernelRidge(alpha=1.0)),
        ('GaussianProcess',
         gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4,
                                          thetaU=1e-1)),
        ('KNN_unif', KNeighborsRegressor(weights='uniform', n_neighbors=3)),
        ('KNN_dist', KNeighborsRegressor(weights='distance', n_neighbors=3)),
        ('SVM_linear',
         SVR(kernel='linear', C=1, gamma='auto', coef0=0, degree=2)),
        ('SVM_poly',
         SVR(kernel='poly', C=1, gamma='auto', coef0=0, degree=2)),
        ('SVM_rbf',
         SVR(kernel='rbf', C=1, gamma='auto', coef0=0, degree=2)),
        ('DT', DecisionTreeRegressor()),
        ('RF',
         RandomForestRegressor(n_estimators=100, max_depth=None,
                               min_samples_split=2, random_state=0)),
        ('ET',
         ExtraTreesRegressor(n_estimators=100, max_depth=None,
                             max_features='auto', min_samples_split=2,
                             random_state=0)),
    ]

    model_list = [model for _, model in candidates]
    name_list = [name for name, _ in candidates]
    return model_list, name_list