The following 50 code examples, extracted from open source Python projects, illustrate how to use sklearn.linear_model.LinearRegression().
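Before the project examples, here is a minimal, self-contained sketch of the basic fit/predict workflow. The synthetic data is invented for illustration and does not come from any of the projects below.

import numpy as np
from sklearn.linear_model import LinearRegression

X = np.arange(20, dtype=float).reshape(-1, 1)   # one feature, 20 samples
y = 3.0 * X.ravel() + 2.0                       # y = 3x + 2, no noise

model = LinearRegression()
model.fit(X, y)
print(model.coef_, model.intercept_)            # ~[3.0], ~2.0
print(model.predict([[25.0]]))                  # ~[77.0]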
def test_stacked_regressor(self):
    bclf = LinearRegression()
    clfs = [RandomForestRegressor(n_estimators=50, random_state=1),
            GradientBoostingRegressor(n_estimators=25, random_state=1),
            Ridge(random_state=1)]

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)

    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    sr = StackedRegressor(bclf, clfs, n_folds=3, verbose=0, oob_score_flag=True)
    sr.fit(X_train, y_train)
    mse = mean_squared_error(y_test, sr.predict(X_test))
    assert_less(mse, 6.0)
def test_fwls_regressor(self):
    feature_func = lambda x: np.ones(x.shape)
    bclf = LinearRegression()
    clfs = [RandomForestRegressor(n_estimators=50, random_state=1),
            GradientBoostingRegressor(n_estimators=25, random_state=1),
            Ridge(random_state=1)]

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)

    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    sr = FWLSRegressor(bclf, clfs, feature_func, n_folds=3, verbose=0, oob_score_flag=True)
    sr.fit(X_train, y_train)
    mse = mean_squared_error(y_test, sr.predict(X_test))
    assert_less(mse, 6.0)
def scatter_regresion_Plot(X, Y, testName):
    plt.scatter(X, Y, c='b', label='_nolegend_', s=1)

    X = X.reshape(-1, 1)
    Y = Y.reshape(-1, 1)
    R2 = r2_score(X, Y)

    regr = linear_model.LinearRegression()
    regr.fit(X, Y)
    plt.plot(X, regr.predict(X), "--", label='Regression', color='r')

    plt.title(testName + ' ($R^2$: ' + "{0:.3f}".format(R2) + ")", fontsize=14)
    plt.xlabel('True Values', fontsize=12, weight='bold')
    plt.ylabel('Predicted Values', fontsize=12, weight='bold')
    plt.legend(loc='upper left', bbox_to_anchor=(0, 1.0), fancybox=True, shadow=True, fontsize=10)
    plt.subplots_adjust(left=0.2, right=0.9, bottom=0.05, top=0.97, wspace=0.15, hspace=0.3)
def model_cross_valid(X, Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)

    def build_model(model_name):
        model = model_name()
        return model

    scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt
    for model_name in [LinearRegression, ElasticNet]:
    #for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def fit_lr(train_X, train_y, test_X):
    """
    Use linear regression to predict.

    :param train_X: training features
    :param train_y: training targets
    :param test_X: test features
    :return: (model description, train predictions, test predictions)
    """
    lr = LinearRegression()
    lr.fit(train_X, train_y)
    yhat_train = lr.predict(train_X)
    yhat_test = lr.predict(test_X)
    model = "LR int %.2f coefs %s" % (lr.intercept_, pprint(lr.coef_))
    return model, yhat_train, yhat_test
def __init__(self, model, statistics_calc, backend, n_samples=1000, seed=None):
    self.model = model
    self.statistics_calc = statistics_calc
    self.backend = backend
    self.rng = np.random.RandomState(seed)
    self.model.prior.reseed(self.rng.randint(np.iinfo(np.uint32).max, dtype=np.uint32))

    # main algorithm
    seed_arr = self.rng.randint(1, n_samples * n_samples, size=n_samples, dtype=np.int32)
    seed_pds = self.backend.parallelize(seed_arr)

    sample_parameters_statistics_pds = self.backend.map(self._sample_parameter_statistics, seed_pds)
    sample_parameters_and_statistics = self.backend.collect(sample_parameters_statistics_pds)
    sample_parameters, sample_statistics = [list(t) for t in zip(*sample_parameters_and_statistics)]
    sample_parameters = np.array(sample_parameters)
    sample_statistics = np.concatenate(sample_statistics)

    self.coefficients_learnt = np.zeros(shape=(sample_parameters.shape[1], sample_statistics.shape[1]))
    regr = linear_model.LinearRegression(fit_intercept=True)
    for ind in range(sample_parameters.shape[1]):
        regr.fit(sample_statistics, sample_parameters[:, ind])
        self.coefficients_learnt[ind, :] = regr.coef_
def calculate_residual_correlation_matrix(returns):
    # find the market return constraining on the selected companies (first PCA)
    # regress each stock on that and find correlation of residuals
    returns_matrix = returns.as_matrix().transpose()
    covar_matrix = np.cov(returns_matrix)
    pca = decomposition.PCA(n_components=1)
    pca.fit(covar_matrix)
    X = pca.transform(covar_matrix)
    regr = linear_model.LinearRegression()
    dim = covar_matrix.shape[1]
    res = np.zeros(shape=(dim, dim))
    for x in range(0, dim):
        regr = linear_model.LinearRegression()
        regr = regr.fit(X, covar_matrix[:, x])
        res[:, x] = covar_matrix[:, x] - regr.predict(X)
    res_corr = np.corrcoef(res)
    return pd.DataFrame(res_corr, index=returns.columns, columns=returns.columns)
def fit_regression(X, y, regression_class=LinearRegression, regularization_const=.001):
    '''
    Fit a regression model to a dataset and its targets (X, y), using the given
    scikit-learn regression class. The regularization constant is required if
    the regression class is Lasso or Ridge.

    X (pandas DataFrame): The data.
    y (pandas DataFrame or Series): The answers.
    regression_class (class): One of sklearn.linear_model.[LinearRegression, Ridge, Lasso]
    regularization_const: The regularization (lambda) value for Ridge or Lasso.
        Called alpha by scikit-learn for interface reasons.

    Return: tuple, (the_fitted_regressor, mean(cross_val_score)).
    '''
    if regression_class is LinearRegression:
        predictor = regression_class()
    else:
        predictor = regression_class(alpha=regularization_const, normalize=True)

    predictor.fit(X, y)

    cross_scores = cross_val_score(predictor, X, y=y, scoring='neg_mean_squared_error')
    # scikit-learn returns negative values, so negate before taking the root
    cross_scores_corrected = np.sqrt(-1 * cross_scores)

    return (predictor, np.mean(cross_scores_corrected))
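A hypothetical usage sketch for fit_regression above. The toy DataFrame and target are invented for illustration; the Ridge branch relies on the snippet's normalize=True argument, which assumes an older scikit-learn.

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(100, 3), columns=['a', 'b', 'c'])
y = 2.0 * X['a'] - X['b'] + rng.normal(0, 0.1, 100)

lr_model, lr_rmse = fit_regression(X, y)                    # plain least squares
ridge_model, ridge_rmse = fit_regression(X, y, Ridge, 0.5)  # regularized variant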
def test_least_square_model(prostate_data):
    from esl_model.ch3.models import LeastSquareModel
    train_x, train_y, test_x, test_y, features = prostate_data

    lsm = LeastSquareModel(train_x=train_x, train_y=train_y, features_name=features)
    lsm.pre_processing()
    lsm.train()

    print(lsm.beta_hat)
    print('rss:', lsm.rss)
    print('F-statistic', lsm.F_statistic(remove_cols=['age', 'lcp', 'gleason', 'pgg45']))
    print('z-score', lsm.z_score)

    result = lsm.test(test_x, test_y)
    print('test error: ', result.mse)

    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()
    lr.fit(train_x, train_y)

    print('std error', result.std_error)
    assert np.isclose(result.mse, np.mean(((lr.predict(test_x)) - test_y) ** 2))
def rolling_beta(X, y, idx, window=100):
    assert len(X) == len(y)
    out_dates = []
    out_beta = []

    model_ols = linear_model.LinearRegression()

    for iStart in range(0, len(X) - window):
        iEnd = iStart + window

        _x = X[iStart:iEnd].values.reshape(-1, 1)
        _y = y[iStart:iEnd].values.reshape(-1, 1)

        model_ols.fit(_x, _y)

        # store output
        out_dates.append(idx[iEnd])
        out_beta.append(model_ols.coef_[0][0])

    return pd.DataFrame({'beta': out_beta}, index=out_dates)
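A hypothetical usage of rolling_beta above: X and y as pandas Series of returns sharing a DatetimeIndex. The synthetic data is invented for illustration.

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=300, freq='D')
market = pd.Series(np.random.normal(0, 0.01, 300), index=idx)
stock = 1.5 * market + pd.Series(np.random.normal(0, 0.002, 300), index=idx)

betas = rolling_beta(market, stock, idx, window=100)
print(betas.tail())  # the rolling beta should hover near 1.5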
def test_linear_regressor(self):
    for dtype in self.number_data_type.keys():
        scikit_model = LinearRegression(normalize=True)
        data = self.scikit_data['data'].astype(dtype)
        target = self.scikit_data['target'].astype(dtype)
        scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
        test_data = data[0].reshape(1, -1)
        coreml_model = create_model(spec)
        try:
            self.assertEqual(scikit_model.predict(test_data)[0].dtype,
                             type(coreml_model.predict({'data': test_data})['target']))
            self.assertAlmostEqual(scikit_model.predict(test_data)[0],
                                   coreml_model.predict({'data': test_data})['target'],
                                   msg="{} != {} for Dtype: {}".format(
                                       scikit_model.predict(test_data)[0],
                                       coreml_model.predict({'data': test_data})['target'],
                                       dtype))
        except RuntimeError:
            print("{} not supported. ".format(dtype))
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return

    scikit_data = load_boston()
    feature_names = scikit_data.feature_names

    scikit_model = LinearRegression()
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_SKLEARN:
        return

    scikit_data = load_boston()
    feature_names = scikit_data.feature_names

    scikit_model = Pipeline(steps=[('linear', LinearRegression())])
    scikit_model.fit(scikit_data['data'], scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
def test_linear_regression_evaluation(self):
    """
    Check that the evaluation results are the same in scikit learn and coremltools
    """
    input_names = self.scikit_data.feature_names
    df = pd.DataFrame(self.scikit_data.data, columns=input_names)

    for normalize_value in (True, False):
        cur_model = LinearRegression(normalize=normalize_value)
        cur_model.fit(self.scikit_data['data'], self.scikit_data['target'])
        spec = convert(cur_model, input_names, 'target')

        df['prediction'] = cur_model.predict(self.scikit_data.data)

        metrics = evaluate_regressor(spec, df)
        self.assertAlmostEquals(metrics['max_error'], 0)
def find_parameters_w(X, Y):
    """Find the parameter values w for the model which best fits X and Y.

    Args:
        X: A 2-dimensional numpy array representing the independent variables
            in the linear regression model.
        Y: A numpy array of floats representing the dependent variables in the
            linear regression model.

    Returns:
        A tuple (w0, w1, w2, w3, w4) representing the parameter values w.
    """
    clf = linear_model.LinearRegression()
    clf.fit(X, Y)
    w0 = clf.intercept_
    w1, w2, w3, w4 = clf.coef_
    return w0, w1, w2, w3, w4
def predict_price(dates, prices, x):
    dates = np.reshape(dates, (len(dates), 1))    # converting to matrix of n X 1
    prices = np.reshape(prices, (len(prices), 1))

    linear_mod = linear_model.LinearRegression()  # defining the linear regression model
    linear_mod.fit(dates, prices)                 # fitting the data points in the model

    plt.scatter(dates, prices, color='black', label='Data')  # plotting the initial datapoints
    plt.plot(dates, linear_mod.predict(dates), color='red',
             label='Linear model')  # plotting the line made by linear regression
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Linear Regression')
    plt.legend()
    plt.show()

    return linear_mod.predict(x)[0][0], linear_mod.coef_[0][0], linear_mod.intercept_[0]
def prepare_fit_model_for_factors(model_type, x_train, y_train):
    """
    Given a model type and training data, return a fit model.

    Args:
        model_type (str): 'classification' or 'regression'
        x_train: training features
        y_train: training labels

    Returns:
        (sklearn.base.BaseEstimator): A fit model.
    """
    if model_type == 'classification':
        algorithm = LogisticRegression()
    elif model_type == 'regression':
        algorithm = LinearRegression()
    else:
        algorithm = None

    if algorithm is not None:
        algorithm.fit(x_train, y_train)

    return algorithm
def regression_murder(year):
    # applies linear regression on murder rates
    murder = pd.DataFrame()
    dates = crime_rate_df.index.values.tolist()
    murder['label'] = crime_rate_df['Murder and\nnonnegligent \nmanslaughter']
    prediction_size = int(0.1 * len(murder))
    X = np.array(dates)
    y = np.array(murder['label'])
    y.reshape((len(X), 1))
    y_train = y[:-prediction_size]
    X_train = X[:-prediction_size]
    clf = LinearRegression()
    clf.fit(X_train.reshape(-1, 1), y_train)
    regression_line = [clf.predict(X_train[i].reshape(1, -1)) for i in range(len(X_train))]
    print(clf.predict(year))
    plt.scatter(X_train, y_train)
    plt.plot(X_train, regression_line)
    plt.show()
def linear_regression():
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # Look at predictions on training and validation set
    print("RMSE on Training set :", rmse_cv(lr, train_split, y).mean())
    y_train_pred = lr.predict(train_split)
    print('rmsle calculate by self:', rmsle(list(np.exp(y) - 1), list(np.exp(y_train_pred) - 1)))

    plt.scatter(y_train_pred, y_train_pred - y, c="blue", marker="s", label="Training data")
    plt.title("Linear regression")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_pred, y, c="blue", marker="s", label="Training data")
    plt.title("Linear regression")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    return lr
def main():
    diabetes = datasets.load_diabetes()
    diabetes_X = diabetes.data[:, np.newaxis, 2]

    diabetes_X_train = diabetes_X[:-20]
    diabetes_X_test = diabetes_X[-20:]
    diabetes_y_train = diabetes.target[:-20]
    diabetes_y_test = diabetes.target[-20:]

    regr = linear_model.LinearRegression()
    regr.fit(diabetes_X_train, diabetes_y_train)

    print('Coefficients: \n', regr.coef_)
    print("Mean squared error: %.2f"
          % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
    print('Variance score: %.2f' % regr.score(diabetes_X_test, diabetes_y_test))
def test_parameter_estimation_low_memory(self):
    X = np.random.uniform(0, 4, 1000)
    y = X + np.random.normal(0, 1, 1000)
    m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000, low_mem=True)
    m.fit(X.reshape(-1, 1), y)
    coef_samples = [b.coef_ for b in m.base_models_]
    intercept_samples = [b.intercept_ for b in m.base_models_]
    self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3)
    l, r = central_credible_interval(coef_samples, alpha=0.05)
    self.assertLess(l, 1)
    self.assertGreater(r, 1)
    l, r = highest_density_interval(coef_samples, alpha=0.05)
    self.assertLess(l, 1)
    self.assertGreater(r, 1)
    self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
    l, r = central_credible_interval(intercept_samples, alpha=0.05)
    self.assertLess(l, 0)
    self.assertGreater(r, 0)
    self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
    l, r = highest_density_interval(intercept_samples, alpha=0.05)
    self.assertLess(l, 0)
    self.assertGreater(r, 0)
def test_parameter_estimation(self):
    X = np.random.uniform(0, 4, 1000)
    y = X + np.random.normal(0, 1, 1000)
    m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000, low_mem=False)
    m.fit(X.reshape(-1, 1), y)
    coef_samples = [b.coef_ for b in m.base_models_]
    intercept_samples = [b.intercept_ for b in m.base_models_]
    self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3)
    l, r = central_credible_interval(coef_samples, alpha=0.05)
    self.assertLess(l, 1)
    self.assertGreater(r, 1)
    l, r = highest_density_interval(coef_samples, alpha=0.05)
    self.assertLess(l, 1)
    self.assertGreater(r, 1)
    self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
    l, r = central_credible_interval(intercept_samples, alpha=0.05)
    self.assertLess(l, 0)
    self.assertGreater(r, 0)
    self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
    l, r = highest_density_interval(intercept_samples, alpha=0.05)
    self.assertLess(l, 0)
    self.assertGreater(r, 0)
def train_regressor(options, embed_map, wordvecs, worddict):
    """
    Return regressor to map word2vec to RNN word space
    """
    # Gather all words from word2vec that appear in wordvecs
    d = defaultdict(lambda: 0)
    for w in embed_map.vocab.keys():
        d[w] = 1
    shared = OrderedDict()
    count = 0
    for w in worddict.keys()[:options['n_words'] - 2]:
        if d[w] > 0:
            shared[w] = count
            count += 1

    # Get the vectors for all words in 'shared'
    w2v = numpy.zeros((len(shared), 300), dtype='float32')
    sg = numpy.zeros((len(shared), options['dim_word']), dtype='float32')
    for w in shared.keys():
        w2v[shared[w]] = embed_map[w]
        sg[shared[w]] = wordvecs[w]

    clf = LinearRegression()
    clf.fit(w2v, sg)
    return clf
def test_stacking():
    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    ds = model.stack(10)
    assert ds.X_train.shape[0] == model.dataset.X_train.shape[0]
    assert ds.X_test.shape[0] == model.dataset.X_test.shape[0]
    assert ds.y_train.shape[0] == model.dataset.y_train.shape[0]

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    ds = model.stack(10, full_test=False)
    assert np.isnan(ds.X_train).sum() == 0
    assert ds.X_train.shape[0] == model.dataset.X_train.shape[0]
    assert ds.X_test.shape[0] == model.dataset.X_test.shape[0]
    assert ds.y_train.shape[0] == model.dataset.y_train.shape[0]

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    model.dataset.load()
    ds = model.stack(10, full_test=False)
    # Check cache
    assert np.isnan(ds.X_train).sum() == 0
    assert ds.X_train.shape[0] == model.dataset.X_train.shape[0]
    assert ds.X_test.shape[0] == model.dataset.X_test.shape[0]
    assert ds.y_train.shape[0] == model.dataset.y_train.shape[0]
def _get_trend(cls, log, starting_date):
    """Get commit count trend based on log.

    :param log: a log on which the trend should be computed
    :param starting_date: starting date of log
    :return: computed trend
    """
    records = [0]
    date = starting_date
    for entry in log:
        if entry['author']['date'] > date + cls._SECONDS_PER_DAY:
            date += cls._SECONDS_PER_DAY
            records.append(0)
        records[-1] += 1

    lr = LinearRegression()
    lr.fit(np.array(range(len(records))).reshape(-1, 1), np.array(records))
    return lr.coef_[0]
def linear_model_manual(prediction_value):
    data = pd.read_csv('E://Spyder/LinearRegression/data/data.csv')
    X_tem = []
    Y_tem = []
    for X_data, Y_data in zip(data['x'], data['y']):
        X_tem.append(int(X_data))
        Y_tem.append(float(Y_data))
    X_parameters = np.array(X_tem)
    Y_parameters = np.array(Y_tem)
    xy = X_parameters * Y_parameters
    xy_avg = xy.mean()
    x_avg = X_parameters.mean()
    y_avg = Y_parameters.mean()
    x_square = X_parameters * X_parameters
    x_square_avg = x_square.mean()
    predictions = {}
    # Method of least squares
    predictions['coefficient'] = (xy_avg - x_avg * y_avg) / (x_square_avg - x_avg * x_avg)
    predictions['intercept'] = y_avg - predictions['coefficient'] * x_avg
    # prediction_result
    predictions['predictions_result'] = predictions['intercept'] + predictions['coefficient'] * prediction_value
    return predictions
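A quick sanity check of the closed-form slope and intercept formulas used above against sklearn. The synthetic data is invented for illustration.

import numpy as np
from sklearn import linear_model

x = np.arange(10, dtype=float)
y = 3.0 * x + 2.0  # y = 3x + 2

# slope = (E[xy] - E[x]E[y]) / (E[x^2] - E[x]^2), intercept = E[y] - slope * E[x]
slope = (np.mean(x * y) - x.mean() * y.mean()) / (np.mean(x * x) - x.mean() ** 2)
intercept = y.mean() - slope * x.mean()

regr = linear_model.LinearRegression()
regr.fit(x.reshape(-1, 1), y)
assert np.isclose(slope, regr.coef_[0]) and np.isclose(intercept, regr.intercept_)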
def linear_model_multivariate():
    # coefficient = (X_trans*X)^-1 * X_trans * y
    data = pd.read_csv('E://Spyder/LinearRegression/data/data.csv')
    X_tem = []
    Y_tem = []
    linearModel = {}
    for X_data, Y_data in zip(data['x'], data['y']):
        X_tem.append(int(X_data))
        Y_tem.append(float(Y_data))
    X_parameters = np.ones((len(X_tem), 2))
    for i in range(len(X_tem)):
        X_parameters[i][0] = X_tem[i]
    Y_parameters = np.array(Y_tem)
    # Formula
    # coefficient = inv(X.T*X) * X.T * y
    coefficient = np.dot(np.dot(np.linalg.inv(np.dot(X_parameters.T, X_parameters)), X_parameters.T), Y_parameters)
    avg_X = X_parameters.mean(axis=0)
    intercept = Y_parameters.mean() + coefficient * avg_X[1]
    linearModel['coefficient'] = coefficient
    linearModel['intercept'] = intercept
    return linearModel
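A self-contained check of the normal equation used above against sklearn (synthetic data, invented for illustration). Note that with a column of ones in the design matrix, the coefficient vector already contains the intercept as its last element.

import numpy as np
from sklearn import linear_model

X = np.column_stack([np.arange(10, dtype=float), np.ones(10)])  # [x, 1] design matrix
y = 3.0 * X[:, 0] + 2.0                                         # y = 3x + 2

# coefficient = inv(X.T * X) * X.T * y  ->  [slope, intercept]
beta = np.dot(np.dot(np.linalg.inv(np.dot(X.T, X)), X.T), y)

# With the ones column included, fit without a separate intercept term
regr = linear_model.LinearRegression(fit_intercept=False)
regr.fit(X, y)
assert np.allclose(beta, regr.coef_)  # both give [3.0, 2.0]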
def get_loss():
    # Calculate the loss of the linear_model
    data = pd.read_csv('E://Spyder/LinearRegression/data/data.csv')
    X_tem = []
    Y_tem = []
    for X_data, Y_data in zip(data['x'], data['y']):
        X_tem.append([int(X_data)])
        Y_tem.append(float(Y_data))
    x_data = np.array(X_tem)
    y_data = np.array(Y_tem)
    regr = linear_model.LinearRegression()
    regr.fit(x_data, y_data)
    loss = np.sum((y_data - regr.predict(x_data)) ** 2)
    return loss

# Function to show the result of the linear fit model
def plot2dRegression(x, y, nameX, nameY, namePlot):
    model = LinearRegression()
    linearModel = model.fit(x, y)
    predictModel = linearModel.predict(x)
    plt.scatter(x, y, color='g')
    plt.plot(x, predictModel, color='k')
    plt.xlabel(nameX)
    plt.ylabel(nameY)
    test = stats.linregress(predictModel, y)
    print("The squared correlation coefficient R^2 is " + str(test.rvalue ** 2))
    plt.savefig("plot/loadings/" + namePlot, bbox_inches='tight')
    plt.show()
    return test.rvalue ** 2

# plot the 2D regression between the performance values and the loadings.
# return the correlation factor: R squared
def test_select_best(self):
    """
    Test the select best fit estimator
    """
    X, y = ANSCOMBE[1]
    X = np.array(X)
    y = np.array(y)
    X = X[:, np.newaxis]

    model = fit_select_best(X, y)
    self.assertIsNotNone(model)
    self.assertIsInstance(model, Pipeline)

    X, y = ANSCOMBE[3]
    X = np.array(X)
    y = np.array(y)
    X = X[:, np.newaxis]

    model = fit_select_best(X, y)
    self.assertIsNotNone(model)
    self.assertIsInstance(model, LinearRegression)
def test_estimator_instance(self):
    """
    Test that isestimator works for instances
    """
    models = (
        LinearRegression(),
        LogisticRegression(),
        KMeans(),
        LSHForest(),
        PCA(),
        RidgeCV(),
        LassoCV(),
        RandomForestClassifier(),
    )

    for model in models:
        self.assertTrue(isestimator(model))
def test_estimator_class(self):
    """
    Test that isestimator works for classes
    """
    models = (
        LinearRegression,
        LogisticRegression,
        KMeans,
        LSHForest,
        PCA,
        RidgeCV,
        LassoCV,
        RandomForestClassifier,
    )

    for model in models:
        self.assertTrue(inspect.isclass(model))
        self.assertTrue(isestimator(model))
def test_clusterer_enforcement(self):
    """
    Assert that only clustering estimators can be passed to cluster viz
    """
    nomodels = [
        SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
    ]

    for nomodel in nomodels:
        with self.assertRaises(YellowbrickTypeError):
            visualizer = ClusteringScoreVisualizer(nomodel())

    models = [
        KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
    ]

    for model in models:
        try:
            visualizer = ClusteringScoreVisualizer(model())
        except YellowbrickTypeError:
            self.fail("could not pass clustering estimator to visualizer")
def GetBeta(f, *args):
    FactorValue = f(*args)
    stock = args[0]
    date = args[1]
    # Get 20 business days' data
    tempprice = get_price(list(stock), date,
                          "{:%Y-%m-%d}".format(datetime.datetime.strptime(date, '%Y-%m-%d')
                                               + datetime.timedelta(days=30)),
                          frequency='1d', fields=None)['OpeningPx']
    tempreturn = np.log(tempprice.iloc[-1] / tempprice.iloc[0])
    #print('FV', FactorValue)
    FactorValue = pd.DataFrame(FactorValue)
    DataAll = pd.concat([FactorValue, tempreturn], axis=1)
    DataAll = DataAll.dropna()
    DataAll.columns = ['f', 'p']
    #print('fs', FactorValue.shape)
    #print('ts', tempreturn.shape)
    #print(DataAll)
    #print(DataAll.shape)
    #print(np.matrix(DataAll.ix[:, 0]).shape)
    #print(np.matrix(DataAll.ix[:, 1]).shape)
    regr = linear_model.LinearRegression()
    regr.fit(np.transpose(np.matrix(DataAll['f'])), np.transpose(np.matrix(DataAll['p'])))
    return regr.coef_
def GetResiduals(stock, enddate):
    Xinput = [EquityOCFP(stock, enddate),
              EquitySize(stock, enddate),
              RSIIndividual(stock, enddate),
              Min130Day(stock, enddate)]
    X = pd.concat(Xinput, axis=1)
    date = enddate
    tempprice = get_price(list(stock), date,
                          "{:%Y-%m-%d}".format(datetime.datetime.strptime(date, '%Y-%m-%d')
                                               + datetime.timedelta(days=30)),
                          frequency='1d', fields=None)['OpeningPx']
    y = np.log(tempprice.iloc[-1] / tempprice.iloc[0])
    DataAll = pd.concat([X, y], axis=1)
    DataAll = DataAll.dropna()
    regr = linear_model.LinearRegression()
    regr.fit(np.matrix(DataAll.ix[:, 0:4]), np.transpose(np.matrix(DataAll.ix[:, 4])))
    residuals = regr.predict(np.matrix(DataAll.ix[:, 0:4])) - np.transpose(np.matrix(DataAll.ix[:, 4]))
    residuals = pd.DataFrame(data=residuals, index=np.transpose(np.matrix(DataAll.index.values)))
    residuals.index = DataAll.index.values
    residuals.columns = [enddate]
    return residuals

# This function is used in the later function
def getDataSet(self, max_value_threshold=1000, train_length_threshold=30):
    try:
        return self.data_set
    except:
        self.__gen_data_set(max_value_threshold=max_value_threshold,
                            train_length_threshold=train_length_threshold)
        return self.data_set

# def __gen_model(self, model = LinearRegression()):
#     X_train, y_train, _ = self.getDataSet(10000, 60)
#     model.fit(X_train, y_train)
#     if self.ifPlotTrain:
#         y_pred = model.predict(X_train)
#         df = pd.DataFrame(np.hstack((y_train.reshape(-1,1), y_pred.reshape(-1,1))))
#         df.columns = ['Train', 'Predict']
#         df[:60].plot()
#         plt.title('train_all')
#         fig = plt.gcf()
#         fig.savefig('./img/train_all.png')
#         plt.close(fig)
#     self.model = model
def analysis():
    mysql_cn = pymysql.connect(host='10.25.0.119', port=3306, user='root',
                               passwd='111111', db='music')
    df = pd.read_sql('''
        SELECT COUNT(*) as plays, ds from user_actions
        JOIN songs on user_actions.song_id = songs.song_id
        WHERE ds >= '20150805' AND ds <= '20150830'
          AND action_type = '1'
          AND artist_id = 'c026b84e8f23a7741d9b670e3d8973f0'
        GROUP BY artist_id, ds ORDER BY ds
    '''.format(), mysql_cn)
    X = np.array([i for i in range(26)])
    df.columns = ['plays', 'ds']
    y = df['plays'].values
    print X, y
    model = LinearRegression()
    model.fit(X.reshape(X.shape[0], 1), y.reshape(y.shape[0]))
    x = np.array([i for i in range(26, 50)])
    Y = model.predict(x.reshape(x.shape[0], 1))
    df = pd.DataFrame(Y)
    print Y
    df.plot()
    plt.show()
    mysql_cn.close()
def test_pink_noise_slope():
    n_points = 10000
    fs = 500.0
    try:
        from sklearn.linear_model import LinearRegression
    except ImportError:
        return True

    # test the slope
    for slope in [1, 1.5, 2]:
        noise = pink_noise(n_points, slope=slope)
        spec = Spectrum(fs=fs)
        psd = spec.periodogram(noise).T
        freq = np.linspace(0, fs / 2., psd.size)[:, None]

        # linear regression fit in the log domain
        reg = LinearRegression()
        reg.fit(np.log10(freq[1:]), np.log10(psd[1:]))
        assert_almost_equal(reg.coef_[0][0], -slope, decimal=1)
def mlr_val(RM, yE, disp=True, graph=True, rate=2, more_train=True, center=None):
    """
    Validation is performed according to the given ratio.
    """
    RMt, yEt, RMv, yEv = jchem.get_valid_mode_data(RM, yE, rate=rate,
                                                   more_train=more_train, center=center)

    clf = linear_model.LinearRegression()
    clf.fit(RMt, yEt)

    print('Training result')
    mlr_show(clf, RMt, yEt, disp=disp, graph=graph)

    print('Validation result')
    r_sqr, RMSE = mlr_show(clf, RMv, yEv, disp=disp, graph=graph)

    return r_sqr, RMSE
def cv_train_test(xMa, yVa, tr, ts):
    """
    Regression and testing are performed for the given data with cross-validation streams.
    """
    xM = xMa[tr, :]
    yV = yVa[tr, 0]

    clf = linear_model.LinearRegression()
    clf.fit(xM, yV)

    # The testing information is extracted.
    xM_test = xMa[ts, :]
    yV_test = yVa[ts, 0]

    return yV_test.A1, clf.predict(xM_test).ravel()
def gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model = svm.SVC(), or linear_model.LinearRegression(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    #print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    gs = model_selection.GridSearchCV(model, param_grid, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, y)

    if graph:
        plt.plot(gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot(gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()

    return gs
def cv_pilot_only(self):
    """
    Cross-validation scores are evaluated using LOO.
    SNRpilot is equal to SNR, which is SNRdata.
    """
    yT_a = self.rx_p["yT_a"]
    x_a = self.rx_p["x_a"]

    lm = linear_model.LinearRegression()
    scores = codes.cross_val_score_loo(lm, yT_a, x_a)

    # Output is stored with environmental variables.
    pdi = pd.DataFrame()
    pdi["model"] = ["LinearRegression"]
    pdi["alpha"] = [0]
    pdi["metric"] = ["mean_squared_error"]
    pdi["E[scores]"] = [np.mean(scores)]
    pdi["std[scores]"] = [np.std(scores)]
    pdi["scores"] = [scores]

    return pdi
def cv_pilot_reg_only(self, alpha=0):
    model = self.model
    yT_a = self.rx_p["yT_a"]
    x_a = self.rx_p["x_a"]

    # kf = KFold()
    # loo = cross_validation.LeaveOneOut(x_a.shape[0])
    if alpha == 0:
        lm = linear_model.LinearRegression()
    else:
        lm = getattr(linear_model, model)(alpha)

    scores = codes.cross_val_score_loo(lm, yT_a, x_a)

    # Output is stored with environmental variables.
    pdi = pd.DataFrame()
    pdi["model"] = [model]
    pdi["alpha"] = [alpha]
    pdi["metric"] = ["mean_squared_error"]
    pdi["E[scores]"] = [np.mean(np.power(scores, 2))]  # MSE
    pdi["std[scores]"] = ["t.b.d."]
    pdi["scores"] = [scores]

    return pdi
def Beta(self):
    prixe = math.log(0.03637 / float(365) + 1)
    df1 = self.sharedf
    df1['change'] = df1['change'] - prixe
    df2 = ShareClass().GetDayData(code='000001', zs=True)
    print 11111111111
    coef = []
    intercept = []
    residues = []
    ret = pandas.merge(df1, df2, how='inner', on='date')
    array2 = []
    if len(ret) > 252:
        for z in range(0, 252):
            array2.append(math.pow(math.pow(float(1) / 2, float(1 / float(63))), (252 - z - 1)))
        for z in range(0, 251):
            coef.append(numpy.NaN)
            intercept.append(numpy.NaN)
            residues.append(numpy.NaN)
        for c in range(252, len(ret) + 1):
            array = []
            for x in ret[c - 252:c]['change_x']:
                array.append([x])
            clf = linear_model.LinearRegression()
            clf.fit(X=array, y=ret[c - 252:c]["change_y"], sample_weight=array2)
            coef.append(float(clf.coef_))
            residues.append(clf._residues)
            intercept.append(float(clf.intercept_))
        ret['beta'] = coef
        ret['alpha'] = intercept
        ret['residues'] = residues
    return ret[['date', 'beta', 'alpha', 'residues']]
def define_model(self):
    #if self.modeltype == "AR":
    #    return statsmodels.tsa.ar_model.AR(max_order=self.parameters['max_order'])
    if self.modeltype == "RandomForest":
        return ensemble.RandomForestRegressor(n_estimators=self.parameters['n_estimators'])
        #return ensemble.RandomForestClassifier(
        #    n_estimators=self.parameters['n_estimators'])
    elif self.modeltype == "LinearRegression":
        return linear_model.LinearRegression()
    elif self.modeltype == "Lasso":
        return linear_model.Lasso(
            alpha=self.parameters['alpha'])
    elif self.modeltype == "ElasticNet":
        return linear_model.ElasticNet(
            alpha=self.parameters['alpha'],
            l1_ratio=self.parameters['l1_ratio'])
    elif self.modeltype == "SVR":
        return SVR(
            C=self.parameters['C'],
            epsilon=self.parameters['epsilon'],
            kernel=self.parameters['kernel'])
    #elif self.modeltype == 'StaticModel':
    #    return StaticModel(
    #        parameters=self.parameters
    #    )
    #elif self.modeltype == 'AdvancedStaticModel':
    #    return AdvancedStaticModel(
    #        parameters=self.parameters
    #    )
    # elif self.modeltype == 'SGDRegressor':
    #     print(self.parameters)
    #     return linear_model.SGDRegressor(
    #         loss=self.parameters['loss'],
    #         penalty=self.parameters['penalty'],
    #         l1_ratio=self.parameters['l1_ratio'])
    else:
        raise ConfigError("Unsupported model {0}".format(self.modeltype))
def regressionDistance(vec1, vec2):
    regr = linear_model.LinearRegression()
    regr.fit(np.asarray(vec1).reshape(len(vec1), 1), np.asarray(vec2))
    return regr.coef_
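A hypothetical usage of regressionDistance above: the returned coef_ is the slope of the best-fit line mapping vec1 to vec2 (toy data invented for illustration).

vec1 = [1.0, 2.0, 3.0, 4.0]
vec2 = [2.0, 4.0, 6.0, 8.0]
print(regressionDistance(vec1, vec2))  # ~[2.0]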
def outofsample_extensions(method='linear-regression'):
    # Load the data and init seeds
    train_data, train_labels, test_data, test_labels = load_mnist()
    np.random.seed(1)
    sklearn.utils.check_random_state(1)
    n_train_samples = 5000

    # Learn a new space using Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
    train_data_isomap = np.float32(isomap.fit_transform(train_data[:n_train_samples, :]))

    if method == 'linear-regression':
        # Use linear regression to provide baseline out-of-sample extensions
        proj = LinearRegression()
        proj.fit(np.float64(train_data[:n_train_samples, :]), np.float64(train_data_isomap))
        acc = evaluate_svm(proj.predict(train_data[:n_train_samples, :]), train_labels[:n_train_samples],
                           proj.predict(test_data), test_labels)
    elif method == 'c-ISOMAP-10d' or method == 'c-ISOMAP-20d':
        # Use the SEF to provide out-of-sample extensions
        if method == 'c-ISOMAP-10d':
            proj = LinearSEF(train_data.shape[1], output_dimensionality=10)
            proj.cuda()
        else:
            proj = LinearSEF(train_data.shape[1], output_dimensionality=20)
            proj.cuda()
        loss = proj.fit(data=train_data[:n_train_samples, :], target_data=train_data_isomap,
                        target='copy', epochs=50, batch_size=128, verbose=True,
                        learning_rate=0.001, regularizer_weight=1)
        acc = evaluate_svm(proj.transform(train_data[:n_train_samples, :]), train_labels[:n_train_samples],
                           proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
def outofsample_extensions(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)
    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)

    # Learn a new space using Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
    train_data_isomap = np.float32(isomap.fit_transform(train_data))

    if method == 'linear-regression':
        from sklearn.preprocessing import StandardScaler
        std = StandardScaler()
        train_data = std.fit_transform(train_data)
        test_data = std.transform(test_data)

        # Use linear regression to provide baseline out-of-sample extensions
        proj = LinearRegression()
        proj.fit(np.float64(train_data), np.float64(train_data_isomap))
        acc = evaluate_svm(proj.predict(train_data), train_labels,
                           proj.predict(test_data), test_labels)
    elif method == 'c-ISOMAP-10d' or method == 'c-ISOMAP-20d':
        # Use the SEF to provide out-of-sample extensions
        if method == 'c-ISOMAP-10d':
            proj = LinearSEF(train_data.shape[1], output_dimensionality=10)
            proj.cuda()
        else:
            proj = LinearSEF(train_data.shape[1], output_dimensionality=20)
            proj.cuda()
        loss = proj.fit(data=train_data, target_data=train_data_isomap,
                        target='copy', epochs=50, batch_size=1024, verbose=False,
                        learning_rate=0.001, regularizer_weight=1)
        acc = evaluate_svm(proj.transform(train_data), train_labels,
                           proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")