The following 46 code examples, extracted from open-source Python projects, illustrate how to use sklearn.neighbors.KNeighborsRegressor().
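Before the extracted examples, here is a minimal, self-contained sketch of the basic API. The synthetic data and the parameter choices are illustrative assumptions for this sketch, not taken from any of the projects below:

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# Illustrative synthetic data (an assumption for this sketch).
rng = np.random.RandomState(0)
X = rng.rand(100, 2)                          # 100 samples, 2 features
y = np.sin(X[:, 0]) + 0.1 * rng.randn(100)    # noisy target

# weights='distance' averages neighbors with inverse-distance weights;
# weights='uniform' (the default) weights all k neighbors equally.
knn = KNeighborsRegressor(n_neighbors=5, weights='distance')
knn.fit(X, y)
y_pred = knn.predict(X[:10])                  # predictions for the first 10 rows
print(knn.score(X, y))                        # R^2 on the training data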
def test_regression():
    # Check regression for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [0.5, 1.0],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyRegressor(),
                           DecisionTreeRegressor(),
                           KNeighborsRegressor(),
                           SVR()]:
        for params in grid:
            BaggingRegressor(base_estimator=base_estimator,
                             random_state=rng,
                             **params).fit(X_train, y_train).predict(X_test)
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling': [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling': [SVR()],
           'No Preprocessing': [RandomForestRegressor(random_state=SEED),
                                GradientBoostingRegressor()]}

    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
def fit(self, X, y):
    '''
    Fit one model per feature: a group-mean lookup for categorical
    columns and a univariate KNN regressor for numeric columns.
    :param X: feature table; anything convertible to a DataFrame
    :param y: target; anything convertible to a Series, row-aligned with X
    :return: self
    '''
    X = pd.DataFrame(X.copy())
    X = X.reset_index(drop=True)
    y = pd.Series(y.copy())
    y = y.reset_index(drop=True)
    self.means = y.mean()
    self.models = {}
    for col in X.columns.tolist():
        if col in self.feature_cate:
            # categorical feature: store the mean target per category
            self.models[col] = y.groupby(X[col]).mean().to_dict()
        else:
            # numeric feature: fit a one-dimensional KNN regressor
            knn = KNeighborsRegressor(n_neighbors=self.n_neighbors)
            knn.fit(X[[col]], y)
            self.models[col] = copy.deepcopy(knn)
    return self
def model_cross_valid(X, Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)

    def bulid_model(model_name):
        model = model_name()
        return model

    scoring = 'neg_mean_squared_error'
    # + random fest boost lstm gbdt
    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = bulid_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(2, 100)}]

    reg = GridSearchCV(neighbors.KNeighborsRegressor(), tuned_parameters,
                       cv=5, scoring='mean_squared_error')
    reg.fit(self.X_train, self.y_train)

    print "Best parameters set found on development set:\n"
    print reg.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in reg.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print reg.scorer_

    print "MSE for test data set:"
    y_true, y_pred = self.y_test, reg.predict(self.X_test)
    print mean_squared_error(y_pred, y_true)
def test_kneighbors_regressor(n_samples=40, n_features=5, n_test_pts=10,
                              n_neighbors=3, random_state=0):
    # Test k-neighbors regression
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = np.sqrt((X ** 2).sum(1))
    y /= y.max()

    y_target = y[:n_test_pts]

    weight_func = _weight_func

    for algorithm in ALGORITHMS:
        for weights in ['uniform', 'distance', weight_func]:
            knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                weights=weights,
                                                algorithm=algorithm)
            knn.fit(X, y)
            epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1)
            y_pred = knn.predict(X[:n_test_pts] + epsilon)
            assert_true(np.all(abs(y_pred - y_target) < 0.3))
def test_KNeighborsRegressor_multioutput_uniform_weight():
    # Test k-neighbors in multi-output regression with uniform weight
    rng = check_random_state(0)
    n_features = 5
    n_samples = 40
    n_output = 4

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_output)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for algorithm, weights in product(ALGORITHMS, [None, 'uniform']):
        knn = neighbors.KNeighborsRegressor(weights=weights,
                                            algorithm=algorithm)
        knn.fit(X_train, y_train)

        neigh_idx = knn.kneighbors(X_test, return_distance=False)
        y_pred_idx = np.array([np.mean(y_train[idx], axis=0)
                               for idx in neigh_idx])

        y_pred = knn.predict(X_test)

        assert_equal(y_pred.shape, y_test.shape)
        assert_equal(y_pred_idx.shape, y_test.shape)
        assert_array_almost_equal(y_pred, y_pred_idx)
def test_kneighbors_regressor_sparse(n_samples=40, n_features=5,
                                     n_test_pts=10, n_neighbors=5,
                                     random_state=0):
    # Test k-neighbors regression on sparse matrices
    # Like the above, but with various types of sparse matrices
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = ((X ** 2).sum(axis=1) < .25).astype(np.int)

    for sparsemat in SPARSE_TYPES:
        knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                            algorithm='auto')
        knn.fit(sparsemat(X), y)
        for sparsev in SPARSE_OR_DENSE:
            X2 = sparsev(X)
            assert_true(np.mean(knn.predict(X2).round() == y) > 0.95)
def test_neighbors_iris():
    # Sanity checks on the iris dataset
    # Puts three points of each label in the plane and performs a
    # nearest neighbor query on points near the decision boundary.

    for algorithm in ALGORITHMS:
        clf = neighbors.KNeighborsClassifier(n_neighbors=1,
                                             algorithm=algorithm)
        clf.fit(iris.data, iris.target)
        assert_array_equal(clf.predict(iris.data), iris.target)

        clf.set_params(n_neighbors=9, algorithm=algorithm)
        clf.fit(iris.data, iris.target)
        assert_true(np.mean(clf.predict(iris.data) == iris.target) > 0.95)

        rgs = neighbors.KNeighborsRegressor(n_neighbors=5,
                                            algorithm=algorithm)
        rgs.fit(iris.data, iris.target)
        assert_greater(np.mean(rgs.predict(iris.data).round() == iris.target),
                       0.95)
def learn(self, experiences, max_iter=20):
    # experience is in (s, a, r, ns)
    states = experiences[:, 0:self.domain.state_space_dims]
    actions = experiences[:, self.domain.state_space_dims]
    rewards = experiences[:, self.domain.state_space_dims+1]
    next_states = experiences[:, self.domain.state_space_dims+2:]

    X = self.representation.phi_sa("root", states, actions)

    for i in range(0, max_iter):
        #old_qs = np.reshape(self.representation.Q("root", states, actions), (-1, 1))
        nqs = self.representation.Qs("root", next_states)
        best_nqs = np.reshape(np.amax(nqs, axis=1), (-1, 1))
        y = rewards + self.domain.discount_factor * best_nqs
        #resd = np.mean(np.abs(y - old_qs))
        model = KNeighborsRegressor(n_neighbors=2, n_jobs=-1)
        model.fit(X, y)
        self.representation.models["root"] = model
        #print "Residual is " + str(resd)
def model_fit_and_test(TrainX, TrainY, TestX, TestY):
    def bulid_model(model_name):
        model = model_name()
        return model

    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = bulid_model(model_name)
        model.fit(TrainX, TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        #print resid
        print("Residual sum of squares: %f" % np.mean(resid ** 2))
        #print model.predict(TestX)
        #print TestY
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid)
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        #plt.xlim([1, 50])
        plt.show()
        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1,X2,X3,X4)))
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breushpagan(resid, xs_with_constant)
        print("Test Heteroskedasticity", pvalue1)

        ljung_box = smd.acorr_ljungbox(resid, lags=10)
        #print "Lagrange Multiplier Statistics:", ljung_box[0]
        print("Test Autocorrelation P-values:", ljung_box[1])
        if any(ljung_box[1] < 0.05):
            print("The residuals are autocorrelated.")
        else:
            print("The residuals are not autocorrelated.")
def __init__(self, isTrain):
    super(RegressionKNN, self).__init__(isTrain)
    # data preprocessing
    #self.dataPreprocessing()

    # Create KNN regression object
    # first parameter is the K neighbors
    # 'uniform' assigns uniform weights to each neighbor
    # 'distance' assigns weights proportional to the inverse of the
    # distance from the query point
    # default metric is euclidean distance
    self.regr = neighbors.KNeighborsRegressor(86, weights='distance')
def __init__(self, conf):
    """smpKNN.__init__

    init
    """
    smpModel.__init__(self, conf)

    self.fwd = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    self.X_ = []
    self.y_ = []

    self.bootstrap()
def calculate(X, y):
    best_p, best_score = 0, -float('inf')
    kf = KFold(len(y), n_folds=5, shuffle=True, random_state=42)
    for p in numpy.linspace(1, 10, num=200):
        knr = KNeighborsRegressor(n_neighbors=5, weights='distance', p=p)
        score = max(cross_val_score(knr, X, y, cv=kf, scoring='mean_squared_error'))
        if score > best_score:
            best_score = score
            best_p = p
    return best_p, best_score
def knnPredictor(df):
    dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []

    for k in range(1, 200, 1):
        knnModel = KNeighborsRegressor(n_neighbors=k)
        knnModel.fit(dataTrainX, dataTrainY)
        knnpredicted = knnModel.predict(dataTestX)
        corelationCoefficient = pearsonr(dataTestY, knnpredicted)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])

    # plotter.plot(corelationCoefficiantArray)
    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)

    knnModelBest = KNeighborsRegressor(n_neighbors=bestK)
    knnModelBest.fit(dataTrainX, dataTrainY)
    print("K = ")
    print(bestK)
    print("Corelation Coeff:")
    print(corelationCoefficiantDictionary[bestK])

    knnpredictedBest = knnModelBest.predict(dataTestX)

    fig, ax = plotter.subplots()
    corelationCoefficient = pearsonr(dataTestY, knnpredictedBest)
    print(corelationCoefficient[0])
    ax.set_ylabel('Predicted KNN Weekly')
    ax.scatter(dataTestY, knnpredictedBest)
    ax.set_xlabel('Measured')
    plotter.show()
def predictKnn(data, priceToPredict):
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    openingPriceTrain, openingPriceTest, closingPriceTrain, closingPriceTest = \
        data["openingPriceTrain"], data["openingPriceTest"], \
        data["closingPriceTrain"], data["closingPriceTest"]

    for k in range(1, 100, 1):
        neigh = KNeighborsRegressor(n_neighbors=k)  # n = 7 best fits
        neigh.fit(openingPriceTrain, closingPriceTrain)

        closingPriceTestArray = np.reshape(closingPriceTest, -1)
        knnpr = neigh.predict(openingPriceTest)
        predictedArray = np.reshape(knnpr, -1)

        corelationCoefficient = pearsonr(closingPriceTestArray, predictedArray)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])

    plotter.plot(corelationCoefficiantArray)
    # plotter.show()

    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
    neighBest = KNeighborsRegressor(n_neighbors=bestK)
    neighBest.fit(openingPriceTrain, closingPriceTrain)
    openingPriceToPredict = np.array([priceToPredict])
    print("K = ")
    print(bestK)
    print(neighBest.predict(openingPriceToPredict))
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)

    est = [ElasticNet(copy_X=False),
           Lasso(copy_X=False)]

    ens.add(est)
    ens.add(KNeighborsRegressor())

    return ens
def knn():
    """Fit KNN."""
    print("Fitting KNN...", end=" ", flush=True)
    time.sleep(SLEEP)
    t0 = time.time()
    knn = KNeighborsRegressor()
    knn.fit(X, y)
    print_time(t0, "Done", end="")
def knn_regression(K, training_data, labels, test_data, weights='distance'):
    knn = neighbors.KNeighborsRegressor(K, weights=weights)
    output = knn.fit(training_data, labels).predict(test_data)
    return output
def generate_model(self, regressor, qty_neighbors, algorithm, distance_type):
    """Regressor Model Generation"""
    if regressor == "knn":
        return KNeighborsRegressor(n_neighbors=qty_neighbors,
                                   algorithm=algorithm, p=distance_type)
    elif regressor == "linear":
        return LinearRegression(fit_intercept=True)  # copy_X=True, n_jobs=1, normalize=False
    elif regressor == "logistic":
        return LogisticRegression(class_weight='balanced')
def spot_check(X, y, type='regression'):
    # 'type' added as a parameter here so the check below is defined
    # (it shadows the builtin of the same name)
    if type == 'regression':
        models = [
            (LinearRegression(), 'Ordinary Least Squares'),
            (Ridge(alpha=0.1), 'Ridge (alpha 0.1)'),
            (Ridge(), 'Ridge (alpha 1.0)'),
            (Lasso(alpha=0.1), 'Lasso (alpha 0.1)'),
            (Lasso(), 'Lasso (alpha 1.0)'),
            (ElasticNet(alpha=0.1), 'ElasticNet (alpha 0.1)'),
            (ElasticNet(), 'ElasticNet (alpha 1.0)'),
            (DecisionTreeRegressor(), 'Decision Tree'),
            (KNeighborsRegressor(), 'K-Nearest Neighbors'),
            # (RandomForestRegressor(), 'Random Forest Regressor'),
            # (BaggingRegressor(), 'Bagging Regressor'),
            # (GradientBoostingRegressor(), 'Gradient Boosted Regression'),
            # (SVR(), 'Support Vector Regression')
        ]

    splits = 5
    scores = []

    for model, model_name in models:
        score = check_model(model, splits, X, y)
        # get average score
        scores.append(score)

    model_names = map(lambda x: x[1], models)
    for name, score in zip(model_names, scores):
        print('%s: %f' % (name, score))
def get_classifier(self, X, Y):
    """
    Fit a KNN regressor on the training data.
    :param X: training features
    :param Y: training targets
    :return: the fitted model
    """
    clf = KNeighborsRegressor(weights='uniform')
    clf.fit(X, Y)
    return clf
def __init__(self, S, A, n_neighbors=5, weights='uniform', algorithm='auto',
             metric='minkowski', memory_fit=100, memory_size=100, **kwargs):
    #assert self.lr_mode == 'constant', 'KNNQ is only compatible with constant learning rates.'
    self.S = S
    self.A = A
    self.states = deque([])
    self.targets = deque([])
    self.memory_fit = memory_fit
    self.memory_size = memory_size
    self.count = 0
    self.neigh = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights,
                                     algorithm=algorithm, metric=metric)
    super(KNNQ, self).__init__(**kwargs)
    self.update_mode = 'set'
def calculatepRCA(data, y='', c='', p='', x=''):
    '''
    Returns the pRCA from data. pRCA is the probability that (RCA_{y+1} > 1)
    given the volume of exports (x_{cpy}) and the 'baseline term'
    (\sum_c x_{cpy} \sum_p x_{cpy} / \sum_c \sum_p x_{cpy}).
    It is computed using k-nearest neighbors in the space of log exports
    and log baseline term.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data. It has source, target, volume (trade, number of people, etc.).
    y, c, p, x : str (optional)
        Labels of the columns in data used for source, target, volume.

    Returns
    -------
    RCA : pandas.DataFrame
        Table with the RCAs, with the columns c, p, x, RCA.
        If shares is True it also includes:
        s_c : Share of X_cp over X_c
        s_p : Share of X_cp over X_p
    '''
    df = calculateRCA_by_year(data, y='year', c='ccode', p='pcode', x='x', log_terms=True)

    # Compute (RCA > 1) next year and merge it
    df_ = df.copy()
    df_['year'] = df_['year'] - 1
    df_['RCA_y+1'] = (df_['log(RCA)'] > 0).astype(int)
    df_ = df_[['year', 'ccode', 'pcode', 'RCA_y+1']]
    df = df.merge(df_)

    # Prepare dataset for knn and fit
    M = df[['log(x)', 'T', 'RCA_y+1']].as_matrix()
    X, y = M[:, :2], M[:, 2]
    knn = neighbors.KNeighborsRegressor(n_neighbors=200, weights='uniform').fit(X, y)

    # To avoid a memory error, compute predictions on splits of X.
    # The predictions are the output pRCA.
    pRCA = np.array([])
    for x in np.array_split(X, 10):
        pRCA = np.append(pRCA, knn.predict(x))

    df['pRCA'] = pRCA
    return df
def __init__(self, idim=1, odim=1):
    self.fwd = KNeighborsRegressor(n_neighbors=5)
    ActInfModel.__init__(self, idim, odim)

    self.X_ = []
    self.y_ = []

    self.bootstrap()
def build_model(self):
    return KNeighborsRegressor(**self.params)
def test_KNeighborsRegressor(*data):
    '''
    test the KNN regressor
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    regr = neighbors.KNeighborsRegressor()
    regr.fit(X_train, y_train)
    print("Training Score:{0}".format(regr.score(X_train, y_train)))
    print("Testing Score:{0}".format(regr.score(X_test, y_test)))
def test_KNeighborsRegressor_k_w(*data):
    '''
    test the performance with different n_neighbors and weights
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    Ks = np.linspace(1, y_train.size, num=100, endpoint=False, dtype='int')
    weights = ['uniform', 'distance']

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ### graph
    for weight in weights:
        training_scores = []
        testing_scores = []
        for K in Ks:
            regr = neighbors.KNeighborsRegressor(weights=weight, n_neighbors=K)
            regr.fit(X_train, y_train)
            testing_scores.append(regr.score(X_test, y_test))
            training_scores.append(regr.score(X_train, y_train))
        ax.plot(Ks, testing_scores, label="testing score:weight={0}".format(weight))
        ax.plot(Ks, training_scores, label="training score:weight={0}".format(weight))
    ax.legend(loc='best')
    ax.set_xlabel("K")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.05)
    ax.set_title("KNeighborsRegressor")
    plt.show()
def test_KNeighborsRegressor_k_p(*data):
    '''
    test the performance with different n_neighbors and p
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    Ks = np.linspace(1, y_train.size, endpoint=False, dtype='int')
    Ps = [1, 2, 10]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ### graph
    for P in Ps:
        training_scores = []
        testing_scores = []
        for K in Ks:
            regr = neighbors.KNeighborsRegressor(p=P, n_neighbors=K)
            regr.fit(X_train, y_train)
            testing_scores.append(regr.score(X_test, y_test))
            training_scores.append(regr.score(X_train, y_train))
        ax.plot(Ks, testing_scores, label="testing score:p={0}".format(P))
        ax.plot(Ks, training_scores, label="training score:p={0}".format(P))
    ax.legend(loc='best')
    ax.set_xlabel("K")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.05)
    ax.set_title("KNeighborsRegressor")
    plt.show()
def knn(train_sample, validation_sample, features, seed):
    log_base = np.e
    knn_est = KNeighborsRegressor(n_neighbors=1, weights='distance',
                                  algorithm='auto', leaf_size=30, p=1).fit(
        train_sample[features],
        np.log1p(train_sample['volume']) / np.log(log_base))
    knn_prob = np.power(log_base, knn_est.predict(validation_sample[features])) - 1
    print_mape(validation_sample['volume'], knn_prob, 'KNN')
    return knn_prob
def main():
    df_train0 = pd.read_csv("train.csv")
    df_train1 = pd.read_csv("train1.csv")
    df_train2 = pd.read_csv("train2.csv")
    df_train3 = pd.read_csv("train3.csv")

    df_train_list = [df_train0]
    df_train = pd.concat(df_train_list)
    len_train = len(df_train)
    df_test = pd.read_csv("test2.csv")
    df_train = df_train.append(df_test)[df_train.columns.tolist()]
    df_date = pd.read_csv("date.csv")
    df_ts = pd.read_csv("ts_feature2_simple.csv")
    print df_test.head()

    df_train = df_train.merge(df_date, on="date", how="left")
    df_train = df_train.merge(df_ts, on=["tollgate_id", "hour", "miniute", "direction"], how="left")

    data = pd.DataFrame.reset_index(df_train)
    data = data.drop("index", axis=1)
    print data.head(1)

    data = feature_transform_knn(key=1, data=data)

    y = data.ix[:len_train - 1]["volume"]
    x = data.ix[:len_train - 1, 8:]
    x1 = data.ix[len_train:, 8:]

    regressor_cubic = KNeighborsRegressor(n_neighbors=15)
    regressor_cubic.fit(x, y)
    yhat = regressor_cubic.predict(x1)

    df_test["volume"] = yhat
    df_test = df_test[['tollgate_id', 'time_window', 'direction', 'volume']]
    df_test.to_csv("result/result_knn_" + str(np.mean(yhat)) + ".csv", index=False)
    print np.mean(yhat)
def test_precomputed_cross_validation():
    # Ensure array is split correctly
    rng = np.random.RandomState(0)
    X = rng.rand(20, 2)
    D = pairwise_distances(X, metric='euclidean')
    y = rng.randint(3, size=20)
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        metric_score = cross_val_score(Est(), X, y)
        precomp_score = cross_val_score(Est(metric='precomputed'), D, y)
        assert_array_equal(metric_score, precomp_score)
def test_neighbors_regressors_zero_distance():
    # Test radius-based regressor, when distance to a sample is zero.
    X = np.array([[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.5, 2.5]])
    y = np.array([1.0, 1.5, 2.0, 0.0])
    radius = 0.2
    z = np.array([[1.1, 1.1], [2.0, 2.0]])

    rnn_correct_labels = np.array([1.25, 2.0])

    knn_correct_unif = np.array([1.25, 1.0])
    knn_correct_dist = np.array([1.25, 2.0])

    for algorithm in ALGORITHMS:
        # we don't test for weights=_weight_func since user will be expected
        # to handle zero distances themselves in the function.
        for weights in ['uniform', 'distance']:
            rnn = neighbors.RadiusNeighborsRegressor(radius=radius,
                                                     weights=weights,
                                                     algorithm=algorithm)
            rnn.fit(X, y)
            assert_array_almost_equal(rnn_correct_labels, rnn.predict(z))

        for weights, corr_labels in zip(['uniform', 'distance'],
                                        [knn_correct_unif, knn_correct_dist]):
            knn = neighbors.KNeighborsRegressor(n_neighbors=2,
                                                weights=weights,
                                                algorithm=algorithm)
            knn.fit(X, y)
            assert_array_almost_equal(corr_labels, knn.predict(z))
def test_predict_sparse_ball_kd_tree():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)
    y = rng.randint(0, 2, 5)
    nbrs1 = neighbors.KNeighborsClassifier(1, algorithm='kd_tree')
    nbrs2 = neighbors.KNeighborsRegressor(1, algorithm='ball_tree')
    for model in [nbrs1, nbrs2]:
        model.fit(X, y)
        assert_raises(ValueError, model.predict, csr_matrix(X))
def get_model_list():
    model_list, name_list = [], []
    # model_list.append(linear_model.LinearRegression())
    # name_list.append('LR')
    #
    # model_list.append(gaussian_process.GaussianProcessRegressor(alpha=1e-10))
    # name_list.append('GaussianProcess')
    #
    # model_list.append(KNeighborsRegressor(weights='uniform', n_neighbors=28))
    # name_list.append('KNN_unif')
    #
    # model_list.append(KNeighborsRegressor(weights='distance', n_neighbors=28))
    # name_list.append('KNN_dist')
    #
    # model_list.append(SVR(kernel='poly', C=1, gamma='auto', coef0=0, degree=2))
    # name_list.append('SVR_poly')
    #
    model_list.append(SVR(kernel='rbf', C=0.3, gamma='auto'))
    name_list.append('SVR_rbf')
    #
    # model_list.append(DecisionTreeRegressor())
    # name_list.append('DT')
    #
    # model_list.append(RandomForestRegressor(n_estimators=150, max_depth=None, min_samples_split=2, random_state=0))
    # name_list.append('RF')
    #
    # model_list.append(ExtraTreesRegressor(n_estimators=150, max_depth=None, max_features='auto', min_samples_split=2, random_state=0))
    # name_list.append('ET')
    return model_list, name_list

#MAPE
def get_model_list():
    model_list, name_list = [], []
    # model_list.append(linear_model.LinearRegression())
    # name_list.append('LR')
    #
    # model_list.append(gaussian_process.GaussianProcessRegressor(alpha=1e-10))
    # name_list.append('GaussianProcess')
    #
    # model_list.append(KNeighborsRegressor(weights='uniform', n_neighbors=28))
    # name_list.append('KNN_unif')
    #
    # model_list.append(KNeighborsRegressor(weights='distance', n_neighbors=28))
    # name_list.append('KNN_dist')
    #
    # model_list.append(SVR(kernel='poly', C=1, gamma='auto', coef0=0, degree=2))
    # name_list.append('SVR_poly')
    #
    model_list.append(SVR(kernel='rbf', C=0.3, gamma='auto'))
    name_list.append('SVR_rbf')
    #
    # model_list.append(DecisionTreeRegressor())
    # name_list.append('DT')
    #
    # model_list.append(RandomForestRegressor(n_estimators=150, max_depth=None, min_samples_split=2, random_state=0))
    # name_list.append('RF')
    #
    # model_list.append(ExtraTreesRegressor(n_estimators=150, max_depth=None, max_features='auto', min_samples_split=2, random_state=0))
    # name_list.append('ET')
    return model_list, name_list
def fit(self, X, y=None, **kwargs):
    X = check_array(X, dtype=np.float64, force_all_finite=False)

    X_nan = np.isnan(X)
    most_by_nan = X_nan.sum(axis=0).argsort()[::-1]

    imputed = self.initial_imputer.fit_transform(X)
    new_imputed = imputed.copy()

    self.statistics_ = np.ma.getdata(X)
    self.gamma_ = []

    if self.f_model == "RandomForest":
        self.estimators_ = [RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=i, **kwargs)
                            for i in range(X.shape[1])]
    elif self.f_model == "KNN":
        self.estimators_ = [KNeighborsRegressor(n_neighbors=min(5, sum(~X_nan[:, i])), **kwargs)
                            for i in range(X.shape[1])]
    elif self.f_model == "PCA":
        self.estimators_ = [PCA(n_components=int(np.sqrt(min(X.shape))), whiten=True, **kwargs)]

    for iter in range(self.max_iter):
        if len(self.estimators_) > 1:
            for i in most_by_nan:
                X_s = np.delete(new_imputed, i, 1)
                y_nan = X_nan[:, i]

                X_train = X_s[~y_nan]
                y_train = new_imputed[~y_nan, i]
                X_unk = X_s[y_nan]

                estimator_ = self.estimators_[i]
                estimator_.fit(X_train, y_train)
                if len(X_unk) > 0:
                    new_imputed[y_nan, i] = estimator_.predict(X_unk)
        else:
            estimator_ = self.estimators_[0]
            estimator_.fit(new_imputed)
            new_imputed[X_nan] = estimator_.inverse_transform(estimator_.transform(new_imputed))[X_nan]

        gamma = ((new_imputed - imputed) ** 2 / (1e-6 + new_imputed.var(axis=0))).sum() / (1e-6 + X_nan.sum())
        self.gamma_.append(gamma)
        if np.abs(np.diff(self.gamma_[-2:])) < self.tol:
            break

    return self
def train_validate(self, df, validation_range, update_progress):
    """
    Train and validate regressor on df samples with indices listed in
    validation_range.
    """
    training_summary = pd.DataFrame()
    first_sample, samples, labels = prepare_samples(df, self.indicators_samples)

    # progress bar parameters
    total_steps = len(self.model_params['sample_presentation']) * \
        len(self.model_params['exp_weight']) * len(self.model_params['k'])
    completed_steps = 0

    # loop over model parameters
    for sample_presentation in self.model_params['sample_presentation']:
        presented_samples, presented_labels, normalizer = set_presentation(
            samples, labels, sample_presentation, self.indicators_samples['Daily'])

        for exp_weight in self.model_params['exp_weight']:
            weighted_samples = apply_exp_weights(presented_samples, exp_weight)

            for k in self.model_params['k']:
                model, total_train_time, total_test_time = [[0 for i in range(len(h))] for j in range(3)]
                error_list, relative_error_list, hit_list = [[[] for i in range(len(h))] for j in range(3)]
                params = (sample_presentation, exp_weight, k)

                # model training and validation core
                for h_index in range(len(h)):
                    for index in validation_range:
                        i = index - first_sample
                        # need to stop training h steps before test
                        x_train, x_validate = weighted_samples[:i-h[h_index]+1, :], weighted_samples[i, :]
                        y_train, y_validate = presented_labels[h_index][:i-h[h_index]+1], presented_labels[h_index][i]

                        # train
                        t1 = time.time()
                        model[h_index] = KNeighborsRegressor(n_neighbors=k)  # train a separate model for each horizon
                        model[h_index].fit(x_train, y_train)
                        t2 = time.time()
                        train_time = (t2 - t1)

                        # test
                        y_predict = model[h_index].predict(x_validate.reshape(1, -1))
                        test_time = (time.time() - t2)

                        # append new results
                        y_validate_absolute = remove_presentation(y_validate, normalizer[i], sample_presentation)
                        y_predict_absolute = remove_presentation(y_predict, normalizer[i], sample_presentation)
                        error_list[h_index] += [y_validate_absolute - y_predict_absolute]
                        relative_error_list[h_index] += [(y_validate_absolute - y_predict_absolute) / y_validate_absolute]
                        hit_list[h_index] += [(y_validate - x_validate[-1]) * (y_predict - x_validate[-1]) > 0]

                        total_train_time[h_index] += train_time
                        total_test_time[h_index] += test_time

                        if i == len(presented_labels[h_index]) - 1:
                            # very last training point, include last training opportunity
                            x_train = weighted_samples[:i+1, :]
                            y_train = presented_labels[h_index][:i+1]
                            model[h_index].fit(x_train, y_train)
                            break

                completed_steps += 1
                update_progress(100.0 * completed_steps / total_steps)

                # save last trained model, and add to training summary
                training_summary = training_summary.append(
                    summarize(self, model, error_list, relative_error_list, hit_list,
                              params, total_train_time, total_test_time))

    return training_summary, make_presentable(training_summary, self.summary_name)
def __init__(self, isTrain):
    super(RegressionUniformBlending, self).__init__(isTrain)
    # data preprocessing
    #self.dataPreprocessing()

    self.net1 = NeuralNet(
        layers=[  # three layers: one hidden layer
            ('input', layers.InputLayer),
            ('hidden', layers.DenseLayer),
            #('hidden2', layers.DenseLayer),
            #('hidden3', layers.DenseLayer),
            ('output', layers.DenseLayer),
        ],
        # layer parameters:
        input_shape=(None, 13),  # input dimension is 13
        hidden_num_units=6,  # number of units in hidden layer
        #hidden2_num_units=8,  # number of units in hidden layer
        #hidden3_num_units=4,  # number of units in hidden layer
        output_nonlinearity=None,  # output layer uses the identity function
        output_num_units=1,  # output dimension is 1

        # objective function
        objective_loss_function=lasagne.objectives.squared_error,

        # optimization method:
        update=lasagne.updates.nesterov_momentum,
        update_learning_rate=0.002,
        update_momentum=0.4,

        # use 20% as validation
        train_split=TrainSplit(eval_size=0.2),

        regression=True,  # flag to indicate we're dealing with regression problem
        max_epochs=100,  # we want to train this many epochs
        verbose=0,
    )

    # Create linear regression object
    self.linRegr = linear_model.LinearRegression()

    # Create KNN regression object
    self.knn = neighbors.KNeighborsRegressor(86, weights='distance')

    # Create Decision Tree regression object
    self.decisionTree = DecisionTreeRegressor(max_depth=7, max_features=None)

    # Create AdaBoost regression object
    decisionReg = DecisionTreeRegressor(max_depth=10)
    rng = np.random.RandomState(1)
    self.adaReg = AdaBoostRegressor(decisionReg,
                                    n_estimators=400,
                                    random_state=rng)

    # Create random forest regression object
    self.model = RandomForestRegressor(max_features='sqrt', n_estimators=32, max_depth=39)
def __init__(self, S, A, maxlen=1000, mode=None, embedding_dim=1, **kwargs):
    super(TableQ2, self).__init__(**kwargs)
    self.S = S
    self.A = A
    if mode == None:
        # infer the storage mode from the space types
        if type(S) == type(A) == gym.spaces.Discrete:
            self.mode = 'array'
        elif type(A) == gym.spaces.Discrete:
            self.mode = 'dictionary'
        else:
            pass
    else:
        self.mode = mode
    self.maxlen = maxlen
    self.embedding_dim = embedding_dim

    if self.mode == 'array':
        s_dim = get_space_dim(S)
        a_dim = get_space_dim(A)
        self.table = np.zeros((s_dim, a_dim))
        self.maxlen = s_dim
    elif self.mode == 'dictionary':
        self.table = {0: np.zeros(self.A.n)}
    elif self.mode == 'tables':
        self.k = 4
        self.neigh = KNeighborsRegressor(n_neighbors=self.k)
        self.states = np.zeros((self.maxlen, self.embedding_dim))
        self.values = np.zeros((self.maxlen, self.A.n))
        self.recency = np.zeros((self.maxlen,))
        self.i = 0
    elif self.mode == 'action_tables':
        #self.states = []
        #self.recency= []
        self.k = 4
        self.action_tables = [[[], [], KNeighborsRegressor(n_neighbors=self.k), []]
                              for _ in xrange(self.A.n)]
        """
        for at in self.action_tables:
            states, values, neigh, recency = at
            for _ in xrange(self.k):
                if self.embedding_dim > 1:
                    states.append(np.ones(self.embedding_dim))
                else:
                    states.append(1)
                values.append(0)
                recency.append(0)
            #print states, values
            #neigh.fit(np.array(states), np.array(values))
            s = self._list_to_sklearn(states)
            v = self._list_to_sklearn(values)
            #print s, v
            neigh.fit(s, v)
        """
    else:
        raise NotImplementedError, 'Sorry, TableQ only supports three modes.'
def plot(k=1, xyzFile='xyz_synth_surf.txt', write=False):
    with open(xyzFile) as f:
        xyz = np.float64([row.split() for row in f.readlines()])

    #~ plt.figure()
    #~ plt.scatter(xyz[:, 0], xyz[:, 1], c=xyz[:, 2])
    #~ plt.plot(xyz[:3, 0], xyz[:3, 1], c='k', marker='s', ms=10)
    #~ plt.plot(xyz[:50, 0], xyz[:50, 1], xyz[:50, 2], c='k', marker='s', ms=3)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xyz[:, 0], xyz[:, 1], xyz[:, 2], c=xyz[:, 2], marker='o', linewidths=0)
    ax.plot(xyz[:50, 0], xyz[:50, 1], xyz[:50, 2], c='k', marker='s', ms=3)
    #~ ax.scatter(xyz[:50, 0], xyz[:50, 1], xyz[:50, 2], c='k', marker='s', linewidths=0, cmap=plt.cm.bone)

    xmin = np.min(xyz[:, 0])
    xmax = np.max(xyz[:, 0])
    step = (xmax - xmin) / 100.
    x_ = np.arange(np.min(xyz[:, 0]), np.max(xyz[:, 0]), step)
    y_ = np.arange(np.min(xyz[:, 0]), np.max(xyz[:, 0]), step)
    xx, yy = np.meshgrid(x_, y_)
    xy = np.append(xx.ravel()[:, np.newaxis], yy.ravel()[:, np.newaxis], 1)

    knn = neighbors.KNeighborsRegressor(k, weights='distance', p=1)
    z_ = knn.fit(xyz[:, :2], xyz[:, 2]).predict(xy)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(xx, yy, z_.reshape(np.shape(xx)), rstride=1, cstride=1,
                    cmap=plt.cm.spectral, linewidth=0, antialiased=False)
    ax.plot(xyz[:50, 0], xyz[:50, 1], xyz[:50, 2], c='k', marker='s', ms=3)

    if write:
        with open('knn_' + xyzFile, 'w') as f:
            for xi, yi, zi in zip(xx.ravel(), yy.ravel(), z_):
                f.write('%f %f %f\n' % (xi, yi, zi))

#################################################################
# calculate a distance matrix based on variation of information
def main():
    df_test = pd.read_csv("test2.csv")
    df_train0 = pd.read_csv("train.csv")
    df_train_list = [df_train0, ]
    random.shuffle(df_train_list)
    df_train = pd.concat(df_train_list)

    df_ts = pd.read_csv("ts_feature2_simple.csv")
    df_date = pd.read_csv("date.csv")

    df_train = df_train.merge(df_date, on="date", how="left")
    df_train = df_train.merge(df_ts, on=["tollgate_id", "hour", "miniute", "direction"], how="left")
    df_test = df_test.merge(df_date, on="date", how="left")
    df_test = df_test.merge(df_ts, on=["tollgate_id", "hour", "miniute", "direction"], how="left")

    df_train_grouped = df_train.groupby(["tollgate_id", "direction"])
    df_test_grouped = df_test.groupby(["tollgate_id", "direction"])

    result = []
    oob = []
    for key, train_data in df_train_grouped:
        test_data = df_test_grouped.get_group(key)
        len_train = len(train_data)
        train_data = train_data.append(test_data)[train_data.columns.tolist()]
        train_data = feature_transform_knn(key, train_data)

        regressor_cubic = KNeighborsRegressor(n_neighbors=8, algorithm="auto")

        train_data = pd.DataFrame.reset_index(train_data)
        train_data = train_data.drop("index", axis=1)
        y = train_data.ix[:len_train - 1, :]["volume"]
        x = train_data.ix[:len_train - 1, 8:]
        print x.head()
        x1 = train_data.ix[len_train:, 8:]

        regressor_cubic.fit(x, y)
        yhat = regressor_cubic.predict(x1)
        test_data["volume"] = yhat
        result.append(test_data[['tollgate_id', 'time_window', 'direction', 'volume']])

    df_result = pd.concat(result, axis=0)
    print np.mean(df_result["volume"])
    df_result.to_csv("result/result_split_knn" + str(np.mean(df_result["volume"])) + ".csv", index=False)
    print np.mean(oob)
def test_precomputed(random_state=42):
    """Tests unsupervised NearestNeighbors with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    for method in ['kneighbors']:
        # TODO: also test radius_neighbors, but requires different assertion

        # As a feature matrix (n_samples by n_features)
        nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
        nbrs_X.fit(X)
        dist_X, ind_X = getattr(nbrs_X, method)(Y)

        # As a dense distance matrix (n_samples by n_samples)
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check auto works too
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check X=None in prediction
        dist_X, ind_X = getattr(nbrs_X, method)(None)
        dist_D, ind_D = getattr(nbrs_D, method)(None)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Must raise a ValueError if the matrix is not of correct shape
        assert_raises(ValueError, getattr(nbrs_D, method), X)

    target = np.arange(X.shape[0])
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        print(Est)
        est = Est(metric='euclidean')
        est.radius = est.n_neighbors = 1
        pred_X = est.fit(X, target).predict(Y)
        est.metric = 'precomputed'
        pred_D = est.fit(DXX, target).predict(DYX)
        assert_array_almost_equal(pred_X, pred_D)
def test_neighbors_badargs():
    # Test bad argument values: these should all raise ValueErrors
    assert_raises(ValueError, neighbors.NearestNeighbors, algorithm='blah')

    X = rng.random_sample((10, 2))
    Xsparse = csr_matrix(X)
    y = np.ones(10)

    for cls in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        assert_raises(ValueError, cls, weights='blah')
        assert_raises(ValueError, cls, p=-1)
        assert_raises(ValueError, cls, algorithm='blah')

        nbrs = cls(algorithm='ball_tree', metric='haversine')
        assert_raises(ValueError, nbrs.predict, X)
        assert_raises(ValueError, ignore_warnings(nbrs.fit), Xsparse, y)

        nbrs = cls()
        assert_raises(ValueError, nbrs.fit, np.ones((0, 2)), np.ones(0))
        assert_raises(ValueError, nbrs.fit, X[:, :, None], y)
        nbrs.fit(X, y)
        assert_raises(ValueError, nbrs.predict, [[]])

        if (issubclass(cls, neighbors.KNeighborsClassifier) or
                issubclass(cls, neighbors.KNeighborsRegressor)):
            nbrs = cls(n_neighbors=-1)
            assert_raises(ValueError, nbrs.fit, X, y)

    nbrs = neighbors.NearestNeighbors().fit(X)

    assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah')
    assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah')
def get_model_list(task_name):
    model_list, name_list = [], []
    model_list.append(linear_model.LinearRegression())
    name_list.append('LR')
    #
    model_list.append(linear_model.SGDRegressor())
    name_list.append('LR_SGD')
    model_list.append(linear_model.Lasso(alpha=1.0))
    name_list.append('Lasso')
    model_list.append(linear_model.Ridge(alpha=1.0))
    name_list.append('Ridge')
    model_list.append(linear_model.LassoLars(alpha=.1))
    name_list.append('LassoLars')
    model_list.append(linear_model.BayesianRidge())
    name_list.append('BayesianRidge')
    model_list.append(KernelRidge(alpha=1.0))
    name_list.append('KernelRidge')
    model_list.append(gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1))
    name_list.append('GaussianProcess')
    model_list.append(KNeighborsRegressor(weights='uniform', n_neighbors=3))
    name_list.append('KNN_unif')
    model_list.append(KNeighborsRegressor(weights='distance', n_neighbors=3))
    name_list.append('KNN_dist')
    model_list.append(SVR(kernel='linear', C=1, gamma='auto', coef0=0, degree=2))
    name_list.append('SVM_linear')
    model_list.append(SVR(kernel='poly', C=1, gamma='auto', coef0=0, degree=2))
    name_list.append('SVM_poly')
    model_list.append(SVR(kernel='rbf', C=1, gamma='auto', coef0=0, degree=2))
    name_list.append('SVM_rbf')
    model_list.append(DecisionTreeRegressor())
    name_list.append('DT')
    model_list.append(RandomForestRegressor(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0))
    name_list.append('RF')
    model_list.append(ExtraTreesRegressor(n_estimators=100, max_depth=None, max_features='auto', min_samples_split=2, random_state=0))
    name_list.append('ET')
    return model_list, name_list