The following 22 code examples, extracted from open-source Python projects, illustrate how to use sklearn.neighbors.KernelDensity().
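Before the project examples, a minimal sketch of the basic fit / score_samples / sample workflow may help orient the reader. The synthetic data and the bandwidth value below are illustrative only and are not taken from any of the projects:

import numpy as np
from sklearn.neighbors import KernelDensity

# Illustrative synthetic data: 200 one-dimensional points from two clusters.
rng = np.random.RandomState(0)
X = np.concatenate([rng.normal(-2, 0.5, 100),
                    rng.normal(3, 1.0, 100)])[:, np.newaxis]

# Fit a Gaussian KDE; the bandwidth here is an arbitrary example value.
kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)

# score_samples() returns the natural log of the estimated density,
# so exponentiate to recover the density itself.
grid = np.linspace(-5, 7, 50)[:, np.newaxis]
density = np.exp(kde.score_samples(grid))

# sample() draws new points from the fitted density
# (supported for the 'gaussian' and 'tophat' kernels).
new_points = kde.sample(5, random_state=0)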
def bic_gaussian_kernel(chain_fname, data_length):
    """
    Bayesian information criterion.
    Only valid if data_length >> n_params.
    Note that the bandwidth of the kernel is set to 1 universally.
    """
    chain = np.load(chain_fname + '.npy')
    n_params = np.shape(chain)[-1]
    samples = chain.reshape((-1, n_params))
    kde = KernelDensity(kernel='gaussian', bandwidth=1).fit(samples)
    # Best fit = medians of the distribution
    medians = np.median(samples, axis=0)  # shape = (n_params,)
    medians = medians.reshape(1, -1)      # Reshape to (1, n_params)
    # score_samples() returns the natural log of the estimated density,
    # so it can be used directly as ln(L) in the BIC formula.
    lnL = float(kde.score_samples(medians))
    return -2 * lnL + n_params * np.log(data_length)

###############################################################################
# Process chain
###############################################################################
def kde_sklearn(data, grid, **kwargs):
    """
    Kernel Density Estimation with Scikit-learn

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It has
        `n x p` dimensions, representing n points and p variables.
    grid : numpy.array
        Data points at which the density will be estimated. It has
        `m x p` dimensions, representing m points and p variables.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions.
    """
    kde_skl = KernelDensity(**kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(grid)
    return np.exp(log_pdf)
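As a usage illustration (not part of the original project), kde_sklearn could be called on one-dimensional data like this, assuming numpy is imported as np; the data, grid, and bandwidth are made-up values:

# Hypothetical call: estimate a 1-D density on a grid of 100 points.
data = np.random.RandomState(1).normal(size=(500, 1))   # 500 points, 1 variable
grid = np.linspace(-4, 4, 100)[:, np.newaxis]            # 100 evaluation points
density = kde_sklearn(data, grid, kernel='gaussian', bandwidth=0.3)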
def calculate_densities(target_feature_id, feature_id):
    feature = Feature.objects.get(pk=feature_id)
    target_feature = Feature.objects.get(pk=target_feature_id)
    df = _get_dataframe(feature.dataset.id)
    target_col = df[target_feature.name]
    categories = target_feature.categories

    def calc_density(category):
        kde = KernelDensity(kernel='gaussian', bandwidth=0.75)
        X = df[target_col == category][feature.name]
        # Fitting requires expanding dimensions
        X = np.expand_dims(X, axis=1)
        kde.fit(X)
        # We'd like to sample 100 values
        X_plot = np.linspace(feature.min, feature.max, 100)
        # We need the last dimension again
        X_plot = np.expand_dims(X_plot, axis=1)
        log_dens = kde.score_samples(X_plot)
        return np.exp(log_dens).tolist()

    return [{'target_class': category,
             'density_values': calc_density(category)}
            for category in categories]
def kde_opt1(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = (1 + df["hour"]) * 3.92105
        df_new["weekday"] = (1 + df["weekday"]) * 4.28947
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x)) * 9.44736
        df_new["x"] = df["x"] * 424.489
        df_new["y"] = df["y"] * 959.183
        return df_new

    logging.info("train kde_opt1 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    Xte = df_cell_test_feats_kde.values
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i].values
        cstd = np.std(np.sum(np.abs(X), axis=1))
        gridcv = GridSearchCV(KernelDensity(kernel='gaussian', metric='manhattan'),
                              {'bandwidth': cstd * np.logspace(-1, 1, 10)}, cv=5)
        gridcv.fit(X)
        y_test_pred[:, i] += np.exp(gridcv.best_estimator_.score_samples(Xte))
    return y_test_pred
def getKernelDensityEstimation(nodes, metric='euclidean', metric_params=None, bbox=None,
                               bandwidth=0.002, optimizeBandwidth=False, bwmin=0.0001,
                               bwmax=0.01, crossValidation=20):
    lon = []
    lat = []
    for nlon, nlat in nodes:
        lon.append(nlon)
        lat.append(nlat)
    lon = np.array(lon)
    lat = np.array(lat)
    if bbox is None:
        xmin, xmax = min(lon), max(lon)
        ymin, ymax = min(lat), max(lat)
        bbox = [xmin, xmax, ymin, ymax]
    else:
        xmin, ymin, xmax, ymax = bbox[0], bbox[1], bbox[2], bbox[3]
        bbox = [xmin, xmax, ymin, ymax]
    x, y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([x.ravel(), y.ravel()])
    values = np.vstack([lon, lat])
    if optimizeBandwidth:
        grid = GridSearchCV(KernelDensity(kernel='gaussian', metric=metric,
                                          metric_params=metric_params, algorithm='ball_tree'),
                            {'bandwidth': np.linspace(bwmin, bwmax, 30)},
                            cv=crossValidation)  # cross-validation (20-fold by default)
        # list() is required under Python 3, where zip() returns an iterator
        grid.fit(list(zip(*values)))
        bandwidth = grid.best_params_['bandwidth']
        kernel = grid.best_estimator_
    else:
        kernel = KernelDensity(kernel='gaussian', metric=metric, metric_params=metric_params,
                               algorithm='ball_tree', bandwidth=bandwidth)
        kernel.fit(list(zip(*values)))
    return kernel, positions, x, y, bbox, bandwidth
def __init__(self, hyperparameter, param_name, data, oob_strategy='resample', bandwith=0.4):
    if oob_strategy not in ['resample', 'round', 'ignore']:
        raise ValueError()
    self.oob_strategy = oob_strategy
    self.param_name = param_name
    self.hyperparameter = hyperparameter
    reshaped = np.reshape(data, (len(data), 1))
    if self.hyperparameter.log:
        if isinstance(self.hyperparameter, UniformIntegerHyperparameter):
            # self.probabilities = {val: self.distrib.pdf(np.log2(val))
            #                       for val in range(self.hyperparameter.lower, self.hyperparameter.upper)}
            raise ValueError('Log Integer hyperparameter not supported: %s' % param_name)
        # self.distrib = gaussian_kde(np.log2(data))
        # self.distrib = KernelDensity(kernel='gaussian').fit(np.log2(np.reshape(data, (len(data), 1))))
        self.distrib = KernelDensity(kernel='gaussian', bandwidth=bandwith).fit(np.log2(reshaped))
    else:
        # self.distrib = gaussian_kde(data)
        self.distrib = KernelDensity(kernel='gaussian', bandwidth=bandwith).fit(reshaped)
def test_kernel_density(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_features)

    for kernel in ['gaussian', 'tophat', 'epanechnikov',
                   'exponential', 'linear', 'cosine']:
        for bandwidth in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)

            def check_results(kernel, bandwidth, atol, rtol):
                kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                                    atol=atol, rtol=rtol)
                log_dens = kde.fit(X).score_samples(Y)
                assert_allclose(np.exp(log_dens), dens_true,
                                atol=atol, rtol=max(1E-7, rtol))
                assert_allclose(np.exp(kde.score(Y)),
                                np.prod(dens_true),
                                atol=atol, rtol=max(1E-7, rtol))

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        yield (check_results, kernel, bandwidth, atol, rtol)
def test_kde_algorithm_metric_choice():
    # Smoke test for various metrics and algorithms
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)    # 2 features required for haversine dist.
    Y = rng.randn(10, 2)

    for algorithm in ['auto', 'ball_tree', 'kd_tree']:
        for metric in ['euclidean', 'minkowski', 'manhattan',
                       'chebyshev', 'haversine']:
            if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:
                assert_raises(ValueError, KernelDensity,
                              algorithm=algorithm, metric=metric)
            else:
                kde = KernelDensity(algorithm=algorithm, metric=metric)
                kde.fit(X)
                y_dens = kde.score_samples(Y)
                assert_equal(y_dens.shape, Y.shape[:1])
def __init__(self, masks):
    n_class = 10
    self.maps_with_class = [[], [], [], [], [], [], [], [], [], []]
    self.kde_samplers = []
    self.class_probs = np.ones(n_class) / n_class
    # self.class_probs = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5])
    self.mask_size = None
    ts = time.time()
    for mask_i, mask in enumerate(masks):
        assert mask.shape[2] == n_class
        if not self.mask_size:
            self.mask_size = mask.shape[1]
        samplers = []
        for class_i in range(n_class):
            X = np.nonzero(mask[:, :, class_i])
            X = np.stack(X, axis=1)
            # np.random.shuffle(X)
            # X = X[:50000]
            if not X.size:
                samplers.append(None)
            else:
                self.maps_with_class[class_i].append(mask_i)
                sampler = neighbors.KernelDensity(self.mask_size * 0.02).fit(X)
                samplers.append(sampler)
        assert len(samplers) == n_class
        self.kde_samplers.append(samplers)
    print('sampler init time: {}'.format(time.time() - ts))
def display(self, output_filename):
    fig, (self.ax) = plt.subplots(1, 1)
    self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
    for i in range(len(self.datasets)):
        self.displayDataset(self.datasets[i])
    if self.title is not None:
        self.ax.set_xlabel(self.title)
    self.ax.set_ylabel('Density')
    self.ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
                   ncol=3, mode='expand', borderaxespad=0.)
    fig.savefig(output_filename)
    plt.close(fig)
def test_gridsearch_no_predict():
    # test grid-search with an estimator without predict.
    # slight duplication of a test from KDE
    def custom_scoring(estimator, X):
        return 42 if estimator.bandwidth == .1 else 0
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    search = dcv.GridSearchCV(KernelDensity(),
                              param_grid=dict(bandwidth=[.01, .1, 1]),
                              scoring=custom_scoring)
    search.fit(X)
    assert search.best_params_['bandwidth'] == .1
    assert search.best_score_ == 42
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new

    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan',
                                    bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(
                    df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
def get_bw_cv(x, params):
    grid = GridSearchCV(KernelDensity(), params, cv=2, n_jobs=1)
    grid.fit(x[:, None])
    bw = grid.best_estimator_.bandwidth
    return bw
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples.

    Returns
    -------
    self : detector
        Return self.
    """
    X = check_array(X)
    self._kde = KernelDensity(
        bandwidth=self.bandwidth,
        kernel=self.kernel,
        metric=self.metric,
        metric_params=self.metric_params
    ).fit(X)
    self.y_score_ = self.anomaly_score(X)
    self.threshold_ = np.percentile(
        self.y_score_, 100.0 * (1.0 - self.fpr)
    )
    return self
def fit_kde(X, bandwidth):
    kde = KernelDensity(bandwidth=bandwidth)
    kde.fit(X=X)
    return kde
def search_bandwidth(val_data, cvJobs):
    data = convert_to_ndarrays(val_data)
    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params, n_jobs=cvJobs)
    grid.fit(data)
    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
    return grid.best_estimator_.bandwidth
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
def test_kde_badargs():
    assert_raises(ValueError, KernelDensity, algorithm='blah')
    assert_raises(ValueError, KernelDensity, bandwidth=0)
    assert_raises(ValueError, KernelDensity, kernel='blah')
    assert_raises(ValueError, KernelDensity, metric='blah')
    assert_raises(ValueError, KernelDensity, algorithm='kd_tree', metric='blah')
def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params, cv=5)
    search.fit(X)
    assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)
def test_gridsearch_no_predict():
    # test grid-search with an estimator without predict.
    # slight duplication of a test from KDE
    def custom_scoring(estimator, X):
        return 42 if estimator.bandwidth == .1 else 0
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    search = GridSearchCV(KernelDensity(),
                          param_grid=dict(bandwidth=[.01, .1, 1]),
                          scoring=custom_scoring)
    search.fit(X)
    assert_equal(search.best_params_['bandwidth'], .1)
    assert_equal(search.best_score_, 42)
def find_consensus_values(x,
                          n_expected=None,
                          bandwidth_auto=True,
                          min_bandwidth=0.02,     # XXX This is GC-MS specific
                          default_bandwidth=0.1,  # XXX This too
                          show_plot=False,
                          out=sys.stdout,
                          err=sys.stderr):
    """
    Use kernel density estimation to analyze the distribution of data points
    along the X-axis, and identify consensus values for major clusters.  This
    is used to identify common peaks in a set of related GC-MS samples.
    """
    if isinstance(x, list):
        x = np.array(x)
    x_grid = np.linspace(x.min() - 0.25, x.max() + 0.25, 1000)
    if bandwidth_auto:
        # http://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
        grid = GridSearchCV(KernelDensity(),
                            {'bandwidth': np.linspace(min_bandwidth, 0.1, 30)},
                            cv=20)  # 20-fold cross-validation
        grid.fit(x[:, None])
        bandwidth = grid.best_params_['bandwidth']
        print("Best bandwidth: %.4f" % bandwidth, file=err)
        kde = grid.best_estimator_
        pdf = np.exp(kde.score_samples(x_grid[:, None]))
    else:
        bandwidth = default_bandwidth
        kde = gaussian_kde(x)
        pdf = kde.evaluate(x_grid)
    i_maxima = local_maxima(x_grid, pdf)
    max_values = []
    for i_max in i_maxima:
        max_values.append((x_grid[i_max], pdf[i_max]))
    # sort maxima by value in distribution
    max_values.sort(key=lambda x: x[1], reverse=True)
    pdf_max = pdf.max()
    major_peaks, minor_peaks = extract_peaks(max_values, pdf_max,
                                             n_expected=n_expected,
                                             out=out, err=err)
    # now sort major peaks by retention time
    major_peaks.sort(key=lambda x: x[0])
    print("Major retention time peaks:", file=err)
    for i_peak, (xval, pdf_val) in enumerate(major_peaks):
        print("  %2d  %8.3f" % (i_peak + 1, xval), file=err)
    if show_plot:
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        ax.plot(x_grid, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
        ax.hist(x, 50, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
        for rt, pdf_val in major_peaks:
            ax.axvline(rt, color='red')
            ax.axvline(rt - bandwidth, color='magenta')
            ax.axvline(rt + bandwidth, color='magenta')
        plt.show()
    return [xval for xval, yval in major_peaks], float(bandwidth)