Python numpy module: percentile() example source code
The following 50 code examples, extracted from open-source Python projects, illustrate how to use numpy.percentile().
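Before the project excerpts, a minimal self-contained sketch of the basic call they all build on (the input values here are made up for demonstration):

import numpy as np

data = np.array([1, 3, 5, 7, 9, 11])
print(np.percentile(data, 50))            # median -> 6.0
print(np.percentile(data, [25, 50, 75]))  # quartiles -> [3.5, 6.0, 8.5]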
def remove_outliers(idx_ts):
"""Remove outliers from a given timestamps array.
Parameters
----------
idx_ts : numpy.ndarray
given timestamp array.
Returns
-------
idx_ts_new : numpy.ndarray.
outliers removed timestamp array.
"""
idx_ts_new = idx_ts.copy()
summary = np.percentile(idx_ts_new, [25, 50, 75])
# Tukey fences: Q1 - 1.5*IQR and Q3 + 1.5*IQR (the original bound names were swapped)
iqr = summary[2] - summary[0]
low_lim = summary[0] - 1.5 * iqr
high_lim = summary[2] + 1.5 * iqr
idx_ts_new = idx_ts_new[(idx_ts_new >= low_lim) & (idx_ts_new <= high_lim)]
return idx_ts_new
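A hypothetical usage sketch for the function above, on made-up timestamps (assumes numpy is imported as np, as in the source module):

import numpy as np

ts = np.array([10.0, 10.5, 11.0, 11.5, 12.0, 99.0])
print(remove_outliers(ts))  # the 99.0 outlier is dropped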
def _classify_gems(counts0, counts1):
""" Infer number of distinct transcriptomes present in each GEM (1 or 2) and
report cr_constants.GEM_CLASS_GENOME0 for a single cell w/ transcriptome 0,
report cr_constants.GEM_CLASS_GENOME1 for a single cell w/ transcriptome 1,
report cr_constants.GEM_CLASS_MULTIPLET for multiple transcriptomes """
# Assumes that most of the GEMs are single-cell; model counts independently
thresh0, thresh1 = [cr_constants.DEFAULT_MULTIPLET_THRESHOLD] * 2
if sum(counts0 > counts1) >= 1 and sum(counts1 > counts0) >= 1:
thresh0 = np.percentile(counts0[counts0 > counts1], cr_constants.MULTIPLET_PROB_THRESHOLD)
thresh1 = np.percentile(counts1[counts1 > counts0], cr_constants.MULTIPLET_PROB_THRESHOLD)
doublet = np.logical_and(counts0 >= thresh0, counts1 >= thresh1)
dtype = np.dtype('|S%d' % max(len(cls) for cls in cr_constants.GEM_CLASSES))
result = np.where(doublet, cr_constants.GEM_CLASS_MULTIPLET, cr_constants.GEM_CLASS_GENOME0).astype(dtype)
result[np.logical_and(np.logical_not(result == cr_constants.GEM_CLASS_MULTIPLET), counts1 > counts0)] = cr_constants.GEM_CLASS_GENOME1
return result
def get_normalized_dispersion(mat_mean, mat_var, nbins=20):
mat_disp = (mat_var - mat_mean) / np.square(mat_mean)
quantiles = np.percentile(mat_mean, np.arange(0, 100, 100 / nbins))
quantiles = np.append(quantiles, mat_mean.max())
# merge bins with no difference in value
quantiles = np.unique(quantiles)
if len(quantiles) <= 1:
# pathological case: the means are all identical. just return raw dispersion.
return mat_disp
# calc median dispersion per bin
(disp_meds, _, disp_bins) = scipy.stats.binned_statistic(mat_mean, mat_disp, statistic='median', bins=quantiles)
# calc median absolute deviation of dispersion per bin
disp_meds_arr = disp_meds[disp_bins-1] # 0th bin is empty since our quantiles start from 0
disp_abs_dev = abs(mat_disp - disp_meds_arr)
(disp_mads, _, disp_bins) = scipy.stats.binned_statistic(mat_mean, disp_abs_dev, statistic='median', bins=quantiles)
# calculate normalized dispersion
disp_mads_arr = disp_mads[disp_bins-1]
disp_norm = (mat_disp - disp_meds_arr) / disp_mads_arr
return disp_norm
def prctile(data, p_vals=[0, 25, 50, 75, 100], sorted_=False):
"""``prctile(data, 50)`` returns the median, but p_vals can
also be a sequence.
For small samples or extreme values this arguably gives better results than
matplotlib.mlab.prctile or np.percentile, but it is also slower.
"""
ps = [p_vals] if np.isscalar(p_vals) else p_vals
if not sorted_:
data = sorted(data)
n = len(data)
d = []
for p in ps:
fi = p * n / 100 - 0.5
if fi <= 0: # maybe extrapolate?
d.append(data[0])
elif fi >= n - 1:
d.append(data[-1])
else:
i = int(fi)
d.append((i + 1 - fi) * data[i] + (fi - i) * data[i + 1])
return d[0] if np.isscalar(p_vals) else d
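A brief usage sketch comparing prctile with np.percentile on a tiny, made-up sample:

import numpy as np

sample = [3.0, 1.0, 4.0, 1.0, 5.0]
print(prctile(sample, 50))                  # -> 3.0
print(prctile(sample, [25, 50, 75]))        # -> [1.0, 3.0, 4.25]
print(np.percentile(sample, [25, 50, 75]))  # NumPy's interpolation, for comparison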
def fit_predict(self, ts):
"""
Unsupervised training of TSBitMaps.
:param ts: 1-D numpy array or pandas.Series
:return labels: `+1` for normal observations and `-1` for abnormal observations
"""
assert self._lag_window_size > self._feature_window_size, 'lag_window_size must be greater than feature_window_size'
self._ref_ts = ts
scores = self._slide_chunks(ts)
self._ref_bitmap_scores = scores
thres = np.percentile(scores[self._lag_window_size: -self._lead_window_size + 1], self._q)
labels = np.full(len(ts), 1)
for idx, score in enumerate(scores):
if score > thres:
labels[idx] = -1
return labels
def fit(self, X, y=None):
old_threshold = None
threshold = None
self.threshold_ = 0.0
self._fit(X,y)
count = 0
while count < 100 and (old_threshold is None or abs(threshold - old_threshold) > 0.01):
old_threshold = threshold
ss = self.decision_function(X,y)
threshold = percentile(ss, 100 * self.contamination)
self._fit(X[ss > threshold],y[ss > threshold] if y is not None else None)
count += 1
self.threshold_ = threshold
return self
def _update_quantile(self):
states = np.array(self._quantile_states, dtype=np.float32)
limited_action_values = self.action_values(states, self._limited_action)
base_action_values = np.max(
np.array(
[
self.action_values(states, action)
for action in six.moves.range(self.num_actions)
if action != self._limited_action
]
),
axis=0
)
target = np.percentile(
limited_action_values - base_action_values, self._quantile
)
print("REWARD PENALTY TARGET:", target)
self.quantile_value += self._quantile_update_rate * target
print("QUANTILE:", self.quantile_value)
def b_value(mags, mt, perc=[2.5, 97.5], n_reps=None):
"""Compute the b-value and optionally its confidence interval."""
# Extract magnitudes above completeness threshold
m = mags[mags >= mt]
# Compute b-value
b = (np.mean(m) - mt) * np.log(10)
# Draw bootstrap replicates
if n_reps is None:
return b
else:
m_bs_reps = dcst.draw_bs_reps(m, np.mean, size=n_reps)
# Compute b-value from replicates
b_bs_reps = (m_bs_reps - mt) * np.log(10)
# Compute confidence interval
conf_int = np.percentile(b_bs_reps, perc)
return b, conf_int
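A small self-contained usage sketch with synthetic magnitudes; the bootstrap branch needs the external dcst helper, so it is skipped here by leaving n_reps=None:

import numpy as np

mags = 3.0 + np.random.exponential(scale=1.0 / np.log(10), size=1000)
print(b_value(mags, mt=3.0))  # b-value estimate, close to 1 for this synthetic sample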
def simple_slope_percentiles(res, df, target, varying, percs=[25, 50, 75]):
exog = {}
for param in res.fe_params.index:
if len(param.split(":")) != 1:
continue
if param == "Intercept":
exog[param] = 1.0
else:
exog[param] = np.median(df[param])
ret_vals = collections.OrderedDict()
for varying_perc in percs:
exog[varying] = np.percentile(df[varying], varying_perc)
ret_vals[exog[varying]] = collections.defaultdict(list)
for target_perc in [25, 75]:
exog[target] = np.percentile(df[target], target_perc)
exog_arr = np.array([exog[param] if len(param.split(":")) == 1 else exog[param.split(":")[0]] * exog[param.split(":")[1]]
for param in res.fe_params.index])
ret_vals[exog[varying]]["endog"].append(res.model.predict(res.fe_params, exog=exog_arr))
ret_vals[exog[varying]]["target"].append(exog[target])
return ret_vals
def simple_slope_categories(res, df, target, cat, cats):
exog = {}
for param in res.fe_params.index:
if len(param.split(":")) != 1:
continue
if param == "Intercept":
exog[param] = 1.0
elif param in cats:
exog[param] = 0
else:
exog[param] = np.mean(df[param])
if cat is not None:
exog[cat] = 1
x_points = []
y_points = []
for target_perc in [10, 90]:
exog[target] = np.percentile(df[target], target_perc)
# exog[target] = target_perc
exog_arr = np.array([exog[param] if len(param.split(":")) == 1 else exog[param.split(":")[0]] * exog[param.split(":")[1]]
for param in res.fe_params.index])
y_points.append(res.model.predict(res.fe_params, exog=exog_arr))
x_points.append(exog[target])
return x_points, y_points
def test_value_at_risk(self):
value_at_risk = self.empyrical.value_at_risk
returns = [1.0, 2.0]
assert_almost_equal(value_at_risk(returns, cutoff=0.0), 1.0)
assert_almost_equal(value_at_risk(returns, cutoff=0.3), 1.3)
assert_almost_equal(value_at_risk(returns, cutoff=1.0), 2.0)
returns = [1, 81, 82, 83, 84, 85]
assert_almost_equal(value_at_risk(returns, cutoff=0.1), 41)
assert_almost_equal(value_at_risk(returns, cutoff=0.2), 81)
assert_almost_equal(value_at_risk(returns, cutoff=0.3), 81.5)
# Test a returns stream of 21 data points at different cutoffs.
returns = np.random.normal(0, 0.02, 21)
for cutoff in (0, 0.0499, 0.05, 0.20, 0.999, 1):
assert_almost_equal(
value_at_risk(returns, cutoff),
np.percentile(returns, cutoff * 100),
)
def value_at_risk(returns, cutoff=0.05):
"""
Value at risk (VaR) of a returns stream.
Parameters
----------
returns : pandas.Series or 1-D numpy.array
Non-cumulative daily returns.
cutoff : float, optional
Decimal representing the percentage cutoff for the bottom percentile of
returns. Defaults to 0.05.
Returns
-------
VaR : float
The VaR value.
"""
return np.percentile(returns, 100 * cutoff)
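A short usage sketch on simulated daily returns (the numbers are illustrative only):

import numpy as np

returns = np.random.normal(0.0, 0.02, 252)
print(value_at_risk(returns))               # default 5% cutoff
print(value_at_risk(returns, cutoff=0.01))  # a more conservative 1% cutoff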
def get_packets_per_second(trace, features):
"""
Computes the number of packets in each second and appends the mean, standard deviation, median, min and max as features
@return a 1D list that contains the packets per second
"""
packets_per_second = {}
for val in trace:
second = floor(val[0])
if second not in packets_per_second:
packets_per_second[second] = 0
packets_per_second[second] += 1
l = list(packets_per_second.values())
features.append(sum(l) / len(l))
features.append(np.std(l))
features.append(np.percentile(l, 50))
features.append(min(l))
features.append(max(l))
return l
def get_inter_arrival_time(trace, in_trace, out_trace, features):
"""
For the complete trace, the trace containing only incoming packets and the trace containing only outgoing packets,
we calculate the inter-arrival time.
Next, we add the max, mean, standard deviation and third quartile as features.
@param in_trace contains all the incoming packets and nothing else
@param out_trace contains all the outgoing packets and nothing else
"""
complete_inter_arrival_time = inter_packet_time(trace)
in_inter_arrival_time = inter_packet_time(in_trace)
out_inter_arrival_time = inter_packet_time(out_trace)
inter_arrival_times = [complete_inter_arrival_time, in_inter_arrival_time, out_inter_arrival_time]
for time in inter_arrival_times:
if len(time) == 0:
features.extend([0, 0, 0, 0])
else:
features.append(max(time))
features.append(sum(time) / len(time))
features.append(np.std(time))
features.append(np.percentile(time, 75))
def get_transmission_time_stats(trace, in_trace, out_trace, features):
"""
For the complete trace and the traces containing only incoming and only outgoing packets, we extract the following:
- First, second and third quartile
- Total transmission time
"""
in_times = [x[0] for x in in_trace]
out_times = [x[0] for x in out_trace]
total_times = [x[0] for x in trace]
times = [total_times, in_times, out_times]
for time in times:
if len(time) == 0:
features.extend([0, 0, 0, 0])
else:
features.append(np.percentile(time, 25))
features.append(np.percentile(time, 50))
features.append(np.percentile(time, 75))
features.append(np.percentile(time, 100))
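A hypothetical usage sketch (assumes numpy is imported as np in the module, as elsewhere in these excerpts); the traces are assumed to be lists of tuples whose first element is a timestamp, which is how the function indexes them, and the direction flag in the second element is only used here to split the made-up trace:

features = []
trace = [(0.0, 1), (0.4, -1), (1.1, 1), (2.5, -1)]
in_trace = [p for p in trace if p[1] == 1]
out_trace = [p for p in trace if p[1] == -1]
get_transmission_time_stats(trace, in_trace, out_trace, features)
print(features)  # quartiles and max of the timestamps for each of the three traces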
def deltasCSVWriter(self, name='ant'):
"Changes"
header = array([h.name[1:] for h in self.test.headers[:-2]])
oldRows = [r for r, p in zip(self.test._rows, self.pred) if p > 0]
delta = array([self.delta(t) for t in oldRows])
y = median(delta, axis=0)
yhi, ylo = percentile(delta, q=[75, 25], axis=0)
dat1 = sorted(
[(h, a, b, c) for h, a, b, c in zip(header, y, ylo, yhi)], key=lambda F: F[1])
dat = asarray([(d[0], n, d[1], d[2], d[3])
for d, n in zip(dat1, range(1, 21))])
with open('/Users/rkrsn/git/GNU-Plots/rkrsn/errorbar/%s.csv' % (name), 'w') as csvfile:
writer = csv.writer(csvfile, delimiter=' ')
for el in dat[()]:
writer.writerow(el)
# new = [self.newRow(t) for t in oldRows]
def _nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
interpolation='linear', keepdims=False):
"""
Private function that doesn't support extended axis or keepdims.
These methods are extended to this function using _ureduce
See nanpercentile for parameter usage
"""
if axis is None:
part = a.ravel()
result = _nanpercentile1d(part, q, overwrite_input, interpolation)
else:
result = np.apply_along_axis(_nanpercentile1d, axis, a, q,
overwrite_input, interpolation)
# apply_along_axis fills in collapsed axis with results.
# Move that axis to the beginning to match percentile's
# convention.
if q.ndim != 0:
result = np.rollaxis(result, axis)
if out is not None:
out[...] = result
return result
def test_out(self):
mat = np.random.rand(3, 3)
nan_mat = np.insert(mat, [0, 2], np.nan, axis=1)
resout = np.zeros(3)
tgt = np.percentile(mat, 42, axis=1)
res = np.nanpercentile(nan_mat, 42, axis=1, out=resout)
assert_almost_equal(res, resout)
assert_almost_equal(res, tgt)
# 0-d output:
resout = np.zeros(())
tgt = np.percentile(mat, 42, axis=None)
res = np.nanpercentile(nan_mat, 42, axis=None, out=resout)
assert_almost_equal(res, resout)
assert_almost_equal(res, tgt)
res = np.nanpercentile(nan_mat, 42, axis=(0, 1), out=resout)
assert_almost_equal(res, resout)
assert_almost_equal(res, tgt)
def test_multiple_percentiles(self):
perc = [50, 100]
mat = np.ones((4, 3))
nan_mat = np.nan * mat
# For checking consistency in higher dimensional case
large_mat = np.ones((3, 4, 5))
large_mat[:, 0:2:4, :] = 0
large_mat[:, :, 3:] *= 2
for axis in [None, 0, 1]:
for keepdim in [False, True]:
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always')
val = np.percentile(mat, perc, axis=axis, keepdims=keepdim)
nan_val = np.nanpercentile(nan_mat, perc, axis=axis,
keepdims=keepdim)
assert_equal(nan_val.shape, val.shape)
val = np.percentile(large_mat, perc, axis=axis,
keepdims=keepdim)
nan_val = np.nanpercentile(large_mat, perc, axis=axis,
keepdims=keepdim)
assert_equal(nan_val, val)
megamat = np.ones((3, 4, 5, 6))
assert_equal(np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6))
def test_keepdims(self):
d = np.ones((3, 5, 7, 11))
assert_equal(np.percentile(d, 7, axis=None, keepdims=True).shape,
(1, 1, 1, 1))
assert_equal(np.percentile(d, 7, axis=(0, 1), keepdims=True).shape,
(1, 1, 7, 11))
assert_equal(np.percentile(d, 7, axis=(0, 3), keepdims=True).shape,
(1, 5, 7, 1))
assert_equal(np.percentile(d, 7, axis=(1,), keepdims=True).shape,
(3, 1, 7, 11))
assert_equal(np.percentile(d, 7, (0, 1, 2, 3), keepdims=True).shape,
(1, 1, 1, 1))
assert_equal(np.percentile(d, 7, axis=(0, 1, 3), keepdims=True).shape,
(1, 1, 7, 1))
assert_equal(np.percentile(d, [1, 7], axis=(0, 1, 3),
keepdims=True).shape, (2, 1, 1, 7, 1))
assert_equal(np.percentile(d, [1, 7], axis=(0, 3),
keepdims=True).shape, (2, 1, 5, 7, 1))
def test_out_nan(self):
with warnings.catch_warnings(record=True):
warnings.filterwarnings('always', '', RuntimeWarning)
o = np.zeros((4,))
d = np.ones((3, 4))
d[2, 1] = np.nan
assert_equal(np.percentile(d, 0, 0, out=o), o)
assert_equal(
np.percentile(d, 0, 0, interpolation='nearest', out=o), o)
o = np.zeros((3,))
assert_equal(np.percentile(d, 1, 1, out=o), o)
assert_equal(
np.percentile(d, 1, 1, interpolation='nearest', out=o), o)
o = np.zeros(())
assert_equal(np.percentile(d, 1, out=o), o)
assert_equal(
np.percentile(d, 1, interpolation='nearest', out=o), o)
def quantile(x, q, weights=None):
"""
Like numpy.percentile, but:
* Values of q are quantiles [0., 1.] rather than percentiles [0., 100.]
* scalar q not supported (q must be iterable)
* optional weights on x
"""
if weights is None:
return np.percentile(x, [100. * qi for qi in q])
else:
idx = np.argsort(x)
xsorted = x[idx]
cdf = np.add.accumulate(weights[idx])
cdf /= cdf[-1]
return np.interp(q, cdf, xsorted).tolist()
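A minimal sketch contrasting the unweighted and weighted paths (the weights are arbitrary demonstration values; note they should be floats because cdf is normalized in place):

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
print(quantile(x, [0.25, 0.5, 0.75]))                               # falls back to np.percentile
print(quantile(x, [0.5], weights=np.array([1.0, 1.0, 1.0, 10.0])))  # the heavy weight pulls the median up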
def _degradation_CI(results):
'''
Description
-----------
Monte Carlo estimation of uncertainty in degradation rate from OLS results
Parameters
----------
results: OLSResults object from fitting a model of the form:
results = sm.OLS(endog = df.energy_ma, exog = df.loc[:,['const','years']]).fit()
Returns
-------
68.2% confidence interval for degradation rate
'''
sampled_normal = np.random.multivariate_normal(results.params, results.cov_params(), 10000)
dist = sampled_normal[:, 1] / sampled_normal[:, 0]
Rd_CI = np.percentile(dist, [50 - 34.1, 50 + 34.1]) * 100.0
return Rd_CI
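A usage sketch on assumed synthetic data, showing how the helper consumes an OLS fit of the form described in the docstring (the column names const, years and energy_ma follow that docstring):

import numpy as np
import pandas as pd
import statsmodels.api as sm

years = np.linspace(0, 5, 60)
energy_ma = 1.0 - 0.005 * years + np.random.normal(0, 0.01, years.size)
df = pd.DataFrame({'const': 1.0, 'years': years, 'energy_ma': energy_ma})
results = sm.OLS(endog=df.energy_ma, exog=df.loc[:, ['const', 'years']]).fit()
print(_degradation_CI(results))  # ~68.2% confidence interval for the degradation rate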
def info(self,burn=1000,plot=False):
"""
Print the summary statistics and optionally plot the results
"""
rows=len(self.varnames)
cols=2
chain=np.array(self.chain[burn:])
nsize=chain.shape[0]
# print rows,cols
print '%4s %16s %12s %12s [%12s, %12s, %12s]'%('no','name','mean','stddev','16%','50%','84%')
for i,name in enumerate(self.varnames):
temp=np.percentile(chain[:,i],[16.0,84.0,50.0])
print '%4i %16s %12g %12g [%12g, %12g, %12g]'%(i,name,np.mean(chain[:,i]),(temp[1]-temp[0])/2.0,temp[0],temp[2],temp[1])
if plot:
ax=plt.subplot(rows,cols,2*i+1)
# plt.text(0.05,0.9,r'$\tau$='+'%5.1f'%(acor.acor(chain[:,i])[0]),transform=ax.transAxes)
plt.plot(chain[:,i])
plt.ylabel(self.model.descr[name][3])
plt.xlabel('Iteration')
ax=plt.subplot(rows,cols,2*i+2)
plt.hist(chain[:,i],bins=100,histtype='step')
plt.text(0.05,0.9,sround(np.mean(chain[:,i]),temp[0],temp[1]),transform=ax.transAxes)
plt.xlabel(self.model.descr[name][3])
# plt.text(0.05,0.9,'%6g %3g (%4g-%4g)'%(np.mean(chain[:,i]),(temp[1]-temp[0])/2.0,temp[0],temp[1]),transform=ax.transAxes)
def filterout_outliers(l_data, l_date):
'''
Return a list with data filtered from outliers
:param l_data: list. data to be analyzed
'''
# === old code to filter outliers =======
# Q3 = np.percentile(l_data, 98)
# Q1 = np.percentile(l_data, 2)
# step = (Q3 - Q1) * 1.5
# step = max(3000., step)
# na_val = np.array(l_data)
# na_val = na_val[(na_val >= Q1 - step) & (na_val <= Q3 + step)]
# return na_val
# =======================================
# group by minute
df_filter = pd.Series(np.array(l_date)/60).astype(int)
l_filter = list((df_filter != df_filter.shift()).values)
l_filter[0] = True
l_filter[-1] = True
return np.array(pd.Series(l_data)[l_filter].values)
def _cut_windows_vertically(self, door_top, roof_top, sky_sig, win_strip):
win_sig = np.percentile(win_strip, 85, axis=1)
win_sig[sky_sig > 0.5] = 0
if win_sig.max() > 0:
win_sig /= win_sig.max()
win_sig[:roof_top] = 0
win_sig[door_top:] = 0
runs, starts, values = run_length_encode(win_sig > 0.5)
win_heights = runs[values]
win_tops = starts[values]
if len(win_heights) > 0:
win_bottom = win_tops[-1] + win_heights[-1]
win_top = win_tops[0]
win_vertical_spacing = np.diff(win_tops).mean() if len(win_tops) > 1 else 0
else:
win_bottom = win_top = win_vertical_spacing = -1
self.top = int(win_top)
self.bottom = int(win_bottom)
self.vertical_spacing = int(win_vertical_spacing)
self.vertical_scores = make_list(win_sig)
self.heights = np.array(win_heights)
self.tops = np.array(win_tops)
def _first_interval(x, n_bins):
"""
Gets the first interval based on the percentiles,
either a closed interval containing the same value multiple times
or a closed-open interval with a different lower and upper bound.
"""
# calculate the percentiles
percentiles = np.linspace(0., 100., n_bins + 1)
bounds = np.percentile(x, q=percentiles, interpolation='higher')
lower = bounds[0]
upper = bounds[1]
if lower == upper:
return lower, upper, True, True
else:
return lower, upper, True, False
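A quick illustrative call on random data (n_bins is chosen arbitrarily; this assumes a NumPy version that still accepts the interpolation= keyword used above):

import numpy as np

x = np.random.rand(100)
lower, upper, closed_lower, closed_upper = _first_interval(x, n_bins=4)
print(lower, upper, closed_lower, closed_upper)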
#------- private methods for categorical binnings-------#
def random_submatrix(Z,g,s,o,threshpct=99,returnIdx=False):
xg = np.zeros(Z.shape[0],dtype=np.bool)
xg[:g] = True
np.random.shuffle(xg)
xt = np.zeros(Z.shape[1],dtype=np.bool)
xt[:s] = True
np.random.shuffle(xt)
X0 = Z[xg][:,xt]
thresh = np.percentile(X0,threshpct)
X0[(X0 > thresh)] = thresh
xo = np.zeros(X0.shape[0],dtype=np.bool)
xo[:o] = True
np.random.shuffle(xo)
Xobs = X0[xo]
if returnIdx:
return X0,xo,Xobs,xt,xg
else:
return X0,xo,Xobs
def filter_and_smooth_angular_velocity(angular_velocity,
low_pass_kernel_size, clip_percentile, plot=False):
"""Reduce the noise in a velocity signal."""
max_value = np.percentile(angular_velocity, clip_percentile)
print("Clipping angular velocity norms to {} rad/s ...".format(max_value))
angular_velocity_clipped = np.clip(angular_velocity, -max_value, max_value)
print("Done clipping angular velocity norms...")
low_pass_kernel = np.ones((low_pass_kernel_size, 1)) / low_pass_kernel_size
print("Smoothing with kernel size {} samples...".format(low_pass_kernel_size))
angular_velocity_smoothed = signal.correlate(angular_velocity_clipped,
low_pass_kernel, 'same')
print("Done smoothing angular velocity norms...")
if plot:
plot_angular_velocities("Angular Velocities", angular_velocity,
angular_velocity_smoothed, True)
return angular_velocity_smoothed.copy()
def norm_post_sim(modes,cov_matrix):
post = multivariate_normal(modes,cov_matrix)
nsims = 30000
phi = np.zeros([nsims,len(modes)])
for i in range(0,nsims):
phi[i] = post.rvs()
chain = np.array([phi[i][0] for i in range(len(phi))])
for m in range(1,len(modes)):
chain = np.vstack((chain,[phi[i][m] for i in range(len(phi))]))
mean_est = [np.mean(np.array([phi[i][j] for i in range(len(phi))])) for j in range(len(modes))]
median_est = [np.median(np.array([phi[i][j] for i in range(len(phi))])) for j in range(len(modes))]
upper_95_est = [np.percentile(np.array([phi[i][j] for i in range(len(phi))]),95) for j in range(len(modes))]
lower_95_est = [np.percentile(np.array([phi[i][j] for i in range(len(phi))]),5) for j in range(len(modes))]
return chain, mean_est, median_est, upper_95_est, lower_95_est
def calculate_aggregate(values):
agg_measures = {
'avg': np.mean(values),
'std': np.std(values),
'var': np.var(values),
'med': np.median(values),
'10p': np.percentile(values, 10),
'25p': np.percentile(values, 25),
'50p': np.percentile(values, 50),
'75p': np.percentile(values, 75),
'90p': np.percentile(values, 90),
'iqr': np.percentile(values, 75) - np.percentile(values, 25),
'iqm': interquartile_range_mean(values),
'mad': mean_absolute_deviation(values),
'cov': 1.0 * np.mean(values) / np.std(values),
'gin': gini_coefficient(values),
'skw': stats.skew(values),
'kur': stats.kurtosis(values),
'sum': np.sum(values)
}
return agg_measures
def get_pr(reference_frames,output_frames,mode='type',pr_resolution=100):
# filter output by confidence
confidence = collect_confidence(output_frames)
conf_order = []
step = 100/float(pr_resolution)
for j in range(1,pr_resolution+1):
conf_order.append( np.percentile(confidence, j*step) )
conf_order = [-1] + conf_order + [2]
# get curve
params = []
for threshold in conf_order:
params.append( [ reference_frames, output_frames, confidence, threshold, mode ] )
all_tp, all_fp, all_fn, all_prec, all_rec = zip(*pool.map(single_point, params))
all_prec = list(all_prec) #+ [0]
all_rec = list(all_rec) #+ [1]
all_rec, all_prec = zip(*sorted(zip(all_rec, all_prec)))
AUC = metrics.auc(all_rec, all_prec)
return all_rec, all_prec, AUC
# create complete output for 1 mode --------------------------------------------
def results_stat():
for file in os.listdir('./results'):
if file.endswith('.txt'):
file_path = os.path.join('./results', file)
data = [[], [], [], []]
with open(file_path, 'r') as f:
lines = f.read().split('\n')
for line in lines:
if len(line) > 0:
line_data = map(eval, line.split())
for i in range(4):
data[i].append(line_data[i])
lines = [[], [], [], []]
for i in range(4):
lines[i].append(np.mean(data[i]))
lines[i].append(np.std(data[i]))
lines[i].append(np.min(data[i]))
lines[i].append(np.percentile(data[i], 25))
lines[i].append(np.percentile(data[i], 50))
lines[i].append(np.percentile(data[i], 75))
lines[i].append(np.max(data[i]))
output_path = os.path.join('./stat', file)
with open(output_path, 'w') as f:
for line in lines:
f.write('\t'.join(map(str, line)) + '\n')
def splitclusters(datapointwts,seeds,theta):
std = {}; seeds1 = []; seedweight = [];
cluster, p2cluster = point2cluster(datapointwts, seeds,theta);
for cl in cluster:
mang = seeds[cl][-1];
if len(cluster[cl]) > 10:
std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
clockwise = [xx for xx in cluster[cl] if greaterthanangle(xx[2], mang)];
if std[cl]>20 and len(clockwise)>0 and len(clockwise)<len(cluster[cl]):
seeds1.append(avgpoint(clockwise))
seeds1.append(avgpoint([xx for xx in cluster[cl] if not greaterthanangle(xx[2], mang)]))
seedweight.append(len(clockwise))
seedweight.append(len(cluster[cl]) -len(clockwise))
else:
seeds1.append(seeds[cl]); seedweight.append(len(cluster[cl]))
else:
seeds1.append(seeds[cl]); seedweight.append(len(cluster[cl]))
return seeds1, seedweight
def splitclustersparallel(datapointwts,seeds):
X = [(xx[0], xx[1]) for xx in datapointwts]; S = [(xx[0], xx[1]) for xx in seeds];cluster = {};p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}; std = {}; seeds1 = []; seedweight = []; roadwidth = [];
nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(S)
distances, indices = nbrs.kneighbors(X)
for cd in range(len(seeds)):
cluster[cd] = []; roadwidth.append(0);
for ii, ll in enumerate(indices):
dd = [taxidist(seeds[xx], datapointwts[ii][:-1],theta) for xx in ll]
cd = ll[dd.index(min(dd))];
cluster[cd].append(datapointwts[ii])
p2cluster.append(cd)
for cl in cluster:
mang = seeds[cl][-1];
scl = seeds[cl]
if len(cluster[cl]) > 10:
std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
roadwidth[cl] = 1+5*np.std([geodist(scl,xx)*np.sin(anglebetweentwopoints(scl,xx)-scl[-1]) for xx in cluster[cl]])
print(cl,scl,[(anglebetweentwopoints(scl,xx),scl[-1]) for xx in cluster[cl]])
def _write_config(self, fasta_filename):
"""Write daligner sensitive config to fasta_filename.sensitive.config."""
lens = [len(r.sequence) for r in ContigSetReaderWrapper(fasta_filename)]
self.low_cDNA_size, self.high_cDNA_size = 0, 0
if len(lens) == 1:
self.low_cDNA_size, self.high_cDNA_size = lens[0], lens[0]
if len(lens) >= 2:
self.low_cDNA_size = int(np.percentile(lens, 10))
self.high_cDNA_size = int(np.percentile(lens, 90))
try:
with open(fasta_filename+'.sensitive.config', 'w') as f:
f.write("sensitive={s}\n".format(s=self.sensitive_mode))
f.write("low={l}\n".format(l=self.low_cDNA_size))
f.write("high={h}\n".format(h=self.high_cDNA_size))
except IOError:
pass # it's OK not to have write permission
def summary(self):
""" Method to produce a summary table of of the Mack Chainladder
model.
Returns:
This calculation is consistent with the R calculation
BootChainLadder$summary
"""
IBNR = self.IBNR
tri = self.tri
summary = pd.DataFrame()
summary['Latest'] = tri.get_latest_diagonal()
summary['Mean Ultimate'] = summary['Latest'] + pd.Series([np.mean(np.array(IBNR)[:,num]) for num in range(len(tri.data))],index=summary.index)
summary['Mean IBNR'] = summary['Mean Ultimate'] - summary['Latest']
summary['SD IBNR'] = pd.Series([np.std(np.array(IBNR)[:,num]) for num in range(len(tri.data))],index=summary.index)
summary['IBNR 75%'] = pd.Series([np.percentile(np.array(IBNR)[:,num],q=75) for num in range(len(tri.data))],index=summary.index)
summary['IBNR 95%'] = pd.Series([np.percentile(np.array(IBNR)[:,num],q=95) for num in range(len(tri.data))],index=summary.index)
return summary
def run_ltsd_example():
fs, d = fetch_sample_speech_tapestry()
winsize = 1024
d = d.astype("float32") / 2 ** 15
d -= d.mean()
pad = 3 * fs
noise_pwr = np.percentile(d, 1) ** 2
noise_pwr = max(1E-9, noise_pwr)
d = np.concatenate((np.zeros((pad,)) + noise_pwr * np.random.randn(pad), d))
_, vad_segments = ltsd_vad(d, fs, winsize=winsize)
v_up = np.where(vad_segments == True)[0]
s = v_up[0]
st = v_up[-1] + int(.5 * fs)
d = d[s:st]
bname = "tapestry.wav".split(".")[0]
wavfile.write("%s_out.wav" % bname, fs, soundsc(d))
def _compute_data_weights_topk(self, opts, density_ratios):
"""Put a uniform distribution on K points with largest prob real data.
This is a naive heuristic which makes the next GAN concentrate on those
points of the training set which were classified correctly with the
largest margins, i.e., our current mixture model is not capable of
generating points that look similar to these.
"""
threshold = np.percentile(density_ratios,
opts["topk_constant"]*100.0)
# Note that largest prob_real_data corresponds to smallest density
# ratios.
mask = density_ratios <= threshold
data_weights = np.zeros(self._data_num)
data_weights[mask] = 1.0 / np.sum(mask)
return data_weights
def quantile_binning(data=None, bins=10, qrange=(0.0, 1.0), **kwargs):
"""Binning schema based on quantile ranges.
This binning finds equally spaced quantiles. This should lead to
all bins having roughly the same frequencies.
Note: weights are not (yet) taken into account when calculating
quantiles.
Parameters
----------
bins: sequence or Optional[int]
Number of bins
qrange: Optional[tuple]
Two floats as minimum and maximum quantile (default: 0.0, 1.0)
Returns
-------
StaticBinning
"""
if np.isscalar(bins):
bins = np.linspace(qrange[0] * 100, qrange[1] * 100, bins + 1)
bins = np.percentile(data, bins)
return static_binning(bins=make_bin_array(bins), includes_right_edge=True)
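The core of this schema is just a set of equally spaced quantile edges; a standalone sketch of that step (static_binning and make_bin_array belong to the surrounding library and are not reproduced here):

import numpy as np

data = np.random.normal(size=1000)
edges = np.percentile(data, np.linspace(0, 100, 10 + 1))  # edges for 10 quantile bins
print(edges)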
def estimate_bayes_factor(traces, logp, r=0.05):
"""
Estimate odds ratios in a random subsample of the chains in MCMC
AstroML (see Eqn 5.127, pg 237)
"""
ndim, nsteps = traces.shape # [ndim,number of steps in chain]
# compute volume of a n-dimensional (ndim) sphere of radius r
Vr = np.pi ** (0.5 * ndim) / gamma(0.5 * ndim + 1) * (r ** ndim)
# use neighbor count within r as a density estimator
bt = BallTree(traces.T)
count = bt.query_radius(traces.T, r=r, count_only=True)
# BF = N*p/rho
bf = logp + np.log(nsteps) + np.log(Vr) - np.log(count) #log10(bf)
p25, p50, p75 = np.percentile(bf, [25, 50, 75])
return p50, 0.7413 * (p75 - p25)
########################################################################
def reducer(self, key, values):
if key == 'latencies':
values = list(values)
yield 'average_latency', numpy.average(values)
yield 'median_latency', numpy.median(values)
yield '95th_percentile_latency', numpy.percentile(values, 95)
elif key == 'sent':
first, last, count = minmax(values)
yield 'count', count
yield 'sending_first', first
yield 'sending_last', last
yield 'sending_overhead', (last - first) / (count - 1)
yield 'sending_throughput', (count - 1) / (last - first)
elif key == 'received':
first, last, count = minmax(values)
yield 'receiving_first', first
yield 'receiving_last', last
yield 'receiving_overhead', (last - first) / (count - 1)
yield 'receiving_throughput', (count - 1) / (last - first)
def calculate_batchsize_maxlen(texts):
""" Calculates the maximum length in the provided texts and a suitable
batch size. Rounds up maxlen to the nearest multiple of ten.
# Arguments:
texts: List of inputs.
# Returns:
Batch size,
max length
"""
def roundup(x):
return int(math.ceil(x / 10.0)) * 10
# Calculate max length of sequences considered
# Adjust batch_size accordingly to prevent GPU overflow
lengths = [len(tokenize(t)) for t in texts]
maxlen = roundup(np.percentile(lengths, 80.0))
batch_size = 250 if maxlen <= 100 else 50
return batch_size, maxlen
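The heuristic itself, sketched on precomputed token counts so the external tokenize() helper is not needed (the counts are made up):

import math
import numpy as np

lengths = [12, 48, 75, 90, 110, 31]
maxlen = int(math.ceil(np.percentile(lengths, 80.0) / 10.0)) * 10  # round up to nearest 10
batch_size = 250 if maxlen <= 100 else 50
print(batch_size, maxlen)  # -> 250 90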
def ss_octile(y):
"""Obtain the octile summary statistic.
The statistic attains its best performance for a large number of observations.
According to Allingham et al. (2009), it is more stable than ss_robust.
Parameters
----------
y : array_like
Yielded points.
Returns
-------
array_like of the shape (batch_size, dim_ss=8, dim_ss_point)
"""
octiles = np.linspace(12.5, 87.5, 7)
E1, E2, E3, E4, E5, E6, E7 = np.percentile(y, octiles, axis=1)
# Combining the summary statistics.
ss_octile = np.hstack((E1, E2, E3, E4, E5, E6, E7))
ss_octile = ss_octile[:, :, np.newaxis]
return ss_octile
def flag_outliers(series, iqr_multiplier=1.5):
"""Use Tukey's boxplot criterion for outlier identification.
"""
top_quartile_cutoff = np.percentile(series.get_values(), 75)
bottom_quartile_cutoff = np.percentile(series.get_values(), 25)
# Compute interquartile range
iqr = top_quartile_cutoff - bottom_quartile_cutoff
top_outlier_cutoff = top_quartile_cutoff + iqr * iqr_multiplier
bottom_outlier_cutoff = bottom_quartile_cutoff - iqr * iqr_multiplier
return series[(series < bottom_outlier_cutoff) | (series > top_outlier_cutoff)]
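A short usage sketch on a made-up Series; note that Series.get_values() was removed in pandas 1.0, so this assumes an older pandas (or swapping in .values above):

import pandas as pd

s = pd.Series([1.0, 2.0, 2.5, 3.0, 3.5, 4.0, 50.0])
print(flag_outliers(s))  # the 50.0 observation is flagged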
def truncate_and_scale(data, percMin=0.01, percMax=99.9, zeroTo=1.0):
"""Truncate and scale the data as a preprocessing step.
Parameters
----------
data : nd numpy array
Data/image to be truncated and scaled.
percMin : float, positive
Minimum percentile to be truncated.
percMax : float, positive
Maximum percentile to be truncated.
zeroTo : float
Data will be returned in the range from 0 to this number.
Returns
-------
data : nd numpy array
Truncated and scaled data/image.
"""
# adjust minimum
percDataMin = np.percentile(data, percMin)
data[np.where(data < percDataMin)] = percDataMin
data = data - data.min()
# adjust maximum
percDataMax = np.percentile(data, percMax)
data[np.where(data > percDataMax)] = percDataMax
data = 1./data.max() * data
return data * zeroTo
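A small usage sketch on random image-like data (shape and percentiles chosen arbitrarily). The function modifies its input in place before scaling, hence the copy:

import numpy as np

img = np.random.normal(loc=100.0, scale=25.0, size=(64, 64))
scaled = truncate_and_scale(img.copy(), percMin=1, percMax=99, zeroTo=255.0)
print(scaled.min(), scaled.max())  # 0.0 and 255.0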
def preprocess_bitmap(bitmap):
# contrast stretching
p2, p98 = np.percentile(bitmap, (2, 98))
assert abs(p2-p98) > 10
bitmap = skimage.exposure.rescale_intensity(bitmap, in_range=(p2, p98))
# from skimage.filters import threshold_otsu
# thresh = threshold_otsu(bitmap)
# bitmap = bitmap > thresh
return bitmap
def plot_tsne_totalcounts(chart, sample_properties, sample_data):
""" Plot cells colored by total counts """
analysis = sample_data.analysis
if not analysis or len(sample_properties['genomes']) > 1:
return None
reads_per_bc = analysis.matrix.get_reads_per_bc()
vmin, vmax = np.percentile(reads_per_bc, ws_gex_constants.TSNE_TOTALCOUNTS_PRCT_CLIP)
return plot_dimensions_color(chart, analysis.get_tsne().transformed_tsne_matrix,
reads_per_bc,
ws_gex_constants.TSNE_TOTALCOUNTS_DESCRIPTION,
vmin, vmax,
1, 2)