The following 19 code examples, extracted from open-source Python projects, illustrate how to use scipy.stats.scoreatpercentile().
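Before the project examples, here is a minimal, self-contained sketch of the basic call; the array values and percentile choices are purely illustrative. Note that recent SciPy documentation suggests numpy.percentile for the same computation.

import numpy as np
from scipy import stats

data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
median = stats.scoreatpercentile(data, 50)        # scalar percentile -> 5.5
q1, q3 = stats.scoreatpercentile(data, [25, 75])  # a list of percentiles returns an array
print(median, q3 - q1)                            # 5.5 4.5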
def discrete(self, x, bin=5):
    # Bin the non-negative values of x into `bin` quantile intervals and
    # label each element with the "low-high" range of its interval.
    x_copy = pd.Series.copy(x)
    x_copy = x_copy.astype(str)
    x_gt0 = x[x >= 0]
    for i in range(bin):
        point1 = stats.scoreatpercentile(x_gt0, i * (100.0 / bin))
        point2 = stats.scoreatpercentile(x_gt0, (i + 1) * (100.0 / bin))
        x1 = x[(x >= point1) & (x <= point2)]
        mask = np.in1d(x, x1)
        x_copy[mask] = '%s-%s' % (point1, point2)
    return x_copy
def grade(self, x, bin=5):
    # Same quantile binning as discrete(), but label each element with the
    # 1-based index of its bin instead of the value range.
    x_copy = np.copy(x)
    x_gt0 = x[x >= 0]
    for i in range(bin):
        point1 = stats.scoreatpercentile(x_gt0, i * (100.0 / bin))
        point2 = stats.scoreatpercentile(x_gt0, (i + 1) * (100.0 / bin))
        x1 = x[(x >= point1) & (x <= point2)]
        mask = np.in1d(x, x1)
        x_copy[mask] = i + 1
        print(point1, point2)
    return x_copy
def print_stats(data):
    data = np.array(data)
    desc = stats.describe(data)
    print('# of observations:', desc.nobs)
    print('min: %d\nmax: %d' % desc.minmax)
    print('mean: %.1f' % desc.mean)
    # print('variance: %.1f' % desc.variance)
    print('stdev: %.1f' % math.sqrt(desc.variance))
    print('percentiles')
    for p in PERCENTILES:
        print('%6.2f' % p, ' ', end='')
    print()
    for p in stats.scoreatpercentile(data, PERCENTILES):
        print('%6d' % p, ' ', end='')
    print()
def __call__(self, y, pred, sample_weight=None):
    pred = pred.ravel()
    diff = y - pred
    gamma = self.gamma
    if gamma is None:
        if sample_weight is None:
            gamma = stats.scoreatpercentile(np.abs(diff), self.alpha * 100)
        else:
            gamma = _weighted_percentile(np.abs(diff), sample_weight,
                                         self.alpha * 100)

    gamma_mask = np.abs(diff) <= gamma
    if sample_weight is None:
        sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2.0)
        lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2.0))
        loss = (sq_loss + lin_loss) / y.shape[0]
    else:
        sq_loss = np.sum(0.5 * sample_weight[gamma_mask] *
                         diff[gamma_mask] ** 2.0)
        lin_loss = np.sum(gamma * sample_weight[~gamma_mask] *
                          (np.abs(diff[~gamma_mask]) - gamma / 2.0))
        loss = (sq_loss + lin_loss) / sample_weight.sum()
    return loss
def _get_support_mask(self):
    check_is_fitted(self, 'scores_')

    # Cater for NaNs
    if self.percentile == 100:
        return np.ones(len(self.scores_), dtype=bool)
    elif self.percentile == 0:
        return np.zeros(len(self.scores_), dtype=bool)

    scores = _clean_nans(self.scores_)
    threshold = stats.scoreatpercentile(scores, 100 - self.percentile)
    mask = scores > threshold
    ties = np.where(scores == threshold)[0]
    if len(ties):
        max_feats = int(len(scores) * self.percentile / 100)
        kept_ties = ties[:max_feats - mask.sum()]
        mask[kept_ties] = True
    return mask
def iqr(a):
    """Calculate the IQR for an array of numbers."""
    a = np.asarray(a)
    q1 = stats.scoreatpercentile(a, 25)
    q3 = stats.scoreatpercentile(a, 75)
    return q3 - q1
def get_sparse_int(sz, rng, sparsity, hard=True, seed=1299821):
    np.random.seed(seed)
    cost = np.random.randint(1, rng + 1, size=(sz, sz))
    if hard is True:
        cost = make_hard(cost, 0, rng)
    mask = np.random.rand(sz, sz)
    thresh = scoreatpercentile(
        mask.flat, max(0, (sparsity - sz / float(sz * sz)) * 100.))
    mask = mask < thresh

    # Make sure there exists a solution.
    row = np.random.permutation(sz)
    col = np.random.permutation(sz)
    mask[row, col] = True
    return cost, mask
def calc_quantiles(vals):
    quantiles = []
    for i in range(1, 100):
        quantiles.append(stats.scoreatpercentile(vals.flatten(), i))
    return quantiles
def Denoise(data):
    """
    This function implements the denoising given in the url below:
    http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4607982&tag=1
    with title "Astronomical Spectra Denoising Based on Simplified SURE-LET
    Wavelet Thresholding".

    The data should be a kglib.utils.DataStructures.xypoint instance.
    """
    y, boolarr = mlpy.wavelet.pad(data.y)
    WC = mlpy.wavelet.dwt(y, 'd', 10, 0)

    # Figure out the unknown parameter 'a'
    sum1 = 0.0
    sum2 = 0.0
    numlevels = int(np.log2(WC.size))
    start = 2 ** (numlevels - 1)
    median = np.median(WC[start:])
    sigma = np.median(np.abs(WC[start:] - median)) / 0.6745
    for w in WC:
        phi = w * np.exp(-w ** 2 / (12.0 * sigma ** 2))
        dphi = np.exp(-w ** 2 / (12.0 * sigma ** 2)) * (1 - 2 * w ** 2 / (12 * sigma ** 2))
        sum1 += sigma ** 2 * dphi
        sum2 += phi ** 2
    a = -sum1 / sum2

    # Adjust all wavelet coefficients
    WC = WC + a * WC * np.exp(-WC ** 2 / (12 * sigma ** 2))

    # Now, do a soft threshold
    threshold = scoreatpercentile(WC, 80.0)
    WC[np.abs(WC) <= threshold] = 0.0
    WC[np.abs(WC) > threshold] -= threshold * np.sign(WC[np.abs(WC) > threshold])

    # Transform back
    y2 = mlpy.wavelet.idwt(WC, 'd', 10)
    data.y = y2[boolarr]
    return data


# Kept for legacy support, since I was using Denoise3 in several codes in the past.
def filter_data(x, percentile, no_zeros=True):
    """Remove data from an array which is below a certain percentile value.

    Optionally, if no_zeros is specified, also remove any zeros from the
    array. If removing values would result in returning an empty array,
    do nothing.

    :param x: Output values are taken from this array
    :type x: :class:`~numpy.ndarray`
    :param float percentile: Percentile at which to remove values \
        (e.g. if percentile=95.0, only the top 5% of values \
        are retained).
    :param bool no_zeros: If True, also discard any values equal \
        to zero from the output array.
    :returns: New array containing values from x that pass the filter.
    """
    percentile_score = scoreatpercentile(x, percentile)
    less_than_percentile = list(x < percentile_score)

    if no_zeros:
        not_a_zero = x > 0
        # only keep points which are both less than percentile AND not a zero
        points_to_keep = list(map(all, list(zip(less_than_percentile, not_a_zero))))
    else:
        points_to_keep = less_than_percentile

    out_data = x[points_to_keep]
    if out_data.size:
        return out_data
    if no_zeros:
        return x[not_a_zero]
    return x
def atomic_benchmark_estimator(estimator, X_test, verbose=False):
    """Measure runtime prediction of each instance."""
    n_instances = X_test.shape[0]
    runtimes = np.zeros(n_instances, dtype=float)
    for i in range(n_instances):
        instance = X_test[[i], :]
        start = time.time()
        estimator.predict(instance)
        runtimes[i] = time.time() - start
    if verbose:
        print("atomic_benchmark runtimes:", min(runtimes),
              scoreatpercentile(runtimes, 50), max(runtimes))
    return runtimes
def bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose):
    """Measure runtime prediction of the whole input."""
    n_instances = X_test.shape[0]
    runtimes = np.zeros(n_bulk_repeats, dtype=float)
    for i in range(n_bulk_repeats):
        start = time.time()
        estimator.predict(X_test)
        runtimes[i] = time.time() - start
    runtimes = np.array(list(map(lambda x: x / float(n_instances), runtimes)))
    if verbose:
        print("bulk_benchmark runtimes:", min(runtimes),
              scoreatpercentile(runtimes, 50), max(runtimes))
    return runtimes
def n_feature_influence(estimators, n_train, n_test, n_features, percentile):
    """
    Estimate influence of the number of features on prediction time.

    Parameters
    ----------
    estimators : dict of (name (str), estimator) to benchmark
    n_train : number of training instances (int)
    n_test : number of testing instances (int)
    n_features : list of feature-space dimensionality to test (int)
    percentile : percentile at which to measure the speed (int [0-100])

    Returns
    -------
    percentiles : dict(estimator_name, dict(n_features, percentile_perf_in_us))
    """
    percentiles = defaultdict(defaultdict)
    for n in n_features:
        print("benchmarking with %d features" % n)
        X_train, y_train, X_test, y_test = generate_dataset(n_train, n_test, n)
        for cls_name, estimator in estimators.items():
            estimator.fit(X_train, y_train)
            gc.collect()
            runtimes = bulk_benchmark_estimator(estimator, X_test, 30, False)
            percentiles[cls_name][n] = 1e6 * scoreatpercentile(runtimes, percentile)
    return percentiles
def fit(self, X, y, sample_weight=None):
    if sample_weight is None:
        self.quantile = stats.scoreatpercentile(y, self.alpha * 100.0)
    else:
        self.quantile = _weighted_percentile(y, sample_weight,
                                             self.alpha * 100.0)
def calculate_decision(histogram, bins):
    bins = bins[0:numpy.size(bins) - 1]
    thresh_above = histogram[numpy.where(bins > 2)]
    thresh_below = histogram[numpy.where(bins <= 2)]
    thresh_above = numpy.sort(thresh_above)
    lower_quartile = stats.scoreatpercentile(thresh_above, 25)
    upper_quartile = stats.scoreatpercentile(thresh_above, 75)
    outlier = 2 * upper_quartile - lower_quartile
    decision = bool(numpy.size(thresh_above[numpy.where(thresh_below > outlier)]))
    return decision
def _select_features(self, all_scores, all_pvalues, feature_names):
    """This function selects the top ``percentile`` of features
    from the F-scores.

    Parameters
    ----------
    all_scores : np.ndarray (float)
        The scores

    all_pvalues : np.ndarray (float)
        The p-values

    feature_names : array_like (str)
        The list of names that are eligible for drop

    Returns
    -------
    list : the features to drop
    """
    percentile = self.percentile

    # compute which features to keep or drop
    if percentile == 100:
        return []
    elif percentile == 0:
        return feature_names
    else:
        # adapted from sklearn.feature_selection.SelectPercentile
        all_scores = _clean_nans(all_scores)
        thresh = stats.scoreatpercentile(all_scores, 100 - percentile)

        mask = all_scores > thresh
        ties = np.where(all_scores == thresh)[0]
        if len(ties):
            max_feats = int(len(all_scores) * percentile / 100)
            kept_ties = ties[:max_feats - mask.sum()]
            mask[kept_ties] = True

        # inverse, since we're recording which features to DROP, not keep
        mask = np.asarray(~mask)

        # now set the drop as the inverse mask
        return (np.asarray(feature_names)[mask]).tolist()
def fit(self, X, y=None):
    # create the data generators
    self.generators = [None] * X.shape[1]
    for col in range(X.shape[1]):
        if self.discrete_threshold > 1:
            discrete_gen = discrete.DiscreteGenerator(X[:, col])
            if discrete_gen.total_keys < self.discrete_threshold:
                generator = discrete_gen
            else:
                mean = np.mean(X[:, col])
                stddev = np.std(X[:, col])
                if stddev == 0:
                    generator = abstract.DummyGenerator(mean)
                else:
                    generator = gaussian.GaussianGenerator(mean, stddev, self.random_state)
        else:
            mean = np.mean(X[:, col])
            stddev = np.std(X[:, col])
            if stddev == 0:
                generator = abstract.DummyGenerator(mean)
            else:
                generator = gaussian.GaussianGenerator(mean, stddev, self.random_state)
        self.generators[col] = generator

    # generate data
    totalInstances = len(X) / (1 - self.proportion_generated)
    generated_len = int(totalInstances - len(X))
    generated = [None] * generated_len
    for i in range(generated_len):
        row = [None] * X.shape[1]
        for col in range(X.shape[1]):
            row[col] = self.generators[col].generate()
        generated[i] = row

    # work out the threshold of prob(X|C) using cross validation
    skf = StratifiedKFold(n_splits=self.cv_folds,
                          random_state=self.random_state, shuffle=True)
    newX = np.vstack((X, generated))
    newY = np.hstack((np.ones(len(X)), np.zeros(len(X))))
    thresholds = [None] * self.cv_folds
    for i, (train_indices, test_indices) in enumerate(skf.split(newX, newY)):
        if ~self.density_only:  # only train if you need to!
            self.base_classifier.fit(newX[train_indices], newY[train_indices])
        probabilities = self._get_probabilities(newX[test_indices])
        thresholds[i] = stats.scoreatpercentile(probabilities,
                                                100 * self.contamination)
    self.threshold = np.mean(thresholds)

    # retrain on all the data
    if ~self.density_only:
        self.base_classifier.fit(newX, newY)
def clock_filter(self, root_seq=None, n_iqd=3, max_gaps=1.0, plot=False):
    '''
    Remove sequences from the set that evolve much faster or slower
    than the majority. Regions with predominantly gaps can be removed
    since these can skew the evolutionary rates.
    '''
    if root_seq is None:  # use consensus
        af = calc_af(self.aln, nuc_alpha)
        root_seq = np.fromstring(nuc_alpha, 'S1')[af.argmax(axis=0)]
    if type(root_seq) == str and root_seq in self.sequence_lookup:
        root_seq = np.array(self.sequence_lookup[root_seq])
    if max_gaps < 1.0:
        af = calc_af(self.aln, nuc_alpha)
        good_pos = af[nuc_alpha.index('-')] < max_gaps
    else:
        good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)

    date_vs_distance = {}
    # self.reference_aln = None already set at alignment step
    for seq in self.aln:
        date_vs_distance[seq.id] = (seq.attributes['num_date'],
                                    np.mean((np.array(seq) != root_seq)[(np.array(seq) != '-') & (root_seq != '-') & good_pos]))
        # if seq.id == self.reference.id:
        #     self.reference_aln = seq
    date_vs_distance_array = np.array(date_vs_distance.values())

    from scipy.stats import linregress, scoreatpercentile
    slope, intercept, rval, pval, stderr = linregress(date_vs_distance_array[:, 0],
                                                      date_vs_distance_array[:, 1])
    print("distance vs time regression:", slope)
    residuals = (intercept + slope * date_vs_distance_array[:, 0]) - date_vs_distance_array[:, 1]
    IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(residuals, 25)
    if plot:
        import matplotlib.pyplot as plt
        plt.ion()
        plt.scatter(date_vs_distance_array[:, 0], date_vs_distance_array[:, 1], c='g')
        bad_points = abs(intercept + slope * date_vs_distance_array[:, 0]
                         - date_vs_distance_array[:, 1]) > n_iqd * IQD
        plt.scatter(date_vs_distance_array[bad_points, 0],
                    date_vs_distance_array[bad_points, 1], c='r')

    print("before clock filter:", len(self.aln))
    tmp = {seq.id: seq for seq in self.aln
           if abs(intercept + slope * date_vs_distance[seq.id][0]
                  - date_vs_distance[seq.id][1]) < n_iqd * IQD}
    if self.reference.id not in tmp and self.reference.reference_in_dataset:
        self.log.notify('adding reference again after clock filter')
        tmp[self.reference.id] = self.reference_aln
    self.aln = MultipleSeqAlignment(tmp.values())
    print("after clock filter:", len(self.aln))
def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90, debug=0):
    '''Estimate low and high thresholds.

    bignore: ignore this much of the border for threshold estimation, default: %(default)s
    escale: scale for estimating a mask over the text region, default: %(default)s
    lo: percentile for black estimation, default: %(default)s
    hi: percentile for white estimation, default: %(default)s
    '''
    d0, d1 = flat.shape
    o0, o1 = int(bignore * d0), int(bignore * d1)
    est = flat[o0:d0 - o0, o1:d1 - o1]
    if escale > 0:
        # by default, we use only regions that contain
        # significant variance; this makes the percentile
        # based low and high estimates more reliable
        e = escale
        v = est - filters.gaussian_filter(est, e * 20.0)
        if debug:
            plt.clf()
            plt.title("first gaussian_filter")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5
        if debug:
            plt.clf()
            plt.title("second gaussian_filter")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        v = (v > 0.3 * np.amax(v))
        if debug:
            plt.clf()
            plt.title("binarization")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1)))
        v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50))))
        if debug:
            plt.clf()
            plt.title("morphology dilation")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        est = est[v]
    lo = stats.scoreatpercentile(est.ravel(), lo)
    hi = stats.scoreatpercentile(est.ravel(), hi)
    return lo, hi