The following 9 code examples, extracted from open-source Python projects, illustrate how to use scipy.stats.percentileofscore().
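Before the project snippets, here is a minimal sketch of the call itself. The kind argument controls how values equal to the score are counted; the printed values match SciPy's documented behaviour for this small input.

from scipy import stats

# percentileofscore(a, score, kind=...) returns the percentage of values in `a`
# that lie below (or at) `score`, depending on `kind`.
print(stats.percentileofscore([1, 2, 3, 4], 3))                 # 75.0, kind='rank' (default)
print(stats.percentileofscore([1, 2, 3, 4], 3, kind='strict'))  # 50.0, strictly less than 3
print(stats.percentileofscore([1, 2, 3, 4], 3, kind='weak'))    # 75.0, less than or equal to 3
print(stats.percentileofscore([1, 2, 3, 4], 3, kind='mean'))    # 62.5, average of 'weak' and 'strict'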
def percentile_score_single(test_vals, bg_vals):
    """
    For each value in test_vals, compute its percentile score compared to bg_vals.

    Args:
        test_vals (numpy array)
        bg_vals (numpy array)

    Returns:
        out_score (float)
    """
    # Compute percentile score for each value in test_vals
    percentile_scores = [stats.percentileofscore(bg_vals, test_val, kind="rank")
                         for test_val in test_vals]
    # Take mean of percentile scores
    out_score = np.mean(percentile_scores)
    return out_score
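A hypothetical call might look like the sketch below; the arrays are made-up illustration data, not from the original project.

import numpy as np
from scipy import stats

bg_vals = np.random.normal(loc=0.0, scale=1.0, size=1000)   # background distribution
test_vals = np.array([-0.3, 0.5, 1.2])                      # values to rank against the background
print(percentile_score_single(test_vals, bg_vals))          # mean percentile of the test values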
def getMapBins(map_counts, num_bins):
    # e.g. latlon_map[0]
    # find upper limit of percentile for 0 count (for all data is something like 30th)
    bottom = sp.percentileofscore(map_counts.flatten(), 0.0, kind='weak')
    # define 10 percentile bins, evenly divide rest of bins above 0 count into nine chunks
    step = (100 - bottom) / (num_bins - 1)
    percentiles = [0]
    for ix in xrange(0, num_bins - 1):
        percentiles.append(bottom + (ix * step))
    # find counts that bound each bin
    countRanges = []
    for bin in percentiles:
        countRanges.append(round(np.percentile(map_counts, bin), 0))
    countRanges.append(round(np.amax(map_counts), 0))
    # define fill colors for each of the bins
    fillColors = []
    for ix in xrange(0, num_bins):
        # the 255*0.8 is the max opacity
        fillColors.append([66, 91, 161, (255 * 0.8) * (ix / float(num_bins - 1))])
    return percentiles, countRanges, fillColors

# Create the actual data to power our map overlay
def DVO(df, w=[0.5, 0.5, 0, 0], N=2, s=[0.5, 0.5], M=252):
    ratio = df.close / (df.high * w[0] + df.low * w[1] + df.open * w[2] + df.close * w[3])
    theta = pd.Series(index=df.index)
    dvo = pd.Series(index=df.index, name='DV%s_%s' % (N, M))
    ss = np.array(list(reversed(s)))
    for idx, d in enumerate(ratio.index):
        if idx >= N - 1:
            y = ratio[idx - N + 1:idx + 1].values
            theta[idx] = np.dot(y, ss)
        if idx >= M + N - 2:
            ts = theta[idx - (M - 1):idx + 1]
            dvo[idx] = stats.percentileofscore(ts.values, theta[idx])
    return dvo
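For comparison, the final percentile-of-score step can also be written with a pandas rolling window. This is an illustrative sketch, not the original project's code (the helper name rolling_percentile is invented here), and it assumes theta has already been computed as above.

import pandas as pd
from scipy import stats

def rolling_percentile(theta, M=252):
    # percentile of the most recent theta value within the trailing M-observation window
    return theta.rolling(M).apply(
        lambda window: stats.percentileofscore(window, window[-1]), raw=True)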
def summarizeMap(mapDataFrame):
    latlon = mapDataFrame[['meta_latitude', 'meta_longitude']]
    latlon = latlon[pd.notnull(latlon['meta_latitude'])]
    latlon = latlon[pd.notnull(latlon['meta_longitude'])]
    minLat = np.amin(latlon['meta_latitude'])
    maxLat = np.amax(latlon['meta_latitude'])
    minLon = np.amin(latlon['meta_longitude'])
    maxLon = np.amax(latlon['meta_longitude'])
    if len(latlon) > 1:
        latlon_map = np.histogram2d(x=latlon['meta_longitude'], y=latlon['meta_latitude'],
                                    bins=[36, 18], range=[[minLon, maxLon], [minLat, maxLat]])
    else:
        latlon_map = np.histogram2d(x=[], y=[], bins=[36, 18], range=[[-180, 180], [-90, 90]])
    # define latlon map color bin info
    percentiles, countRanges, fillColors = getMapBins(latlon_map[0], num_bins=10)
    # range should be flexible to rules in DatasetSearchSummary
    # latlon_map[0] is the lonxlat (XxY) array of counts; latlon_map[1] is the nx/lon bin starts; map[2] ny/lat bin starts
    lonstepsize = (latlon_map[1][1] - latlon_map[1][0]) / 2
    latstepsize = (latlon_map[2][1] - latlon_map[2][0]) / 2
    maxMapCount = np.amax(latlon_map[0])
    map_data = []
    for lon_ix, lonbin in enumerate(latlon_map[0]):
        for lat_ix, latbin in enumerate(lonbin):
            # [latlon_map[2][ix]+latstepsize for ix,latbin in enumerate(latlon_map[0][0])]
            lat = latlon_map[2][lat_ix] + latstepsize
            lon = latlon_map[1][lon_ix] + lonstepsize
            value = latbin
            buffer = 0.0001
            # left-bottom, left-top, right-top, right-bottom, left-bottom
            polygon = [[lon - lonstepsize + buffer, lat - latstepsize + buffer],
                       [lon - lonstepsize + buffer, lat + latstepsize - buffer],
                       [lon + lonstepsize - buffer, lat + latstepsize - buffer],
                       [lon + lonstepsize - buffer, lat - latstepsize + buffer],
                       [lon - lonstepsize, lat - latstepsize]]
            bin_ix = np.amax(np.argwhere(
                np.array(percentiles) <= sp.percentileofscore(latlon_map[0].flatten(), value)))
            fillColor = fillColors[bin_ix]
            map_data.append({"lat": lat, "lon": lon, "count": value,
                             "polygon": polygon, "fillColor": fillColor})
    map_legend_info = {"ranges": countRanges, "fills": fillColors}
    return (map_data, map_legend_info)

# Query Construction Helpers / Data Retrieval
# Based on a rule (field name, comparator and value), add a filter to a query object
# TODO add some better documentation here on what each type is
def permutation_test(corrmat, tails, mask=None, n=100, seed=None):
    """Permute tail assignments to generate null distribution."""
    rs = np.random.RandomState(seed)
    corrs_real = tail_correlations(corrmat, tails, mask)
    corrs_null = []
    for _ in xrange(n):
        perm_tails = rs.permutation(tails)
        corrs_null.append(tail_correlations(corrmat, perm_tails, mask))
    diff_real = np.subtract(*corrs_real)
    diff_null = np.subtract(*zip(*corrs_null))
    pctile = stats.percentileofscore(diff_null, diff_real)
    return pctile
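The function returns a percentile (0 to 100) of the observed tail-correlation difference within the permutation null. One common way to convert that into a one-sided empirical p-value is sketched below; this is not part of the original code, and the direction of the comparison depends on the hypothesis being tested.

# Assuming corrmat, tails, and mask are already defined as in the function above.
pctile = permutation_test(corrmat, tails, mask=mask, n=1000, seed=0)
p_one_sided = 1.0 - pctile / 100.0   # roughly the fraction of null differences at or above the observed one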
def percentile_score(null, real):
    """Vectorized function for computing percentile of score."""
    if np.isscalar(real):
        return stats.percentileofscore(null, real)
    percentiles = []
    assert len(null) == len(real)
    for null_i, real_i in zip(null, real):
        percentiles.append(stats.percentileofscore(null_i, real_i, "mean"))
    assert len(percentiles) == len(real)
    return np.array(percentiles)
def calc_regression(self, label):
    """
    Calculate least squares regression for *label*. If *weighted* is ``True``, calculates
    weighted least squares; else ordinary least squares.

    Regression results are stored in ``'/main/label/scores'``
    """
    if self.check_store("/main/{}/scores".format(label)):
        return
    elif "/main/{}/scores".format(label) in self.store.keys():
        # need to remove the current keys because we are using append
        self.store.remove("/main/{}/scores".format(label))

    logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label),
                 extra={'oname': self.name})
    # append is required because it takes the "min_itemsize" argument, and put doesn't
    longest = self.store.select("/main/{}/log_ratios".format(label),
                                "columns='index'").index.map(len).max()
    chunk = 1
    if self.scoring_method == "WLS":
        for data in self.store.select_as_multiple(
                ["/main/{}/log_ratios".format(label), "/main/{}/weights".format(label)],
                chunksize=self.chunksize):
            logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(
                chunk, len(data.index)), extra={'oname': self.name})
            result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
            self.store.append("/main/{}/scores".format(label), result,
                              min_itemsize={"index": longest})
            chunk += 1
    elif self.scoring_method == "OLS":
        for data in self.store.select("/main/{}/log_ratios".format(label),
                                      chunksize=self.chunksize):
            logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(
                chunk, len(data.index)), extra={'oname': self.name})
            result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
            self.store.append("/main/{}/scores".format(label), result,
                              min_itemsize={"index": longest})
            chunk += 1
    else:
        raise ValueError('Invalid regression scoring method "{}" [{}]'.format(
            self.scoring_method, self.name))

    # need to read from the file, calculate percentiles, and rewrite it
    logging.info("Calculating slope standard error percentiles ({})".format(label),
                 extra={'oname': self.name})
    data = self.store['/main/{}/scores'.format(label)]
    data['score'] = data['slope']
    data['SE'] = data['SE_slope']
    data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
    # reorder columns
    data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']]
    self.store.put("/main/{}/scores".format(label), data, format="table",
                   data_columns=data.columns)
def score_hmm_events(bst, k_folds=None, num_states=30, n_shuffles=5000, shuffle='row-wise', verbose=False):
    """scores all sequences in the entire bst"""
    if k_folds is None:
        k_folds = 5

    if shuffle == 'row-wise':
        rowwise = True
    elif shuffle == 'col-wise':
        rowwise = False
    else:
        raise ValueError("tmat must be either 'row-wise' or 'col-wise'")

    X = [ii for ii in range(bst.n_epochs)]

    scores_hmm = np.zeros(bst.n_epochs)
    scores_hmm_shuffled = np.zeros((bst.n_epochs, n_shuffles))

    for kk, (training, validation) in enumerate(k_fold_cross_validation(X, k=k_folds)):
        if verbose:
            print(' fold {}/{}'.format(kk + 1, k_folds))

        PBEs_train = bst[training]
        PBEs_test = bst[validation]

        # train HMM on all training PBEs
        hmm = PoissonHMM(n_components=num_states, random_state=0, verbose=False)
        hmm.fit(PBEs_train)

        # reorder states according to transmat ordering
        transmat_order = hmm.get_state_order('transmat')
        hmm.reorder_states(transmat_order)

        # compute scores_hmm (log likelihoods) of validation set:
        scores_hmm[validation] = hmm.score(PBEs_test)

        hmm_shuffled = copy.deepcopy(hmm)
        for nn in range(n_shuffles):
            # shuffle transition matrix:
            if rowwise:
                hmm_shuffled.transmat_ = shuffle_transmat(hmm_shuffled.transmat)
            else:
                hmm_shuffled.transmat_ = shuffle_transmat_Kourosh_breaks_stochasticity(hmm_shuffled.transmat)
                hmm_shuffled.transmat_ = hmm_shuffled.transmat / np.tile(
                    hmm_shuffled.transmat.sum(axis=1), (hmm_shuffled.n_components, 1)).T

            # score validation set with shuffled HMM
            scores_hmm_shuffled[validation, nn] = hmm_shuffled.score(PBEs_test)

    n_scores = len(scores_hmm)
    scores_hmm_percentile = np.array(
        [stats.percentileofscore(scores_hmm_shuffled[idx], scores_hmm[idx], kind='mean')
         for idx in range(n_scores)])

    return scores_hmm, scores_hmm_shuffled, scores_hmm_percentile
def qq_plot(self, df_samp, df_clu):
    """
    :param df1: interval df of enterprise a. The column name should be the enterprise id
    :param df2: interval df of enterprise b. The column name should be the enterprise id
    :return: slope, intercept and total fit error of fitted regression line
    """
    # use longer list as reference distribution
    outdir = self.output_dir + "/qq-plot"
    # make output directory if not exists
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    ref = np.asarray(df_clu)
    samp = np.asarray(df_samp)
    ref_id = df_clu.columns
    samp_id = df_samp.columns
    print "Start drawing Q-Q plot using data from sample {} and cluster {}.".format(samp_id, ref_id)
    # theoretical quantiles
    samp_pct_x = np.asarray([percentileofscore(ref, x) for x in samp])
    # sample quantiles
    samp_pct_y = np.asarray([percentileofscore(samp, x) for x in samp])
    # calculate the error from real percentiles to predicted percentiles: as same as mean squared error
    pct_error = np.sum(np.power(samp_pct_y - samp_pct_x, 2)) / (2 * len(samp_pct_x))
    # estimated linear regression model
    p = np.polyfit(samp_pct_x, samp_pct_y, 1)
    regr = LinearRegression()
    model_x = samp_pct_x.reshape(len(samp_pct_x), 1)
    model_y = samp_pct_y.reshape(len(samp_pct_y), 1)
    regr.fit(model_x, model_y)
    r2 = regr.score(model_x, model_y)
    if p[1] > 0:
        p_function = "y= {} x + {}, r-square = {}".format(p[0], p[1], r2)
    elif p[1] < 0:
        p_function = "y= {} x - {}, r-square = {}".format(p[0], -p[1], r2)
    else:
        p_function = "y= {} x, r-square = {}".format(p[0], r2)
    print "The fitted linear regression model in Q-Q plot using data from enterprises {} and cluster {} is {}".format(samp_id, ref_id, p_function)
    # plot q-q plot
    x_ticks = np.arange(0, 100, 20)
    y_ticks = np.arange(0, 100, 20)
    plt.scatter(x=samp_pct_x, y=samp_pct_y, color='blue')
    plt.xlim((0, 100))
    plt.ylim((0, 100))
    # add fit regression line
    plt.plot(samp_pct_x, regr.predict(model_x), color='red', linewidth=2)
    # add 45-degree reference line
    plt.plot([0, 100], [0, 100], linewidth=2)
    plt.text(10, 70, p_function)
    plt.xticks(x_ticks, x_ticks)
    plt.yticks(y_ticks, y_ticks)
    plt.xlabel('cluster quantiles - id: {}'.format(ref_id))
    plt.ylabel('sample quantiles - id: {}'.format(samp_id))
    plt.title('{} VS {} Q-Q plot'.format(ref_id, samp_id))
    outfile = "{}/enterprise-{}-VS-cluster-{}.qqplot.png".format(outdir, samp_id, ref_id)
    plt.savefig(outfile)
    print "Plotting Q-Q plot done! The plot is stored at {}.".format(outfile)
    plt.close()
    return p[0], p[1], pct_error