The following code examples, extracted from open-source Python projects, illustrate how to use scipy.stats.zscore().
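Before the project examples, a minimal sketch (not taken from any project below) of what zscore() computes: each value minus the mean, divided by the standard deviation, along the chosen axis.

import numpy as np
from scipy.stats import zscore

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
z = zscore(x)                       # population std (ddof=0) by default
manual = (x - x.mean()) / x.std()   # the same computation written out by hand
assert np.allclose(z, manual)
print(z)   # roughly [-1.414, -0.707, 0.0, 0.707, 1.414]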
def find_outliers(data):
    absolute_normalized = np.abs(zscore(data))
    return absolute_normalized > 3
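A hypothetical call to the find_outliers() above on synthetic data (the array is an assumption for illustration, and the numpy/scipy imports the snippet relies on must already be in place): an injected extreme value gets a |z| well above 3 and is flagged.

import numpy as np

rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(0, 1, 200), [12.0]])
mask = find_outliers(data)    # boolean array, True where |z| > 3
print(data[mask])             # should contain the injected 12.0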
def X(self):
    # Scale the data across features
    X = stats.zscore(np.asarray(self.betas))
    # Remove zero-variance features
    X = X[:, self.good_voxels]
    assert not np.isnan(X).any()
    # Regress out behavioral confounds
    rt = np.asarray(self.rt)
    m, s = np.nanmean(rt), np.nanstd(rt)
    rt = np.nan_to_num((rt - m) / s)
    X = OLS(X, rt).fit().resid
    return X
def process(self, X):
    onset_func = zscore(self.func(X))
    onset_func = signal.filtfilt(self.moving_avg_filter, 1, onset_func)
    onset_func = onset_func - signal.medfilt(
        onset_func[:, np.newaxis], self.median_kernel)[:, 0]
    peaks = signal.argrelmax(onset_func)
    onsets = peaks[0][np.where(onset_func[peaks[0]] > self.threshold)]
    return onsets
def _generate_noise_temporal_task(stimfunction_tr,
                                  motion_noise='gaussian',
                                  ):
    """Generate the signal dependent noise

    Create noise specific to the signal, for instance there is variability
    in how the signal manifests on each event

    Parameters
    ----------

    stimfunction_tr : 1 Dimensional array
        This is the timecourse of the stimuli in this experiment,
        each element represents a TR

    motion_noise : str
        What type of noise will you generate? Can be gaussian or rician

    Returns
    ----------

    noise_task : one dimensional array, float
        Generates the temporal task noise timecourse

    """

    # Make the noise to be added
    stimfunction_tr = stimfunction_tr != 0
    if motion_noise == 'gaussian':
        noise = stimfunction_tr * np.random.normal(0, 1,
                                                   size=len(stimfunction_tr))
    elif motion_noise == 'rician':
        noise = stimfunction_tr * stats.rice.rvs(0, 1,
                                                 size=len(stimfunction_tr))

    noise_task = stimfunction_tr + noise

    # Normalize
    noise_task = stats.zscore(noise_task)

    return noise_task
def calc_weighted_event_var(self, D, weights, event_pat):
    """Computes normalized weighted variance around event pattern

    Utility function for computing variance in a training set of weighted
    event examples. For each event, the sum of squared differences for all
    timepoints from the event pattern is computed, and then the weights
    specify how much each of these differences contributes to the
    variance (normalized by the number of voxels).

    Parameters
    ----------

    D : timepoint by voxel ndarray
        fMRI data for which to compute event variances

    weights : timepoint by event ndarray
        specifies relative weights of timepoints for each event

    event_pat : voxel by event ndarray
        mean event patterns to compute variance around

    Returns
    -------

    ev_var : ndarray of variances for each event
    """
    Dz = stats.zscore(D, axis=1, ddof=1)
    ev_var = np.empty(event_pat.shape[1])
    for e in range(event_pat.shape[1]):
        # Only compute variances for weights > 0.1% of max weight
        nz = weights[:, e] > np.max(weights[:, e]) / 1000

        sumsq = np.dot(weights[nz, e],
                       np.sum(np.square(Dz[nz, :] - event_pat[:, e]), axis=1))
        ev_var[e] = sumsq / (np.sum(weights[nz, e]) -
                             np.sum(np.square(weights[nz, e])) /
                             np.sum(weights[nz, e]))
    ev_var = ev_var / D.shape[1]
    return ev_var
def build_spectrogram(feature_id, width=256, height=128, frequency_base=1.0):
    """
    Builds a spectrogram using @Lichtso's ccwt library

    :param feature_id: The feature uuid to be analyzed.
    :param width: Width of the exported image (should be smaller than the number of samples)
    :param height: Height of the exported image
    :param frequency_base: Base for exponential frequency scales or 1.0 for linear scale
    """
    feature = Feature.objects.get(pk=feature_id)
    df = _get_dataframe(feature.dataset.id)
    feature_column = zscore(df[feature.name].values)

    fourier_transformed_signal = fft(feature_column)

    minimum_frequency = 0.001 * len(feature_column)
    maximum_frequency = 0.5 * len(feature_column)

    if frequency_base == 1.0:
        # Linear
        frequency_band_result = frequency_band(
            height,
            maximum_frequency - minimum_frequency,
            minimum_frequency)
    else:
        minimum_octave = log(minimum_frequency) / log(frequency_base)
        maximum_octave = log(maximum_frequency) / log(frequency_base)
        # Exponential
        frequency_band_result = frequency_band(
            height,
            maximum_octave - minimum_octave,
            minimum_octave,
            frequency_base)

    # Write into the spectrogram path
    filename = '{0}/spectrograms/{1}.png'.format(settings.MEDIA_ROOT, feature_id)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as output_file:
        render_png(output_file, EQUIPOTENTIAL, 0.0, fourier_transformed_signal,
                   frequency_band_result, width)

    Spectrogram.objects.create(
        feature=feature,
        width=width,
        height=height,
        image=filename
    )
def z_score(a, b):
    """
    uses scipy.stats.zscore to transform to z-scores
    expects two arrays corresponding to sentences a and sentences b of
    sentence pairs, and returns two arrays corresponding to a and b after
    undergoing the same scaling by the mean & SD of their concatenation
    """
    if len(a) != len(b):
        print("warning: uneven nr of sentences in a and b")
    all_sents = numpy.concatenate((a, b))
    scaled = zscore(all_sents)
    return scaled[:len(a), :], scaled[len(a):, :]
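A hypothetical call to the z_score() above (the 2-D shapes are an assumption, since the function slices with [:, :]): both halves are scaled by the mean and SD of their concatenation rather than of each half separately.

import numpy

a = numpy.random.rand(5, 3)   # 5 sentence pairs, 3 features for side a
b = numpy.random.rand(5, 3)   # matching features for side b
a_z, b_z = z_score(a, b)
print(a_z.shape, b_z.shape)   # (5, 3) (5, 3)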
def zscore_site(args):
    """ z-scores only one site """
    from scipy.stats import zscore
    dataframe, columns, site = args
    return zscore(dataframe.loc[dataframe.site == site, columns].values,
                  ddof=1, axis=0)
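A hypothetical call to zscore_site() above with a toy DataFrame (the column names and values are assumptions): only the rows belonging to the requested site are standardized, using ddof=1.

import pandas as pd

df = pd.DataFrame({"site": ["A", "A", "A", "B"],
                   "x": [1.0, 2.0, 3.0, 10.0]})
print(zscore_site((df, ["x"], "A")))   # z-scores of x within site A only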
def __call__(self, *args):
    vals = args[0].values
    return sps.zscore(vals).tolist()
def filter_outlier_points(points):
    kt = kdtree.KDTree(points)
    distances, i = kt.query(kt.data, k=9)
    z_distances = stats.zscore(np.mean(distances, axis=1))
    o_filter = abs(z_distances) < 1  # rather arbitrary
    return points[o_filter]
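A hypothetical call to filter_outlier_points() above on synthetic 2-D points (the data and the isolated point are assumptions, and the kdtree/stats imports used by the snippet must already be in place): points whose mean distance to their 9 nearest neighbours is a z-score outlier are dropped.

import numpy as np

rng = np.random.default_rng(0)
points = np.vstack([rng.random((50, 2)), [[10.0, 10.0]]])  # one isolated point
kept = filter_outlier_points(points)
print(len(points), "->", len(kept))    # the isolated point should be removed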
def confound_design(exp, subj, run, hpf_kernel):
    """Build a matrix of confound variables."""
    analysis_dir = PROJECT["analysis_dir"]

    if exp == "dots":
        tr = 1
    elif exp == "sticks":
        tr = .72

    # Load in the confound information
    fstem = op.join(analysis_dir, exp, subj, "preproc",
                    "run_{}".format(run))

    motion = (pd.read_csv(op.join(fstem, "realignment_params.csv"))
                .filter(regex="rot|trans")
                .apply(stats.zscore))

    nuisance = (pd.read_csv(op.join(fstem, "nuisance_variables.csv"))
                  .filter(regex="wm_")
                  .apply(stats.zscore))

    artifacts = (pd.read_csv(op.join(fstem, "artifacts.csv"))
                   .any(axis=1))

    # Upsample dots data
    if exp == "dots":
        motion = pd.DataFrame(moss.upsample(motion, 2))
        nuisance = pd.DataFrame(moss.upsample(nuisance, 2))
        new_tps = np.arange(0, 229.5, .5)
        artifacts = artifacts.reindex(new_tps).ffill().reset_index(drop=True)

    # Build this portion of the design matrix
    confounds = pd.concat([motion, nuisance], axis=1)
    dmat = glm.DesignMatrix(confounds=confounds,
                            artifacts=artifacts,
                            hpf_kernel=hpf_kernel,
                            tr=tr)

    return dmat.design_matrix
def signal_confound_design(self, run):
    """Build a matrix of signal confound variables."""
    analysis_dir = PROJECT["analysis_dir"]

    if self.exp == "dots":
        tr = 2
    elif self.exp == "sticks":
        tr = .72

    fstem = op.join(analysis_dir, self.exp, self.subj,
                    "preproc", "run_{}".format(run))

    motion = (pd.read_csv(op.join(fstem, "realignment_params.csv"))
                .filter(regex="rot|trans")
                .apply(stats.zscore))

    nuisance = (pd.read_csv(op.join(fstem, "nuisance_variables.csv"))
                  .filter(regex="wm_")
                  .apply(stats.zscore))

    artifacts = (pd.read_csv(op.join(fstem, "artifacts.csv"))
                   .any(axis=1))

    confounds = pd.concat([motion, nuisance], axis=1)
    dmat = glm.DesignMatrix(confounds=confounds,
                            artifacts=artifacts,
                            hpf_kernel=self.hpf_kernel,
                            tr=tr)

    return dmat.design_matrix
def california():
    calif = []
    with open('california.txt') as f:
        calif = f.readlines()
    out_array = []
    for ccs in calif:
        cal_array = ccs.split(',')
        piece = [cal_array[0].replace('"', ''),
                 cal_array[1].replace(' ', ''),
                 cal_array[2].replace(' ', '').replace('\n', '')]
        out_array.append(piece)
    array_county = []
    for x in out_array:
        array_county.append(x[0])
    set_array_county = set(array_county)
    final_arr = []
    for county in set_array_county:
        divisor = 0
        total_score = 0
        for y in out_array:
            if county == y[0]:
                divisor += int(y[1])
                if divisor == 0:
                    divisor = 1
                total_score += float(y[2])
        final_arr.append(total_score / divisor)
    pprint(final_arr)
    np_array = np.asarray(final_arr)
    zscore_arr = stats.zscore(np_array)
    fin = []
    for x in zscore_arr:
        num = percentage_of_area_under_std_normal_curve_from_zcore(x)
        fin.append(num)
    proj = zip(set_array_county, fin)
    for x in proj:
        print(x[0], x[1])
def zStat(sg_row, pNull):
    # logging(sg_row)
    if pNull == 0 or pNull == 1:
        return None
    else:
        zscore = ((sg_row.pmme - pNull) * math.sqrt(sg_row.treat + sg_row.ctrl)
                  / math.sqrt(pNull * (1 - pNull)))
        return zscore
def standarizeMaxmeanDic(md):
    sfactor = {}
    snullmaxmean = []
    for k in md.keys():
        sfactor[k] = (np.mean(md[k]), np.std(md[k]))
        md[k] = stats.zscore(md[k])
        snullmaxmean += md[k].tolist()
    return (md, sfactor, snullmaxmean)
def find_outliers(X, threshold=3.0, max_iter=2):
    """Find outliers based on iterated Z-scoring

    This procedure compares the absolute z-score against the threshold.
    After excluding local outliers, the comparison is repeated until no
    local outlier is present any more.

    Parameters
    ----------
    X : np.ndarray of float, shape (n_elements,)
        The scores for which to find outliers.
    threshold : float
        The value above which a feature is classified as outlier.
    max_iter : int
        The maximum number of iterations.

    Returns
    -------
    bad_idx : np.ndarray of int, shape (n_features)
        The outlier indices.
    """
    from scipy.stats import zscore
    my_mask = np.zeros(len(X), dtype=bool)  # plain bool; np.bool is deprecated
    for _ in range(max_iter):
        X = np.ma.masked_array(X, my_mask)
        this_z = np.abs(zscore(X))
        local_bad = this_z > threshold
        my_mask = np.max([my_mask, local_bad], 0)
        if not np.any(local_bad):
            break

    bad_idx = np.where(my_mask)[0]
    return bad_idx
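A hypothetical call to the iterated find_outliers() above on synthetic scores (the data are an assumption): once the first pass masks the most extreme point, the z-scores are recomputed without it, which can expose additional outliers that the inflated standard deviation had hidden.

import numpy as np

rng = np.random.default_rng(0)
scores = np.concatenate([rng.normal(0, 1, 200), [15.0, 4.5]])
print(find_outliers(scores))   # should include indices 200 and 201, the injected points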
def sprZScores(self, PBC):
    if np.std(self.sprs.values()) == 0:
        zscores = {k: (0.0, self.sprs[k]) for k in self.sprs.keys()}
    else:
        zscores = {k: (zscore, self.sprs[k]) for (k, zscore) in
                   zip(self.sprs.keys(), stats.zscore(self.sprs.values()))}
    CSVExporter.CSVExportScoutZScores(zscores)
    PBC.sendExport('SPRExport.csv')
def rValuesForAverageFunctionForDict(self, averageFunction, d):
    values = map(averageFunction, self.cachedComp.teamsWithMatchesCompleted)
    for index, value in enumerate(values):
        if value == None:
            values[index] = 0
    if not values:
        return
    if not np.std(values):
        zscores = [0.0 for v in values]  # Don't calculate z-score if the standard deviation is 0
    else:
        zscores = list(stats.zscore(values))
    [utils.setDictionaryValue(d,
                              self.cachedComp.teamsWithMatchesCompleted[i].number,
                              zscores[i])
     for i in range(len(self.cachedComp.teamsWithMatchesCompleted))]
def pearson_correllations(y, y_pred):
    """Computes the Pearson correlation of a set of predictions.

    Parameters
    ----------
    y: Actual test set values. A numpy array of shape (npoints, nvariables)
    y_pred: The predicted values. A numpy array of shape (npoints, nvariables)
    """
    return (zscore(y) * zscore(y_pred)).mean(axis=0)
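The identity this relies on: with population (ddof=0) z-scores, the mean of the elementwise product of z(y) and z(y_pred) is exactly the Pearson correlation coefficient. A hypothetical cross-check against scipy.stats.pearsonr (the data here are assumptions):

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
y = rng.random((100, 2))
y_pred = y + 0.1 * rng.random((100, 2))
r = pearson_correllations(y, y_pred)
# one column checked against scipy's own Pearson implementation
assert np.isclose(r[0], pearsonr(y[:, 0], y_pred[:, 0])[0])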
def normalize(signals, axis=None, groups=None, MP=False, comp=None):
    """
    :param signals: 1D, 2D or 3D signals
    returns zscored per patient
    """
    if comp is not None:
        print('zmapping with axis {}'.format(axis))
        return stats.zmap(signals, comp, axis=axis)
    if groups is None:
        print('zscoring with axis {}'.format(axis))
        return stats.zscore(signals, axis=axis)

    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    if signals.ndim == 2:
        signals = np.expand_dims(signals, 2)
    if MP:
        print('zscoring per patient using {} cores'.format(cpu_count()))
        p = Pool(cpu_count())  # use all except for one
        res = []
        new_signals = np.zeros_like(signals)
        for ID in np.unique(groups):
            idx = groups == ID
            job = p.apply_async(stats.zscore, args=(signals[idx],),
                                kwds={'axis': None})
            res.append(job)
        start = 0
        for r in res:
            values = r.get(timeout=1200)
            end = start + len(values)
            new_signals[start:end] = values
            start = end  # advance the write offset to the next group's block
        return new_signals
    else:
        print('zscoring per patient')
        res = []
        for ID in np.unique(groups):
            idx = groups == ID
            job = stats.zscore(signals[idx], axis=None)
            res.append(job)
        new_signals = np.vstack(res)
        return new_signals
def _generate_noise_temporal_drift(trs,
                                   tr_duration,
                                   period=300,
                                   ):
    """Generate the drift noise

    Create a sinewave, of a given period and random phase, to represent
    the drift of the signal over time

    Parameters
    ----------

    trs : int
        How many volumes (aka TRs) are there

    tr_duration : float
        How long in seconds is each volume acquisition

    period : int
        How many seconds is the period of oscillation of the drift

    Returns
    ----------

    noise_drift : one dimensional array, float
        The drift timecourse of activity

    """

    # Calculate the cycles of the drift for a given function.
    cycles = trs * tr_duration / period

    # Create a sine wave with a given number of cycles and random phase
    timepoints = np.linspace(0, trs - 1, trs)
    phaseshift = np.pi * 2 * np.random.random()
    phase = (timepoints / (trs - 1) * cycles * 2 * np.pi) + phaseshift
    noise_drift = np.sin(phase)

    # Normalize so the sigma is 1
    noise_drift = stats.zscore(noise_drift)

    # Return noise
    return noise_drift
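A hypothetical call to _generate_noise_temporal_drift() above (the TR count and duration are assumptions): because of the final z-scoring, the returned drift always has mean close to 0 and standard deviation close to 1, whatever the period.

drift = _generate_noise_temporal_drift(trs=200, tr_duration=2.0, period=300)
print(drift.shape)                  # (200,)
print(drift.mean(), drift.std())    # approximately 0.0 and 1.0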
def _generate_noise_temporal_phys(timepoints,
                                  resp_freq=0.2,
                                  heart_freq=1.17,
                                  ):
    """Generate the physiological noise.

    Create noise representing the heart rate and respiration of the data.
    Default values based on Walvaert, Durnez, Moerkerke, Verdoolaege and
    Rosseel, 2011

    Parameters
    ----------

    timepoints : 1 Dimensional array
        What time points, in seconds, are sampled by a TR

    resp_freq : float
        What is the frequency of respiration

    heart_freq : float
        What is the frequency of heart beat

    Returns
    ----------

    noise_phys : one dimensional array, float
        Generates the physiological temporal noise timecourse

    """

    noise_phys = []  # Preset
    resp_phase = (np.random.rand(1) * 2 * np.pi)[0]
    heart_phase = (np.random.rand(1) * 2 * np.pi)[0]
    for tr_counter in timepoints:

        # Calculate the radians for each variable at this given TR
        resp_radians = resp_freq * tr_counter * 2 * np.pi + resp_phase
        heart_radians = heart_freq * tr_counter * 2 * np.pi + heart_phase

        # Combine the two types of noise and append
        noise_phys.append(np.cos(resp_radians) + np.sin(heart_radians))

    # Normalize
    noise_phys = stats.zscore(noise_phys)

    return noise_phys
def _logprob_obs(self, data, mean_pat, var):
    """Log probability of observing each timepoint under each event model

    Computes the log probability of each observed timepoint being
    generated by the Gaussian distribution for each event pattern

    Parameters
    ----------

    data: voxel by time ndarray
        fMRI data on which to compute log probabilities

    mean_pat: voxel by event ndarray
        Centers of the Gaussians for each event

    var: float or 1D array of length equal to the number of events
        Variance of the event Gaussians. If scalar, all events are
        assumed to have the same variance

    Returns
    -------

    logprob : time by event ndarray
        Log probability of each timepoint under each event Gaussian
    """

    n_vox = data.shape[0]
    t = data.shape[1]

    # z-score both data and mean patterns in space, so that Gaussians
    # are measuring Pearson correlations and are insensitive to overall
    # activity changes
    data_z = stats.zscore(data, axis=0, ddof=1)
    mean_pat_z = stats.zscore(mean_pat, axis=0, ddof=1)

    logprob = np.empty((t, self.n_events))

    if type(var) is not np.ndarray:
        var = var * np.ones(self.n_events)

    for k in range(self.n_events):
        logprob[:, k] = -0.5 * n_vox * np.log(
            2 * np.pi * var[k]) - 0.5 * np.sum(
            (data_z.T - mean_pat_z[:, k]).T ** 2, axis=0) / var[k]

    logprob /= n_vox

    return logprob
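The comment about Pearson correlations can be made concrete: for two vectors z-scored with ddof=1, the squared Euclidean distance is a linear function of their correlation, so the Gaussian above is effectively scoring correlation with the event pattern. A minimal sketch of that identity (the vectors are assumptions, not data from the model):

import numpy as np
from scipy.stats import zscore, pearsonr

rng = np.random.default_rng(0)
v, w = rng.random(50), rng.random(50)
vz, wz = zscore(v, ddof=1), zscore(w, ddof=1)
r = pearsonr(v, w)[0]
# sum((vz - wz)**2) == 2 * (n - 1) * (1 - r)
assert np.isclose(np.sum((vz - wz) ** 2), 2 * (len(v) - 1) * (1 - r))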
def calculate_ranks(game):
    logging.info("Calculating new ranks")
    players = Player.objects.all()
    ranks = list(Rank.objects.all())
    rank_distribution = OrderedDict()
    step = 6 / (len(ranks) - 2)
    for i in range(len(ranks) - 2):
        rank_distribution[-3 + i * step] = ranks[i]

    player_list = []
    for player in players:
        s1 = player.gameplayerrelation_set.filter(team=1).aggregate(Sum('game__team1_score'))[
            "game__team1_score__sum"] if not None else 0
        s2 = player.gameplayerrelation_set.filter(team=2).aggregate(Sum('game__team2_score'))[
            "game__team2_score__sum"] if not None else 0
        s1 = s1 if s1 else 0
        s2 = s2 if s2 else 0
        if s1 + s2 > 4:
            player_list.append(player)

    if not player_list:
        logging.debug("No players with four round victories")
        return

    scores = [player.score for player in player_list]
    if len(set(scores)) == 1:
        logging.debug("Only one player {} with rank".format(player_list[0]))
        z_scores = [0.0 for i in range(len(player_list))]
    else:
        z_scores = zscore(scores)

    logging.debug("Players: {}".format(player_list))
    logging.debug("Z_scores: {}".format(z_scores))

    for i in range(len(player_list)):
        player = player_list[i]
        if player not in list(game.players.all()):
            logging.debug("Not setting rank for %s because he didn't play", player)
            break
        player_z_score = z_scores[i]
        player = player_list[i]
        rank = None
        for required_z_score in rank_distribution.keys():
            if player_z_score > required_z_score:
                rank = rank_distribution[required_z_score]
            else:
                break
        logging.debug("Setting rank {} for {}".format(rank, player.name))
        player.rank = rank
        player.save()
def normalization_scaler(norm, data, row_processing):
    '''
    data: N*M np.array
        N: sample
        M: features
    data_T: M*N
    data_T_scale: scaler for column by column, M*N
    data_T_scale_T: N*M
    '''
    if data.size == 0:
        return data

    if '1' in norm:
        if row_processing:
            data_T = data.transpose()
            scaler = MinMaxScaler().fit_transform(StandardScaler().fit_transform(np.log(data + 1)))
            # data_T_scale = scaler.transform(data_T)
            return scaler.transpose()
        else:
            # scaler = StandardScaler().fit(data)
            scaler = MinMaxScaler().fit_transform(StandardScaler().fit_transform(np.log(data + 1)))
            return scaler
    elif '2' in norm:
        if row_processing:
            data_T = data.transpose()
            scaler = MinMaxScaler().fit(data_T)
            data_T_scale = scaler.transform(data_T)
            return data_T_scale.transpose()
        else:
            scaler = MinMaxScaler().fit(data)
            return scaler.transform(data)
    elif '3' in norm:
        if row_processing:
            # z-score each original row (i.e. each column of the transposed array)
            data_T = zscore(data.transpose(), axis=0)
            return data_T.transpose()
        else:
            # z-score column by column
            return zscore(data, axis=0)
    else:
        return data
def runClustering(cluster_df):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score as silhouette_score

    Xcols = [col for col in cluster_df.columns if 'NOTMODEL' not in col.upper()]

    # Convert character columns to dummy variables
    X = cluster_df[Xcols]
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    char_cols = list(set(cols) - set(num_cols))
    for col in char_cols:
        if len(X[col].unique()) <= 20:
            dummy = pd.get_dummies(X[col], prefix='dm' + col)
            column_name = X.columns.values.tolist()
            column_name.remove(col)
            X = X[column_name].join(dummy)
        else:
            if col in X.columns:  # If more than 20 distinct values then delete
                del X[col]

    # Standardize (Z-score normalize) all continuous variables
    from scipy.stats import zscore
    for col in X:
        if len(X[col].unique()) > 2:  # Standardize non-dummy variables
            col_zscore = 'z_' + col
            X[col_zscore] = zscore(X[col])
            del X[col]

    # Fill missing values with 0 = the mean in the z-normalized data
    # Obviously missing values can be handled in many different ways
    X.fillna(0, inplace=True)

    # Convert to matrix/numpy array to use in KMeans clustering class
    data_for_clustering_matrix = X.as_matrix()

    number_of_Clusters = []
    silhouette_value = []

    # Loop through 2 and 20 clusters and identify which has the highest silhouette score
    k = range(2, 21)
    for i in k:
        clustering_method = KMeans(n_clusters=i)
        clustering_method.fit(data_for_clustering_matrix)
        labels = clustering_method.predict(data_for_clustering_matrix)
        silhouette_average = silhouette_score(data_for_clustering_matrix, labels)
        silhouette_value.append(silhouette_average)
        number_of_Clusters.append(int(i))

    # maxind = np.argmax(silhouette_value)
    max_value = max(silhouette_value)
    indexMaxValue = silhouette_value.index(max_value)

    # FIT KMEANS CLUSTER MODEL WITH NUMBER OF CLUSTERS WITH HIGHEST SILHOUETTE SCORE
    clustering_method = KMeans(n_clusters=number_of_Clusters[indexMaxValue])
    clustering_method.fit(data_for_clustering_matrix)
    labels = clustering_method.predict(data_for_clustering_matrix)

    # SCORE THE DATAFRAME score_df
    cluster_df['cluster'] = labels
    return cluster_df