We extracted the following 26 code examples from open-source Python projects to illustrate how to use scipy.stats.gmean().
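As a quick orientation before the project code, here is a minimal sketch (not taken from any of the projects below) of what scipy.stats.gmean computes: the n-th root of the product of n positive values, optionally along an axis of an array.

import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 4.0])
print(stats.gmean(x))          # 2.0, since (1 * 2 * 4) ** (1/3) == 2

m = np.array([[1.0, 2.0, 4.0],
              [3.0, 3.0, 3.0]])
print(stats.gmean(m, axis=1))  # [2. 3.], one geometric mean per row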
Example 1:

def calculate_features_for_VAD(sound_frames, frequencies_axis, spectrogram):
    features = numpy.empty((spectrogram.shape[0], 3))
    # smooted_spectrogram, smoothed_frequencies_axis = smooth_spectrogram(spectrogram, frequencies_axis, 24)
    for time_ind in range(spectrogram.shape[0]):
        mean_spectrum = spectrogram[time_ind].mean()
        if mean_spectrum > 0.0:
            sfm = -10.0 * math.log10(stats.gmean(spectrogram[time_ind]) / mean_spectrum)
        else:
            sfm = 0.0
        # max_freq = smoothed_frequencies_axis[smooted_spectrogram[time_ind].argmax()]
        max_freq = frequencies_axis[spectrogram[time_ind].argmax()]
        features[time_ind][0] = numpy.square(sound_frames[time_ind]).mean()
        features[time_ind][1] = sfm
        features[time_ind][2] = max_freq
    """medfilt_order = 3
    for feature_ind in range(features.shape[0]):
        features[feature_ind] = signal.medfilt(features[feature_ind], medfilt_order)"""
    return features
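The sfm feature above is a spectral flatness measure: the ratio of the geometric mean to the arithmetic mean of a spectrum is 1 for a perfectly flat, noise-like spectrum and tends toward 0 as the energy concentrates in a few bins; the code simply expresses this ratio in decibels. A toy illustration (the spectra here are made up):

import numpy as np
from scipy import stats

flat = np.ones(8)                     # equal energy in every frequency bin
tonal = np.array([1e-3] * 7 + [1.0])  # energy concentrated in one bin

for spectrum in (flat, tonal):
    sfm = -10.0 * np.log10(stats.gmean(spectrum) / spectrum.mean())
    print(sfm)  # ~0 dB for the flat spectrum, roughly 17 dB for the tonal one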
Example 2:

def get_statistical_property_scores_per_input_per_impl(self, func: StatisticalPropertyFunc,
                                                       input_num: int,
                                                       reduce: ReduceFunc = stats.gmean) -> t.Dict[str, t.List[float]]:
    """
    Assumptions:
    - Most programs have the same number of inputs (known as max input number)
    - The input number n takes roughly the same amount of time for every program category
    """
    cats = self._get_categories_for_number_of_inputs(self.get_max_input_num())
    scores_per_impl = InsertionTimeOrderedDict()
    for cat in cats:
        scores = cat.get_statistical_property_scores_per_input_per_impl(func, cat.get_input_strs()[input_num])
        for impl in scores:
            if impl not in scores_per_impl:
                scores_per_impl[impl] = []
            scores_per_impl[impl].append(reduce(scores[impl]))
    return scores_per_impl
Example 3:

def make_submission():
    clfs = train_xgboost()
    df = pd.read_csv('data/stage2_sample_submission.csv')
    x = np.array([np.mean(np.load('data/stage2/%s.npy' % str(did)).reshape(-1, 2048), axis=0)
                  for did in df['id'].tolist()])
    preds = []
    for clf in clfs:
        preds.append(np.clip(clf.predict(x), 0.001, 1))
    pred = gmean(np.array(preds), axis=0)
    # print pred
    df['cancer'] = pred
    df.to_csv('subm.csv', index=False)
    # print df.head()
Example 4:

def make_ensemble(series, name, use_gmean=True):
    '''series: list of pandas.Series
    name: string'''
    preds = pd.concat(series, axis=1)
    if use_gmean:
        ens = preds.apply(lambda x: sc.gmean(x), axis=1)
    else:
        ens = preds.apply(lambda x: np.mean(x), axis=1)  # an alternative
    ens = pd.Series(ens, index=series[0].index, name='probability')
    ens.to_csv(name + '_ens.csv', index_label='t_id', header=True)
Example 5:

def used_summarize_mean(values: t.List[float]) -> float:
    if CALC_MODE in [Mode.geom_mean_rel_to_best, Mode.mean_rel_to_one]:
        return stats.gmean(values)
    elif CALC_MODE in [Mode.mean_rel_to_first]:
        return sp.mean(values)
    assert False
Example 6:

def get_geom_over_rel_means(self) -> t.Dict[str, float]:
    return self.get_reduced_x_per_impl(used_rel_mean_property, stats.gmean)
Example 7:

def get_geom_over_rel_stds(self) -> t.Dict[str, float]:
    return self.get_reduced_x_per_impl(rel_std_property, stats.gmean)
Example 8:

def get_html(self, base_file_name: str, h_level: int) -> str:
    html = """
        <h{}>Program: {!r} ({} lines, {} entropy)</h{}>
        The following plot shows the mean score per input distribution for every implementation.
    """.format(h_level, self.name, self.line_number, self.entropy, h_level)
    html += self.get_box_plot_html(base_file_name)
    scores = self.get_impl_mean_scores()
    std_devs = self.get_statistical_property_scores(rel_std_dev_func)
    html += """
        <table class="table">
            <tr><th>implementation</th><th>geom mean over means relative to best (per input) aka mean score</th>
                <th>... std dev rel. to the best mean</th>
            </tr>
    """
    for impl in scores.keys():
        html += """
            <tr><td>{}</td><td>{:5.2%}</td><td>{:5.2%}</td></tr>
        """.format(impl, stats.gmean(scores[impl]), stats.gmean(std_devs[impl]))
    html += "</table>"
    impl_names = list(scores.keys())
    for (i, input) in enumerate(self.prog_inputs.keys()):
        app = html_escape_property(input)
        if len(app) > 20:
            app = str(i)
        html += self.prog_inputs[input].get_html(base_file_name + "_" + app, h_level + 1)
    return html
Example 9:

def get_statistical_property_scores(self, func: StatisticalPropertyFunc,
                                    reduce: ReduceFunc = stats.gmean) -> t.Dict[str, float]:
    ret = InsertionTimeOrderedDict()
    scores_per_impl = self.get_statistical_property_scores_per_impl(func)
    for impl in scores_per_impl:
        ret[impl] = reduce(scores_per_impl[impl])
    return ret
Example 10:

def geom_std(values: t.List[float]) -> float:
    """
    Calculates the geometric standard deviation for the passed values.
    Source: https://en.wikipedia.org/wiki/Geometric_standard_deviation
    """
    import scipy.stats as stats
    import scipy as sp
    gmean = stats.gmean(values)
    return sp.exp(sp.sqrt(sp.sum([sp.log(x / gmean) ** 2 for x in values]) / len(values)))
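Because the arithmetic mean of the logged values equals the log of their geometric mean, the formula above is just the exponentiated (population) standard deviation of the logs. A minimal sketch of that equivalence in plain NumPy (the sp.exp/sp.sqrt-style aliases used above were deprecated and later removed from the top-level scipy namespace):

import numpy as np
from scipy import stats

values = [1.0, 2.0, 8.0]
g = stats.gmean(values)
explicit = np.exp(np.sqrt(np.sum([np.log(x / g) ** 2 for x in values]) / len(values)))
compact = np.exp(np.std(np.log(values)))  # np.std defaults to ddof=0, matching the /len(values) above
print(explicit, compact)                  # both print the same geometric standard deviation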
Example 11:

def first_rel_to_second(self) -> float:
    """
    Calculates the geometric mean of the first means relative to the second means.
    See http://www.cse.unsw.edu.au/~cs9242/15/papers/Fleming_Wallace_86.pdf
    """
    return st.gmean([x.first_rel_to_second() for x in self.properties.values()])
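The Fleming & Wallace paper cited in the docstring argues that normalized benchmark results should be summarized with a geometric mean, because it is the only mean for which the summary of the ratios equals the ratio of the summaries. A small sketch of that property with toy numbers (not from the project):

import numpy as np
from scipy import stats

a = np.array([2.0, 10.0])  # e.g. runtimes of system A on two benchmarks
b = np.array([4.0, 5.0])   # runtimes of system B on the same benchmarks

# gmean(a / b) == gmean(a) / gmean(b); the arithmetic mean lacks this property
# (here mean(a / b) is 1.25 while mean(a) / mean(b) is about 1.33).
print(stats.gmean(a / b), stats.gmean(a) / stats.gmean(b))  # 1.0 1.0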
Example 12:

def sparsity_penalty_terms(self, P, j=None):
    log.debug(" > sparsing P")
    if self.do_sparsity_constraint:
        den_sprs_P = self.rho * gmean(P + EPS, axis=-1)[:, :, None] \
            / (P * (np.sum(P, axis=-1)[:, :, None]) + EPS)
        num_sprs_P = self.rho * ((self.J * gmean(P + EPS, axis=-1))
                                 / ((np.sum(P, axis=-1)) ** 2 + EPS))[:, :, None]
    else:
        num_sprs_P = np.zeros(self.J)[None, None, :]
        den_sprs_P = np.zeros(self.J)[None, None, :]
    if j is not None:
        return (num_sprs_P[..., 0], den_sprs_P[..., j])
    else:
        return (num_sprs_P, den_sprs_P)
Example 13:

def sparsity_cost(self, P):
    if self.do_sparsity_constraint:
        return self.rho * np.sum(gmean(P + EPS, axis=2) / (np.mean(P, axis=2) + EPS))
    else:
        return 0
Example 14:

def score(self, X, y, method='geometric', threshold=0.5):
    probs = self.predict_proba(X, y)
    if method == 'geometric':
        mean_probs = gmean(probs, axis=1)
        guesses = [int(x >= threshold) for x in mean_probs]
        acc = np.true_divide(np.sum(guesses == y), len(y))
        return acc

# predicting results with the test data
Example 15:

def predict(self, X, y, method='geometric', threshold=0.5):
    probs = self.predict_proba(X, y)
    if method == 'geometric':
        mean_probs = gmean(probs, axis=1)
        guesses = [int(x >= threshold) for x in mean_probs]
        return np.array(guesses)

# gets the predicted probabilities of the test data
Example 16:

def predict_proba(self, X, y, mean=False):
    probs = pd.DataFrame(np.zeros([X.shape[0], len(self.mods)]))
    probs.columns = self.mods.keys()
    for i in range(len(self.mods)):
        # list(...) keeps this working on Python 3, where dict views are not indexable
        if list(self.mods.keys())[i] != 'nbsvm':
            probs.iloc[:, i] = list(self.mods.values())[i].predict_proba(X)[:, 1]
        else:
            probs.iloc[:, i] = self.mods['nbsvm'].predict_proba(X, y)
    # probs[probs == 0] = 0.0000001
    if mean:
        return gmean(probs, axis=1)
    else:
        return probs
Example 17:

def check_coarsening_method(methods):
    accepted_methods = ['min', 'max', 'amean', 'hmean', 'gmean', 'median']
    if methods is not None:
        for method in methods:
            if method not in accepted_methods:
                raise ValueError(' Coarsening method {0} is not implemented..'
                                 '\n Use these: {1}'.format(method, accepted_methods))
        return methods
    else:
        return accepted_methods
Example 18:

def gmean(self):
    """Returns the gmean of the models' predictions.

    Returns
    -------
    `PipeApply`
    """
    # `gmean` inside the lambda resolves to the module-level scipy.stats.gmean
    # import, not to this method
    return self.apply(lambda x: gmean(x, axis=0))
Example 19:

def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.
    """
    logger.info("Creating SciPy Features")

    # Generate scipy features
    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    row_stn = sps.signaltonoise(base_features, axis=1)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features
    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    return sp_features

#
# Function create_clusters
#
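Two caveats worth noting for this example. First, scipy.stats.gmean averages logarithms, so it is only well defined for positive inputs: a zero anywhere in a row collapses the result to 0.0, and a negative value yields nan (which the NaN imputation above would presumably then replace with the sentinel). Second, sps.signaltonoise was removed in SciPy 1.0, so this function requires an older SciPy release. A quick illustration of the gmean behavior:

from scipy import stats

print(stats.gmean([1.0, 2.0, 4.0]))   # 2.0
print(stats.gmean([1.0, 0.0, 4.0]))   # 0.0: log(0) = -inf drags the mean of logs to -inf
print(stats.gmean([1.0, -2.0, 4.0]))  # nan (log of a negative number), with a RuntimeWarning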
Example 20:

def get_html(self, base_file_name: str, h_level: int) -> str:
    html = """
        <h{}>{}</h{}>
    """.format(h_level, self.name, h_level)
    scores = self.get_impl_mean_scores()
    std_devs = self.get_statistical_property_scores(rel_std_dev_func)
    if len(self.programs) > 1:
        html += """
            Mean scores per implementation for this program category
            <p>
        """
        html += self.get_box_plot_html(base_file_name)
        html += """
            </p>
            <table class="table">
                <tr><th>implementation</th><th>geom mean over means relative to best (per input and program) aka mean score</th>
                    <th>... std devs relative to the best means</th>
                </tr>
        """
        for impl in scores.keys():
            html += """
                <tr><td>{}</td><td>{:5.2%}</td><td>{:5.2%}</td></tr>
            """.format(impl, scores[impl], std_devs[impl])
        html += "</table>"
    if len(self.get_input_strs()) > 1:
        html += """
            <h{h}> Mean scores per input</h{h}>
        """.format(h=h_level + 1)
        for input in self.get_input_strs():
            mean_scores = self.get_statistical_property_scores_per_input_per_impl(rel_mean_func, input)
            std_scores = self.get_statistical_property_scores_per_input_per_impl(rel_std_dev_func, input)
            html += """
                <h{h}>Mean scores for input {!r}</h{h}>
                The plot shows the distribution of mean scores per program for each implementation.
                <p>
            """.format(input, h=h_level + 2)
            html += self.get_box_plot_per_input_per_impl_html(base_file_name, input)
            html += """
                </p>
                <table class="table">
                    <tr><th>impl</th><th>geom mean over means relative to best (per program) aka mean score</th>
                        <th>... std devs relative to the best means</th>
                    </tr>
            """
            for impl in mean_scores.keys():
                html += """
                    <tr><td>{}</td><td>{:5.2%}</td><td>{:5.2%}</td></tr>
                """.format(impl, stats.gmean(mean_scores[impl]), stats.gmean(std_scores[impl]))
            html += "</table>"
    impl_names = list(scores.keys())
    for (i, prog) in enumerate(self.programs):
        html += self.programs[prog].get_html(base_file_name + "_" + html_escape_property(prog), h_level + 1)
    return html
Example 21:

def _PerformDataCoarsening(self, Chrom, resolution, coarsening_method):
    """Base method to perform data coarsening.

    This method reads temporary Numpy array files and performs data
    coarsening using the given input method.

    .. warning::
        **Private method**. Use it at your own risk. It is used internally
        in :meth:`WigHandler._StoreInHdf5File`.

    Parameters
    ----------
    Chrom : str
        Chromosome name
    resolution : str
        resolution in word.
    coarsening_method : str
        Name of method to use for data coarsening. Accepted keywords:
        min, max, median, amean, gmean and hmean.
    """
    output = []
    binsize = util.resolutionToBinsize(resolution)
    size = self.chromSizeInfo[Chrom] + 1

    for i in range(1, size, binsize):
        tmpx = None
        if i + binsize >= size:
            tmpx = self.tmpNumpyArrayFiles.arrays[Chrom][i:size]
        else:
            tmpx = self.tmpNumpyArrayFiles.arrays[Chrom][i:i + binsize]

        int_idx = np.nonzero(tmpx > 0)
        if int_idx[0].shape[0] == 0:
            output.append(0.0)
            continue

        # print(Chrom, tmpx.shape, i, i+binsize, tmpx)
        if coarsening_method == 'max':
            output.append(np.amax(tmpx[int_idx]))
        if coarsening_method == 'min':
            output.append(np.amin(tmpx[int_idx]))
        if coarsening_method == 'amean':
            output.append(np.mean(tmpx[int_idx]))
        if coarsening_method == 'hmean':
            output.append(spstats.hmean(tmpx[int_idx]))
        if coarsening_method == 'gmean':
            output.append(spstats.gmean(tmpx[int_idx]))
        if coarsening_method == 'median':
            output.append(np.median(tmpx[int_idx]))

    # print(Chrom, resolution, coarse_method, size, binsize, size/binsize, len(output), np.amax(output))
    return np.asarray(output)
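This coarsening pattern (repeated in the BED handler further below) slices a 1-base signal into fixed-size bins and reduces each bin with the chosen statistic over its positive entries. For the gmean case, the core of that loop can be sketched as follows (toy signal and bin size, not the project's API):

import numpy as np
from scipy import stats

signal = np.arange(1.0, 13.0)  # twelve 1-base coverage values (toy data)
binsize = 4

# One geometric mean per bin, skipping non-positive entries as the method above does.
coarse = [stats.gmean(chunk[chunk > 0]) if (chunk > 0).any() else 0.0
          for chunk in signal.reshape(-1, binsize)]
print(coarse)  # three coarsened values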
def saveAsH5(self, hdf5Out, title=None, resolutions=None, coarsening_methods=None, compression='lzf', keep_original=False): """To convert Wig files to hdf5 file Parameters ---------- hdf5Out : :class:`HDF5Handler` or str Output hdf5 file name or :class:`HDF5Handler` instance title : str Title of the data resolutions : list of str Additional input resolutions other than these default resolutions: 1kb', '2kb', '4kb', '5kb', '8kb', '10kb', '20kb', '40kb', '80kb', '100kb', '160kb','200kb', '320kb', '500kb', '640kb', and '1mb'. For Example: use ``resolutions=['25kb', '50kb', '75kb']`` to add additional 25kb, 50kb and 75kb resolution data. coarsening_methods : list of str Methods to coarse or downsample the data for converting from 1-base to coarser resolutions. Presently, five methods are implemented. * ``'min'`` -> Minimum value * ``'max'`` -> Maximum value * ``'amean'`` -> Arithmetic mean or average * ``'hmean'`` -> Harmonic mean * ``'gmean'`` -> Geometric mean * ``'median'`` -> Median In case of ``None``, all five methods will be considered. User may use only subset of these methods. For example: ``coarse_method=['max', 'amean']`` can be used for downsampling by only these two methods. compression : str data compression method in HDF5 file : ``lzf`` or ``gzip`` method. keep_original : bool Whether original data present in bigwig file should be incorporated in HDF5 file. This will significantly increase size of HDF5 file. """ if not self.isWigParsed: self.parseWig() # Storing data in hdf5 file self._StoreInHdf5File(hdf5Out, title, compression=compression, coarsening_methods=coarsening_methods, resolutions=resolutions, keep_original=keep_original)
Example 23:

def _PerformDataCoarsening(self, Chrom, resolution, coarse_method):
    """Base method to perform data coarsening.

    This method reads temporary Numpy array files and performs data
    coarsening using the given input method.

    .. warning::
        **Private method**. Use it at your own risk. It is used internally
        in :meth:`BEDHandler._StoreInHdf5File`.

    Parameters
    ----------
    Chrom : str
        Chromosome name
    resolution : str
        resolution in word.
    coarse_method : str
        Name of method to use for data coarsening. Accepted keywords:
        min, max, median, amean, gmean and hmean.
    """
    output = []
    binsize = util.resolutionToBinsize(resolution)
    size = self.chromSizeInfo[Chrom] + 1

    for i in range(1, size, binsize):
        tmpx = None
        if i + binsize >= size:
            tmpx = self.tmpNumpyArrayFiles.arrays[Chrom][i:size]
        else:
            tmpx = self.tmpNumpyArrayFiles.arrays[Chrom][i:i + binsize]

        int_idx = np.nonzero(tmpx > 0)
        if int_idx[0].shape[0] == 0:
            output.append(0.0)
            continue

        # print(Chrom, tmpx.shape, i, i+binsize, tmpx)
        if coarse_method == 'max':
            output.append(np.amax(tmpx[int_idx]))
        if coarse_method == 'min':
            output.append(np.amin(tmpx[int_idx]))
        if coarse_method == 'amean':
            output.append(np.mean(tmpx[int_idx]))
        if coarse_method == 'hmean':
            output.append(spstats.hmean(tmpx[int_idx]))
        if coarse_method == 'gmean':
            output.append(spstats.gmean(tmpx[int_idx]))
        if coarse_method == 'median':
            output.append(np.median(tmpx[int_idx]))

    # print(Chrom, resolution, coarse_method, size, binsize, size/binsize, len(output), np.amax(output))
    return np.asarray(output)
def saveAsH5(self, hdf5Out, title=None, resolutions=None, coarsening_methods=None, compression='lzf', keep_original=False): """To convert bed files to hdf5 file It parses bed files, coarsened the data and store in an input hdf5/h5 file. Parameters ---------- hdf5Out : :class:`HDF5Handler` or str Output hdf5 file name or :class:`HDF5Handler` instance title : str Title of the data resolutions : list of str Additional input resolutions other than these default resolutions: 1kb', '2kb', '4kb', '5kb', '8kb', '10kb', '20kb', '40kb', '80kb', '100kb', '160kb','200kb', '320kb', '500kb', '640kb', and '1mb'. For Example: use ``resolutions=['25kb', '50kb', '75kb']`` to add additional 25kb, 50kb and 75kb resolution data. coarsening_methods : list of str Methods to coarse or downsample the data for converting from 1-base to coarser resolutions. Presently, five methods are implemented. * ``'min'`` -> Minimum value * ``'max'`` -> Maximum value * ``'amean'`` -> Arithmetic mean or average * ``'hmean'`` -> Harmonic mean * ``'gmean'`` -> Geometric mean * ``'median'`` -> Median In case of ``None``, all five methods will be considered. User may use only subset of these methods. For example: ``coarse_method=['max', 'amean']`` can be used for downsampling by only these two methods. compression : str data compression method in HDF5 file : ``lzf`` or ``gzip`` method. keep_original : bool Whether original data present in bigwig file should be incorporated in HDF5 file. This will significantly increase size of HDF5 file. """ if not self.isBedParsed: self.parseBed() # Storing data in hdf5 file self._StoreInHdf5File(hdf5Out, title, resolutions=resolutions, coarsening_methods=coarsening_methods, compression=compression, keep_original=keep_original)
Example 25:

def predict_fold(nfold, versn='rnx50_4', dev=mx.gpu(), datas='val'):
    mods = get_models(versn, nfold, dev)
    if not os.path.exists(f'dump/{versn}'):
        os.mkdir(f'dump/{versn}')
    if datas == 'val':
        df = pd.read_csv(f'../../_data/fold{nfold}/val.csv')
        paths = np.array([fn.replace('code/amazon_from_space/_data', 'dataset/amazon')
                          for fn in df.iloc[:, 0].tolist()])
    else:
        paths = sorted(glob.glob('../../../../dataset/amazon/test-jpg/*'))
    val_dataset = CSVDataset_tst(paths)
    val = data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=8)
    centr = []
    plt.figure()
    buf = np.zeros((3, 17))
    out = []
    gen = enumerate(val, 0)
    gen = tqdm(gen, total=len(val))
    for j, val_data in gen:
        inputs, labels = val_data
        inputs = inputs.numpy()
        for i, mod in enumerate(mods):
            mod.forward(Batch([mx.nd.array(inputs[0])]))
            y_prd = mod.get_outputs()[0].asnumpy()
            if i == 0:
                centr.append(y_prd[0])
            y_prd = gmean(y_prd, axis=0)
            buf[i] = y_prd
        a = gmean(buf, axis=0)
        out.append(a)
    np.save(f'dump/{versn}/{datas}_{versn}_{nfold}_21crop', np.array(out))
    np.save(f'dump/{versn}/{datas}_{versn}_{nfold}_1crop', np.array(centr))
    np.save(f'dump/{versn}/{datas}_{versn}_{nfold}_fns', np.array(paths))
Example 26:

def predict_fold(nfold, versn='rnx50_4', dev=mx.gpu(), datas='val'):
    if not os.path.exists(f'dump/{versn}'):
        os.mkdir(f'dump/{versn}')
    mods = get_models(versn, nfold, dev)
    if datas == 'val':
        df = pd.read_csv(f'../../_data/fold{nfold}/val.csv')
        paths = np.array([fn.replace('code/amazon_from_space/_data', 'dataset/amazon')
                          for fn in df.iloc[:, 0].tolist()])
    else:
        paths = sorted(glob.glob('../../../../dataset/amazon/test-jpg/*'))
    val_dataset = CSVDataset_tst(paths)
    val = data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=4)
    centr = []
    plt.figure()
    buf = np.zeros((3, 17))
    out = []
    gen = enumerate(val, 0)
    gen = tqdm(gen, total=len(val))
    for j, val_data in gen:
        inputs, labels = val_data
        inputs = inputs.numpy()
        for i, mod in enumerate(mods):
            mod.forward(Batch([mx.nd.array(inputs[0])]))
            y_prd = mod.get_outputs()[0].asnumpy()
            if i == 0:
                centr.append(y_prd[0])
            y_prd = gmean(y_prd, axis=0)
            buf[i] = y_prd
        a = gmean(buf, axis=0)
        out.append(a)
    np.save(f'dump/{versn}/{datas}_{versn}_{nfold}_21crop', np.array(out))
    np.save(f'dump/{versn}/{datas}_{versn}_{nfold}_1crop', np.array(centr))
    np.save(f'dump/{versn}/{datas}_{versn}_{nfold}_fns', np.array(paths))