The following are 29 code examples, extracted from open source Python projects, that illustrate how to use scipy.stats.kurtosis().
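Before the project examples, here is a minimal, self-contained sketch of the call itself (the random sample is invented for illustration and does not come from any project below). By default scipy.stats.kurtosis returns Fisher's definition, i.e. excess kurtosis, so normally distributed data scores close to 0; fisher=False switches to Pearson's definition (normal data scores close to 3), and bias=False applies the statistical bias correction. Both variants appear in the examples that follow.

import numpy as np
from scipy import stats

# Illustrative data only: 10,000 draws from a standard normal distribution.
rng = np.random.default_rng(0)
sample = rng.normal(size=10000)

print(stats.kurtosis(sample))                # Fisher (excess) kurtosis, ~0 for normal data
print(stats.kurtosis(sample, fisher=False))  # Pearson kurtosis, ~3 for normal data
print(stats.kurtosis(sample, bias=False))    # bias-corrected estimator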
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values)
    }
    return agg_measures
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values)
    }
    return agg_measures
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis
    alt = lambda x: kurtosis(x, bias=False)
    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    s = Series(np.random.randn(6), index=index)
    self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar'])

    # test corner cases, kurt() returns NaN unless there's at least 4
    # values
    min_N = 4
    for i in range(1, min_N + 1):
        s = Series(np.ones(i))
        df = DataFrame(np.ones((i, i)))
        if i < min_N:
            self.assertTrue(np.isnan(s.kurt()))
            self.assertTrue(np.isnan(df.kurt()).all())
        else:
            self.assertEqual(0, s.kurt())
            self.assertTrue((df.kurt() == 0).all())
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis

    def alt(x):
        if len(x) < 4:
            return np.nan
        return kurtosis(x, bias=False)

    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    df = DataFrame(np.random.randn(6, 3), index=index)

    kurt = df.kurt()
    kurt2 = df.kurt(level=0).xs('bar')
    assert_series_equal(kurt, kurt2, check_names=False)
    self.assertTrue(kurt.name is None)
    self.assertEqual(kurt2.name, 'bar')
def plot_hist(item, figure_id=1):
    pt.figure(figure_id)
    kurtosis = -np.ones(8)
    for i in range(item.shape[1]):
        pt.subplot(2, 4, i + 1)  # was subplot(240+i); subplot indices are 1-based
        tmp = item[item[:, i] != -1, i]
        tmp = tmp + np.random.rand(len(tmp)) - 0.5
        pt.hist(tmp, bins=6, normed=True, range=(0.9, 6.1), alpha=0.8, color=colorc[i])
        pt.title(name[i])
        density = kde.gaussian_kde(tmp)
        xgrid = np.linspace(0, 6, 100)
        pt.plot(xgrid, density(xgrid), 'r-')
        avg = np.mean(tmp)
        sd = np.std(tmp)
        pt.plot(xgrid, normpdf(xgrid, avg, sd))
        pt.show()
        kurtosis[i] = sps.kurtosis(item[item[:, i] != -1, i])
    return kurtosis
def _statistics(self):
    data = self.tr.data
    t = np.arange(0, self.delta * self.npts, self.delta)
    m = len(data)
    Nsta = int(self.t_win * self.sampling_rate)

    # compute the short time average (STA)
    kt = np.zeros(m, dtype='float64')
    pad_kt = np.zeros(Nsta)
    # Tricky: Construct a big window of length len(a)-nsta. Now move this
    # window nsta points, i.e. the window "sees" every point in a at least
    # once.
    # Changed xrange to range as it is compatible in both python 2 & 3
    for i in range(m):
        # window size to smooth over
        kt[i] = abs(kurtosis(data[i - Nsta:i]))
    kt[0:Nsta] = 0

    return kt
def mfccPostProcess(directory, fileCount):
    for count in range(fileCount):
        print("{0}/{1}".format(count + 1, fileCount))
        for mfccext in mfccList:
            mfcc = np.loadtxt(directory + str(count) + mfccext + ".csv", delimiter=",")
            dmfcc = librosa.feature.delta(mfcc)
            result = np.zeros((mfcc.shape[1], 14))
            result[:, 0] = np.mean(mfcc, axis=0)
            result[:, 1] = np.var(mfcc, axis=0, dtype=np.float64)
            result[:, 2] = stats.skew(mfcc, axis=0)
            result[:, 3] = stats.kurtosis(mfcc, axis=0, fisher=False)
            result[:, 4] = np.median(mfcc, axis=0)
            result[:, 5] = np.min(mfcc, axis=0)
            result[:, 6] = np.max(mfcc, axis=0)
            result[:, 7] = np.mean(dmfcc, axis=0)
            result[:, 8] = np.var(dmfcc, axis=0, dtype=np.float64)
            result[:, 9] = stats.skew(dmfcc, axis=0)
            result[:, 10] = stats.kurtosis(dmfcc, axis=0, fisher=False)
            result[:, 11] = np.median(dmfcc, axis=0)
            result[:, 12] = np.min(dmfcc, axis=0)
            result[:, 13] = np.max(dmfcc, axis=0)
            result[np.where(np.isnan(result))] = 0
            np.savetxt(directory + str(count) + mfccext + "_stat.txt",
                       result.flatten("F"), delimiter=",")
def feat_eeg(signals):
    """
    calculate the relative power as defined by Leangkvist (2012),
    assuming the signal is recorded with 100hz
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)

    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    feats = np.zeros((signals.shape[0], 9), dtype='float32')
    # 5 features for the frequency bands
    w = (fft(signals, axis=1)).real
    delta = np.sum(np.abs(w[:, np.arange(0.5 * nsamp / sfreq, 4 * nsamp / sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4 * nsamp / sfreq, 8 * nsamp / sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8 * nsamp / sfreq, 13 * nsamp / sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13 * nsamp / sfreq, 20 * nsamp / sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20 * nsamp / sfreq, 50 * nsamp / sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    spindle = np.sum(np.abs(w[:, np.arange(12 * nsamp / sfreq, 14 * nsamp / sfreq, dtype=int)]), axis=1)
    sum_abs_pow = delta + theta + alpha + beta + gamma + spindle
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = spindle / sum_abs_pow
    feats[:, 6] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 7] = np.log10(-np.sum([(x / nsamp) * (np.log(x / nsamp))
                                    for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)],
                                   axis=1))  # entropy.. yay, one line...
    #feats[:,7] = np.polynomial.polynomial.polyfit(np.log(f[np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]), np.log(w[0,np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]),1)
    feats[:, 8] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq / 2 - 0.5)
    if np.any(np.isnan(feats)):  # was `feats == np.nan`, which is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_wavelet(signals):
    """
    calculate the relative power as defined by Leangkvist (2012),
    assuming the signal is recorded with 100hz
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)

    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    feats = np.zeros((signals.shape[0], 8), dtype='float32')
    # 5 features for the frequency bands
    w = (fft(signals, axis=1)).real
    delta = np.sum(np.abs(w[:, np.arange(0.5 * nsamp / sfreq, 4 * nsamp / sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4 * nsamp / sfreq, 8 * nsamp / sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8 * nsamp / sfreq, 13 * nsamp / sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13 * nsamp / sfreq, 20 * nsamp / sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20 * nsamp / sfreq, 50 * nsamp / sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 6] = np.log10(-np.sum([(x / nsamp) * (np.log(x / nsamp))
                                    for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)],
                                   axis=1))  # entropy.. yay, one line...
    #feats[:,7] = np.polynomial.polynomial.polyfit(np.log(f[np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]), np.log(w[0,np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]),1)
    feats[:, 7] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq / 2 - 0.5)
    if np.any(np.isnan(feats)):  # was `feats == np.nan`, which is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_eog(signals):
    """
    calculate the EOG features
    :param signals: 1D or 2D signals
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)

    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    w = (fft(signals, axis=1)).real
    feats = np.zeros((signals.shape[0], 15), dtype='float32')
    delta = np.sum(np.abs(w[:, np.arange(0.5 * nsamp / sfreq, 4 * nsamp / sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4 * nsamp / sfreq, 8 * nsamp / sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8 * nsamp / sfreq, 13 * nsamp / sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13 * nsamp / sfreq, 20 * nsamp / sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20 * nsamp / sfreq, 50 * nsamp / sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq / 2 - 0.5)  # smean
    feats[:, 6] = np.sqrt(np.max(signals, axis=1))  # PAV
    feats[:, 7] = np.sqrt(np.abs(np.min(signals, axis=1)))  # VAV
    feats[:, 8] = np.argmax(signals, axis=1) / nsamp  # PAP
    feats[:, 9] = np.argmin(signals, axis=1) / nsamp  # VAP
    feats[:, 10] = np.sqrt(np.sum(np.abs(signals), axis=1) / np.mean(np.sum(np.abs(signals), axis=1)))  # AUC
    feats[:, 11] = np.sum(((np.roll(np.sign(signals), 1, axis=1) - np.sign(signals)) != 0).astype(int), axis=1) / nsamp  # TVC
    feats[:, 12] = np.log10(np.std(signals, axis=1))  # STD/VAR
    feats[:, 13] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 14] = np.log10(-np.sum([(x / nsamp) * (np.log((x + np.spacing(1)) / nsamp))
                                     for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)],
                                    axis=1))  # entropy.. yay, one line...
    if np.any(np.isnan(feats)):  # was `feats == np.nan`, which is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_emg(signals):
    """
    calculate the EMG median as defined by Leangkvist (2012),
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)

    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    w = (fft(signals, axis=1)).real
    feats = np.zeros((signals.shape[0], 13), dtype='float32')
    delta = np.sum(np.abs(w[:, np.arange(0.5 * nsamp / sfreq, 4 * nsamp / sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4 * nsamp / sfreq, 8 * nsamp / sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8 * nsamp / sfreq, 13 * nsamp / sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13 * nsamp / sfreq, 20 * nsamp / sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20 * nsamp / sfreq, 50 * nsamp / sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq / 2 - 0.5)  # smean
    emg = np.sum(np.abs(w[:, np.arange(12.5 * nsamp / sfreq, 32 * nsamp / sfreq, dtype=int)]), axis=1)
    feats[:, 6] = emg / np.sum(np.abs(w[:, np.arange(8 * nsamp / sfreq, 32 * nsamp / sfreq, dtype=int)]), axis=1)  # ratio of high freq to total motor
    feats[:, 7] = np.median(np.abs(w[:, np.arange(8 * nsamp / sfreq, 32 * nsamp / sfreq, dtype=int)]), axis=1)  # median freq
    feats[:, 8] = np.mean(np.abs(w[:, np.arange(8 * nsamp / sfreq, 32 * nsamp / sfreq, dtype=int)]), axis=1)  # mean freq
    feats[:, 9] = np.std(signals, axis=1)  # std
    feats[:, 10] = np.mean(signals, axis=1)
    feats[:, 11] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))
    feats[:, 12] = np.log10(-np.sum([(x / nsamp) * (np.log((x + np.spacing(1)) / nsamp))
                                     for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)],
                                    axis=1))  # entropy.. yay, one line...
    if np.any(np.isnan(feats)):  # was `feats == np.nan`, which is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def __init__(self, s, lags=0, kurtosis='adapt', learningRate=1.5,
             tolerance=1.0e-6, maxIter=10000, callback=None, verbose=False,
             *args, **kwargs):
    STrans.__init__(self, s, lags=lags, *args, **kwargs)

    self.train(s, kurtosis=kurtosis, learningRate=learningRate,
               tolerance=tolerance, maxIter=maxIter,
               callback=callback, verbose=verbose)
def demoICA():
    t = np.linspace(0.0, 30 * np.pi, 1000)
    s1 = spsig.sawtooth(t)
    s2 = np.cos(5.0 * t)
    s3 = np.random.uniform(-1.0, 1.0, size=t.size)
    s = np.vstack((s1, s2, s3)).T

    m = np.random.random((3, 3))
    m /= m.sum(axis=0)
    sMixed = s.dot(m)

    icaFilt = ICA(sMixed, kurtosis='sub', verbose=True)

    fig = plt.figure()

    axOrig = fig.add_subplot(4, 1, 1)
    axOrig.plot(s + util.colsep(s))
    axOrig.set_title('Unmixed Signal')
    axOrig.autoscale(tight=True)

    axMixed = fig.add_subplot(4, 1, 2)
    axMixed.plot(sMixed + util.colsep(sMixed))
    axMixed.set_title('Mixed Signal (random transform)')
    axMixed.autoscale(tight=True)

    axUnmixed = fig.add_subplot(4, 1, 3)
    icaFilt.plotTransform(sMixed, ax=axUnmixed)
    axUnmixed.set_title('ICA Components')
    axUnmixed.autoscale(tight=True)

    axCleaned = fig.add_subplot(4, 1, 4)
    icaFilt.plotFilter(sMixed, comp=(0, 1,), ax=axCleaned)
    axCleaned.set_title('Cleaned Signal (First two components kept)')
    axCleaned.autoscale(tight=True)

    fig.tight_layout()
def test_rolling_kurt(self):
    try:
        from scipy.stats import kurtosis
    except ImportError:
        raise nose.SkipTest('no scipy')
    self._check_moment_func(mom.rolling_kurt,
                            lambda x: kurtosis(x, bias=False),
                            name='kurt')
def test_nankurt(self):
    tm.skip_if_no_package('scipy.stats')
    tm._skip_if_scipy_0_17()
    from scipy.stats import kurtosis
    func1 = partial(kurtosis, fisher=True)
    func = partial(self._skew_kurt_wrap, func=func1)
    self.check_funs(nanops.nankurt, func, allow_complex=False,
                    allow_str=False, allow_date=False, allow_tdelta=False)
def setUp(self):
    # Test data + kurtosis value (computed with scipy.stats.kurtosis)
    self.samples = np.sin(np.linspace(0, 1, 200))
    self.actual_kurt = -1.2058303433799713
def statistical_metrics(x):
    """
    Calculates statistical metrics on input array (mean, std, skew, kurtosis).
    """
    metrics = {
        'mean': np.mean,
        'stdev': np.std,
        'skew': stats.skew,
        'kurtosis': stats.kurtosis
    }
    return {k: fn(x.flatten()) for k, fn in metrics.items()}
def test_gen_usr_distrib(n_samples=100000, verbose=False):
    rng = np.random.RandomState(0)

    xs = _gen_usr_distrib(n_samples, ['laplace'], rng)
    assert_allclose(np.mean(xs), 0, atol=5e-2)
    assert_allclose(np.std(xs), 1, atol=5e-2)
    assert_allclose(skew(xs)[0], 0, atol=5e-2)
    assert_allclose(kurtosis(xs)[0], 3, atol=5e-2)

    xs = _gen_usr_distrib(n_samples, ['exp'], rng)
    assert_allclose(np.std(xs), 1, atol=5e-2)
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis=1)
    print('nones')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_nones')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'], 3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x: skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x: skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x: kurtosis(x))
    del df_features['question1_w2v']
    del df_features['question2_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a copy; assign it so the NaNs are actually filled
    return df_features
def _get_grid_size(data, use_default_square=False):
    """ Calculate the size of the grid.

    Parameters
    ----------
    data: array-like
        The normalized data.
    use_default_square: bool
        Define the grid as the minimal possible square.

    Returns
    -------
    int, int
        The width and height of the grid.
    """
    # if the grid would be square, this is the minimum size
    sqr_size = int(np.ceil(np.sqrt(len(data))))
    size_x = size_y = sqr_size
    if not use_default_square:
        kurt = kurtosis(data)
        kurt_x, kurt_y = np.int32(np.abs(np.ceil(kurt * 2)))
        size_x += kurt_x
        size_y += kurt_y
    return size_x, size_y
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]

    wmd = min(self.model.wmdistance(q1, q2), 10)

    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)

    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = -1
        city = -1
        jacc = -1
        canb = -1
        eucl = -1
        mink = -1
        bray = -1
        q1_skew = 0
        q2_skew = 0
        q1_kurt = 0
        q2_kurt = 0

    return wmd, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]

    wmd = min(self.model.wmdistance(q1, q2), 10)
    wmd_norm = min(self.model_norm.wmdistance(q1, q2), 10)

    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)

    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = -1
        city = -1
        jacc = -1
        canb = -1
        eucl = -1
        mink = -1
        bray = -1
        q1_skew = 0
        q2_skew = 0
        q1_kurt = 0
        q2_kurt = 0

    return wmd, wmd_norm, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
def lightcurve_moments(ftimes, fmags, ferrs):
    '''This calculates the weighted mean, stdev, median, MAD, percentiles,
    skew, kurtosis, fraction of LC beyond 1-stdev, and IQR.
    '''
    ndet = len(fmags)

    if ndet > 9:
        # now calculate the various things we need
        series_median = npmedian(fmags)
        series_wmean = (
            npsum(fmags * (1.0 / (ferrs * ferrs))) / npsum(1.0 / (ferrs * ferrs))
        )
        series_mad = npmedian(npabs(fmags - series_median))
        series_stdev = 1.483 * series_mad
        series_skew = spskew(fmags)
        series_kurtosis = spkurtosis(fmags)

        # get the beyond1std fraction
        series_above1std = len(fmags[fmags > (series_median + series_stdev)])
        series_below1std = len(fmags[fmags < (series_median - series_stdev)])

        # this is the fraction beyond 1 stdev
        series_beyond1std = (series_above1std + series_below1std) / float(ndet)

        # get the magnitude percentiles
        series_mag_percentiles = nppercentile(
            fmags,
            [5.0, 10, 17.5, 25, 32.5, 40, 60, 67.5, 75, 82.5, 90, 95]
        )

        return {
            'median': series_median,
            'wmean': series_wmean,
            'mad': series_mad,
            'stdev': series_stdev,
            'skew': series_skew,
            'kurtosis': series_kurtosis,
            'beyond1std': series_beyond1std,
            'mag_percentiles': series_mag_percentiles,
            'mag_iqr': series_mag_percentiles[8] - series_mag_percentiles[3],
        }

    else:
        LOGERROR('not enough detections in this magseries '
                 'to calculate light curve moments')
        return None
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.
    """

    logger.info("Creating SciPy Features")

    # Generate scipy features
    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    row_stn = sps.signaltonoise(base_features, axis=1)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features
    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    return sp_features


#
# Function create_clusters
#
def train(self, s, kurtosis, learningRate, tolerance, maxIter, callback, verbose):
    s = self.prep(s)

    wPrev = np.empty(self.w.shape)
    grad = np.empty((self.nComp, self.nComp))
    I = np.eye(self.nComp, dtype=self.dtype)
    n = 1.0 / s.shape[0]

    iteration = 0
    while True:
        y = s.dot(self.w)

        if kurtosis == 'sub':
            k = -1
        elif kurtosis == 'super':
            k = 1
        elif kurtosis == 'adapt':
            #k = np.sign(np.mean(1.0-util.fastTanh(y)**2, axis=0) *
            #            np.mean(y**2, axis=0) -
            #            np.mean(y*util.fastTanh(y), axis=0))
            k = np.sign(spstat.kurtosis(y, axis=0))
            k[np.isclose(k, 0.0)] = -1.0

        grad[...] = (I - k * util.fastTanh(y).T.dot(y) - y.T.dot(y)).T.dot(self.w) * n

        wPrev[...] = self.w
        self.w += learningRate * grad

        wtol = np.max(np.abs(wPrev - self.w))
        if verbose:
            print('%d %6f' % (iteration, wtol))  # was a Python 2 print statement

        if callback is not None:
            callback(iteration, wtol)

        if wtol < tolerance:
            self.reason = 'tolerance'
            break
        elif np.max(np.abs(self.w)) > 1.0e100:
            self.reason = 'diverge'
            break
        if iteration >= maxIter:
            self.reason = 'maxiter'
            break

        iteration += 1

    if verbose:
        print('Reason: ' + self.reason)

    self.w /= np.sqrt(np.sum(self.w ** 2, axis=0))
    self.wInv[...] = np.linalg.pinv(self.w)
def compute_basic_descriptives(df, selected_features):
    """
    Compute basic descriptive statistics for the columns
    in the given data frame.

    Parameters
    ----------
    df : pandas DataFrame
        Input data frame containing the feature values.
    selected_features : list of str
        List of feature names for which to compute the descriptives.

    Returns
    -------
    df_desc : pandas DataFrame
        Data frame containing the descriptives for each of the features.
    """

    # select only feature columns
    df_desc = df[selected_features]

    # get the H1 scores
    scores = df['sc1']

    # compute correlations and p-values separately for efficiency
    cor_series = df_desc.apply(lambda s: pearsonr(s, scores))
    cors = cor_series.apply(lambda t: t[0])
    pvalues = cor_series.apply(lambda t: t[1])

    # create a data frame with all the descriptives
    df_output = pd.DataFrame({'mean': df_desc.mean(),
                              'min': df_desc.min(),
                              'max': df_desc.max(),
                              'std. dev.': df_desc.std(),
                              'skewness': df_desc.skew(),
                              'kurtosis': df_desc.apply(lambda s: kurtosis(s, fisher=False)),
                              'Correlation': cors,
                              'p': pvalues,
                              'N': len(df_desc)})

    # reorder the columns to make it look better
    df_output = df_output[['mean', 'std. dev.', 'min', 'max',
                           'skewness', 'kurtosis', 'Correlation',
                           'p', 'N']]

    return df_output
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
    print('get_w2v')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x: skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x: skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x: kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a copy; assign it so the NaNs are actually filled
    return df_features
def _detect_artifacts(ica, raw, start_find, stop_find, ecg_ch, ecg_score_func,
                      ecg_criterion, eog_ch, eog_score_func, eog_criterion,
                      skew_criterion, kurt_criterion, var_criterion, add_nodes):
    """Aux Function"""
    from scipy import stats

    nodes = []
    if ecg_ch is not None:
        nodes += [_ica_node('ECG', ecg_ch, ecg_score_func, ecg_criterion)]

    if eog_ch not in [None, []]:
        if not isinstance(eog_ch, list):
            eog_ch = [eog_ch]
        for idx, ch in enumerate(eog_ch):
            nodes += [_ica_node('EOG %02d' % idx, ch, eog_score_func,
                                eog_criterion)]

    if skew_criterion is not None:
        nodes += [_ica_node('skewness', None, stats.skew, skew_criterion)]

    if kurt_criterion is not None:
        nodes += [_ica_node('kurtosis', None, stats.kurtosis, kurt_criterion)]

    if var_criterion is not None:
        nodes += [_ica_node('variance', None, np.var, var_criterion)]

    if add_nodes is not None:
        nodes.extend(add_nodes)

    for node in nodes:
        scores = ica.score_sources(raw, start=start_find, stop=stop_find,
                                   target=node.target,
                                   score_func=node.score_func)
        if isinstance(node.criterion, float):
            found = list(np.where(np.abs(scores) > node.criterion)[0])
        else:
            found = list(np.atleast_1d(abs(scores).argsort()[node.criterion]))

        case = (len(found), 's' if len(found) > 1 else '', node.name)
        logger.info('    found %s artifact%s by %s' % case)
        ica.exclude += found

    logger.info('Artifact indices found:\n    ' + str(ica.exclude).strip('[]'))
    if len(set(ica.exclude)) != len(ica.exclude):
        logger.info('    Removing duplicate indices...')
        ica.exclude = list(set(ica.exclude))

    logger.info('Ready.')
def get_feature_stats(self):
    # get input feature
    feature_input = self.feature_input.currentText()
    try:
        if feature_input[0] == 'X':
            try:
                feature_index = int("".join(feature_input[1:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format",
                                                  "Please enter a feature name in the format: X%d.")
                return
        elif "".join(feature_input[0] + feature_input[1]) == 'LD' or \
                "".join(feature_input[0] + feature_input[1]) == 'PC':
            try:
                feature_index = int("".join(feature_input[2:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format",
                                                  "Please enter a feature name in the format: X||LD||PC%d.")
                return
        else:
            QtWidgets.QMessageBox.information(self, "Wrong Format",
                                              "Feature names must be in the format: X%d.")
            return
    except:
        QtWidgets.QMessageBox.information(self, "Data Not Found",
                                          "Please load a dataset first.")
        return

    try:
        max_value = self.X[:, feature_index].max()
        min_value = self.X[:, feature_index].min()
        mean_value = self.X[:, feature_index].mean()
        std_value = self.X[:, feature_index].std()
        var_value = self.X[:, feature_index].var()
        skewness = stats.skew(self.X[:, feature_index])
        kurtosis = stats.kurtosis(self.X[:, feature_index], fisher=True)
        chi2, chi_p_val = chi2_feature_test(self.X, self.y, int(feature_index))
        H_kw, kw_p_val = kw_feature_test(self.X, self.y, int(feature_index))
        info_gain = information_gain(self.X, self.y, int(feature_index))
        gain_rt = gain_ratio(self.X, self.y, int(feature_index))
    except:
        QtWidgets.QMessageBox.information(self, "Wrong Index",
                                          "Feature Index Out Of Bounds.")
        return

    feature_stats = """Statistics:\n\nMinimum Value: """ + str(min_value) \
        + """\n\nMaximum Value: """ + str(max_value) \
        + """\n\nMean: """ + str(mean_value) \
        + """\n\nStandard Deviation: """ + str(std_value) \
        + """\n\nVariance: """ + str(var_value) \
        + """\n\nSkewness: """ + str(skewness) \
        + """\n\nKurtosis: """ + str(kurtosis) \
        + """\n\nChi Squared Test: """ + str(chi2[0]) \
        + """\n\nKruskal-Wallis Test: """ + str(H_kw) \
        + """\n\nInformation Gain: """ + str(info_gain) \
        + """\n\nGain Ratio: """ + str(gain_rt)

    self.feature_stats.setText(feature_stats)