我们从Python开源项目中,提取了以下42个代码示例,用于说明如何使用scipy.stats.skew()。
def data_preprocess(train, test, outlier_idx=None, skew_threshold=0.75):
    """Turn raw train/test frames into model-ready matrices.

    Steps, in order: drop outlier rows from ``train``; stack the
    ``'MSSubClass':'SaleCondition'`` column slice of both frames; drop five
    columns (``Alley``, ``FireplaceQu``, ``PoolQC``, ``Fence``,
    ``MiscFeature``); log1p-transform the target and every numeric feature
    whose skewness on the training rows exceeds ``skew_threshold``; one-hot
    encode; fill remaining NaNs with the column mean.

    Parameters
    ----------
    train, test : pandas.DataFrame
        ``train`` must contain ``SalePrice``. NOTE: ``train`` is modified in
        place (outlier rows are dropped and ``SalePrice`` is replaced by its
        log1p).
    outlier_idx : sequence of int, optional
        Positional row indices to drop from ``train``. Defaults to the
        original hard-coded list, which needs at least 1448 training rows.
    skew_threshold : float, optional
        Features with training skewness above this value are log1p-transformed.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
    y : pandas.Series
        log1p of the training ``SalePrice``.
    """
    if outlier_idx is None:
        outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224,
                       261, 309, 313, 318, 349, 412, 423, 440, 454, 477, 478,
                       523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875,
                       898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239, 1256,
                       1298, 1324, 1353, 1359, 1405, 1442, 1447]
    # dtype=int keeps an empty selection valid as a positional indexer
    train.drop(train.index[np.asarray(outlier_idx, dtype=int)], inplace=True)

    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])

    # log transform skewed numeric features (skewness measured on train only)
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewness = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewness[skewness > skew_threshold].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())

    n_train = train.shape[0]
    X_train = all_data[:n_train]
    X_test = all_data[n_train:]
    y = train.SalePrice
    return X_train, X_test, y
def data_preprocess(train, test, outlier_idx=None, skew_threshold=0.75):
    """Turn raw train/test frames into model-ready matrices (forward-fill variant).

    Steps, in order: drop outlier rows from ``train``; stack the
    ``'MSSubClass':'SaleCondition'`` column slice of both frames; drop five
    columns (``Alley``, ``FireplaceQu``, ``PoolQC``, ``Fence``,
    ``MiscFeature``); log1p-transform the target and every numeric feature
    whose skewness on the training rows exceeds ``skew_threshold``; one-hot
    encode; forward-fill remaining NaNs.

    Parameters
    ----------
    train, test : pandas.DataFrame
        ``train`` must contain ``SalePrice``. NOTE: ``train`` is modified in
        place (outlier rows are dropped and ``SalePrice`` is replaced by its
        log1p).
    outlier_idx : sequence of int, optional
        Positional row indices to drop from ``train``. Defaults to the
        original hard-coded list, which needs at least 1448 training rows.
    skew_threshold : float, optional
        Features with training skewness above this value are log1p-transformed.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
    y : pandas.Series
        log1p of the training ``SalePrice``.
    """
    if outlier_idx is None:
        outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224,
                       261, 309, 313, 318, 349, 412, 423, 440, 454, 477, 478,
                       523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875,
                       898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239, 1256,
                       1298, 1324, 1353, 1359, 1405, 1442, 1447]
    # dtype=int keeps an empty selection valid as a positional indexer
    train.drop(train.index[np.asarray(outlier_idx, dtype=int)], inplace=True)

    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])

    # log transform skewed numeric features (skewness measured on train only)
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewness = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewness[skewness > skew_threshold].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    all_data = pd.get_dummies(all_data)
    # ``fillna(method='ffill')`` is deprecated in pandas; ``ffill()`` is the
    # equivalent spelling. The fill runs over the stacked frame, so the first
    # test row can inherit a value from the last training row.
    all_data = all_data.ffill()

    n_train = train.shape[0]
    X_train = all_data[:n_train]
    X_test = all_data[n_train:]
    y = train.SalePrice
    return X_train, X_test, y
def calculate_aggregate(values):
    """Return a dict of summary statistics for ``values`` keyed by short codes.

    Keys: avg, std, var, med, 10p/25p/50p/75p/90p (percentiles), iqr
    (75th minus 25th percentile), iqm, mad, cov, gin, skw, kur, sum.
    """
    mean = np.mean(values)
    std = np.std(values)
    variance = np.var(values)
    median = np.median(values)
    p10, p25, p50, p75, p90 = np.percentile(values, [10, 25, 50, 75, 90])

    agg_measures = {}
    agg_measures['avg'] = mean
    agg_measures['std'] = std
    agg_measures['var'] = variance
    agg_measures['med'] = median
    agg_measures['10p'] = p10
    agg_measures['25p'] = p25
    agg_measures['50p'] = p50
    agg_measures['75p'] = p75
    agg_measures['90p'] = p90
    agg_measures['iqr'] = p75 - p25
    agg_measures['iqm'] = interquartile_range_mean(values)
    agg_measures['mad'] = mean_absolute_deviation(values)
    # NOTE(review): this is mean/std; a coefficient of variation is usually
    # std/mean -- confirm which one callers expect.
    agg_measures['cov'] = 1.0 * mean / std
    agg_measures['gin'] = gini_coefficient(values)
    agg_measures['skw'] = stats.skew(values)
    agg_measures['kur'] = stats.kurtosis(values)
    agg_measures['sum'] = np.sum(values)
    return agg_measures
def calculate_aggregate(values):
    """Return a dict of summary statistics for ``values`` keyed by short codes.

    Keys: avg, std, var, med, 10p/25p/50p/75p/90p (percentiles), iqr
    (75th minus 25th percentile), iqm, mad, cov, gin, skw, kur.
    """
    mean = np.mean(values)
    std = np.std(values)
    variance = np.var(values)
    median = np.median(values)
    p10, p25, p50, p75, p90 = np.percentile(values, [10, 25, 50, 75, 90])

    agg_measures = {}
    agg_measures['avg'] = mean
    agg_measures['std'] = std
    agg_measures['var'] = variance
    agg_measures['med'] = median
    agg_measures['10p'] = p10
    agg_measures['25p'] = p25
    agg_measures['50p'] = p50
    agg_measures['75p'] = p75
    agg_measures['90p'] = p90
    agg_measures['iqr'] = p75 - p25
    agg_measures['iqm'] = interquartile_range_mean(values)
    agg_measures['mad'] = mean_absolute_deviation(values)
    # NOTE(review): this is mean/std; a coefficient of variation is usually
    # std/mean -- confirm which one callers expect.
    agg_measures['cov'] = 1.0 * mean / std
    agg_measures['gin'] = gini_coefficient(values)
    agg_measures['skw'] = stats.skew(values)
    agg_measures['kur'] = stats.kurtosis(values)
    return agg_measures
def test_skew(self):
    """Compare ``skew`` with SciPy's bias-corrected skew, plus tiny-input cases."""
    tm._skip_if_no_scipy()
    from scipy.stats import skew

    def unbiased_skew(values):
        return skew(values, bias=False)

    self._check_stat_op('skew', unbiased_skew)

    # corner cases: skew() returns NaN unless there are at least 3 values
    min_N = 3
    for size in range(1, min_N + 1):
        series = Series(np.ones(size))
        frame = DataFrame(np.ones((size, size)))
        if size >= min_N:
            self.assertEqual(0, series.skew())
            self.assertTrue((frame.skew() == 0).all())
        else:
            self.assertTrue(np.isnan(series.skew()))
            self.assertTrue(np.isnan(frame.skew()).all())
def test_skew(self):
    """Check the ``skew`` stat op against SciPy; skipped when SciPy is missing."""
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest("no scipy.stats.skew")

    # reference value is NaN for fewer than three observations
    this_skew = lambda x: np.nan if len(x) < 3 else skew(x, bias=False)
    self._check_stat_op('skew', this_skew)

# def test_mad(self):
#     f = lambda x: np.abs(x - x.mean()).mean()
#     self._check_stat_op('mad', f)
def test_sem(self):
    """Check the ``sem`` stat op against ``std(ddof=1) / sqrt(n)``."""
    # reference value is NaN for fewer than two observations
    alt = lambda x: (np.nan if len(x) < 2
                     else np.std(x, ddof=1) / np.sqrt(len(x)))
    self._check_stat_op('sem', alt)

# def test_skew(self):
#     from scipy.stats import skew
#     def alt(x):
#         if len(x) < 3:
#             return np.nan
#         return skew(x, bias=False)
#     self._check_stat_op('skew', alt)
def test_returned_dtype(self):
    """Reductions on integer Series give float64 for mean/std/var/skew/kurt;
    every other (dtype, method) combination keeps the input dtype."""
    dtypes = [np.int16, np.int32, np.int64, np.float32, np.float64]
    if hasattr(np, 'float128'):
        dtypes.append(np.float128)

    float_reductions = ['mean', 'std', 'var', 'skew', 'kurt']
    order_reductions = ['min', 'max']

    for dtype in dtypes:
        s = Series(range(10), dtype=dtype)
        for method in float_reductions + order_reductions:
            result = getattr(s, method)()
            upcast = is_integer_dtype(dtype) and method in float_reductions
            if not upcast:
                self.assertTrue(
                    result.dtype == dtype,
                    "return dtype expected from %s is %s, "
                    "got %s instead" % (method, dtype, result.dtype))
            else:
                self.assertTrue(
                    result.dtype == np.float64,
                    "return dtype expected from %s is np.float64, "
                    "got %s instead" % (method, result.dtype))
def ka_display_skewnewss(data):
    '''Tabulate the skewness of every non-object column.

    Parameters
    ----------
    data: pandas dataframe

    Return
    ------
    df: pandas dataframe
        One row per non-object column with the columns ``var_name``,
        ``col_type`` (dtype as a string) and ``skew_value``.
    '''
    numeric_dtypes = data.dtypes[data.dtypes != 'object']
    numeric_cols = numeric_dtypes.index.tolist()
    skew_value = [skew(data[col]) for col in numeric_cols]

    name_column = pd.Series(numeric_cols)
    type_column = pd.Series(numeric_dtypes.apply(str).values)
    skew_column = pd.Series(skew_value)

    df = pd.concat([name_column, type_column, skew_column], axis=1)
    df.columns = ['var_name', 'col_type', 'skew_value']
    return df
def mfccPostProcess(directory, fileCount):
    """Write per-column summary statistics for each MFCC csv file.

    For every ``count`` in ``range(fileCount)`` and every suffix in the
    module-level ``mfccList``, loads ``directory + str(count) + suffix +
    ".csv"``, computes seven statistics (mean, variance, skew, Pearson
    kurtosis, median, min, max) along axis 0 for the matrix and for its
    librosa delta, replaces NaNs by 0, and saves the 14-column result
    flattened column-major to ``... + "_stat.txt"``.
    """
    def _summaries(matrix):
        # the seven statistics, in output-column order
        return [
            np.mean(matrix, axis=0),
            np.var(matrix, axis=0, dtype=np.float64),
            stats.skew(matrix, axis=0),
            stats.kurtosis(matrix, axis=0, fisher=False),
            np.median(matrix, axis=0),
            np.min(matrix, axis=0),
            np.max(matrix, axis=0),
        ]

    for count in range(fileCount):
        print("{0}/{1}".format(count+1,fileCount))
        for mfccext in mfccList:
            stem = directory + str(count) + mfccext
            mfcc = np.loadtxt(stem + ".csv", delimiter=",")
            dmfcc = librosa.feature.delta(mfcc)

            result = np.zeros((mfcc.shape[1], 14))
            # columns 0-6: raw coefficients, columns 7-13: their deltas
            columns = _summaries(mfcc) + _summaries(dmfcc)
            for position, values in enumerate(columns):
                result[:, position] = values
            result[np.isnan(result)] = 0

            np.savetxt(stem + "_stat.txt", result.flatten("F"), delimiter=",")
def process(self, obj_data):
    '''
    Apply Skew analysis with results added to the data wrapper

    For every (label, frame) pair from the wrapper's iterator and every
    default column, the skewness of the column is computed after dropping
    missing values and trimming the lowest and highest 2% of the sorted
    values. Results are stored as ``results[label][column]`` and handed to
    ``obj_data.addResult`` under ``self.str_description``.

    @param obj_data: Data wrapper
    '''
    column_names = obj_data.getDefaultColumns()
    results = defaultdict(dict)

    for label, frame in obj_data.getIterator():
        for column in column_names:
            # drop missing data so the 2% trim counts real observations only
            data = frame[column].dropna()
            ordered = data.sort_values(ascending=True)

            # Number of points to trim from each end. An explicit end index
            # is required: with rem_num == 0 the slice [0:-0] would be empty
            # and every short series would yield NaN.
            rem_num = int(round(len(ordered) * 0.02))
            trimmed = ordered.iloc[rem_num:len(ordered) - rem_num]

            res = skew(trimmed)
            if isinstance(res, np.ma.masked_array):
                # ``np.float`` no longer exists in NumPy; builtin float is equivalent
                res = float(res.data)
            results[label][column] = res

    obj_data.addResult(self.str_description, results)
def skew_correction(df, numerical_features):
    """Log1p-transform, in place, the skewed columns of ``df``.

    Skewness is computed per column of ``df[numerical_features]`` with NaNs
    dropped; columns whose skewness exceeds 0.75 are replaced by
    ``log1p`` of their values (as floats). Returns ``None``.
    """
    skewness = df[numerical_features].apply(lambda x: skew(x.dropna()))
    skewed_feats = list(skewness[skewness > 0.75].index)
    if not skewed_feats:
        # nothing to transform; avoid assigning through an empty selection
        return

    # Replace whole columns rather than writing through ``.loc``: an in-place
    # ``.loc`` write of float values into integer columns is deprecated/invalid
    # in recent pandas, and a tuple indexer is ambiguous.
    df[skewed_feats] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
def ideal_bin_count(data, method="default"):
    """A theoretically ideal bin count.

    Parameters
    ----------
    data: array_like or None
        Data to work on. Most methods only use its size; ``None`` is treated
        as empty.
    method: str
        Name of the method to apply, available values:
          - default (~sturges)
          - sqrt
          - sturges
          - doane
          - rice
        See https://en.wikipedia.org/wiki/Histogram for the description

    Returns
    -------
    int
        Number of bins, always >= 1. For an unrecognised ``method`` nothing
        is matched and ``None`` is returned (kept for backward
        compatibility).
    """
    # Accept lists/tuples (no ``.size`` attribute) and the documented ``None``
    n = 0 if data is None else np.asarray(data).size
    if n < 1:
        return 1

    if method == "default":
        if n <= 32:
            return 7
        return ideal_bin_count(data, "sturges")
    if method == "sqrt":
        return int(np.ceil(np.sqrt(n)))
    if method == "sturges":
        return int(np.ceil(np.log2(n)) + 1)
    if method == "doane":
        if n < 3:
            return 1
        from scipy.stats import skew
        # Doane: sigma = sqrt(6 (n - 2) / ((n + 1)(n + 3))). The denominator
        # must be parenthesised, otherwise (n + 3) multiplies instead of divides.
        sigma = np.sqrt(6.0 * (n - 2) / ((n + 1) * (n + 3)))
        sample_skew = skew(np.asarray(data), axis=None)
        return int(np.ceil(1 + np.log2(n) + np.log2(1 + np.abs(sample_skew) / sigma)))
    if method == "rice":
        # 1.0 / 3 keeps the exponent fractional under Python 2 integer division
        return int(np.ceil(2 * np.power(n, 1.0 / 3)))
    return None
def skew_correction(df, numerical_features):
    """Log1p-transform, in place, the skewed columns of ``df``.

    Skewness is computed per column of ``df[numerical_features]`` with NaNs
    dropped; columns whose skewness exceeds 0.75 are replaced by
    ``log1p`` of their values (as floats). Returns ``None``.
    """
    skewness = df[numerical_features].apply(lambda x: skew(x.dropna()))
    skewed_feats = list(skewness[skewness > 0.75].index)
    if not skewed_feats:
        # nothing to transform; avoid assigning through an empty selection
        return

    # Replace whole columns rather than writing through ``.loc``: an in-place
    # ``.loc`` write of float values into integer columns is deprecated/invalid
    # in recent pandas, and a tuple indexer is ambiguous.
    df[skewed_feats] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
def test_skew(self):
    """Check the ``skew`` stat op against SciPy's bias-corrected skew."""
    tm._skip_if_no_scipy()
    from scipy.stats import skew

    # reference value is NaN for fewer than three observations
    alt = lambda x: np.nan if len(x) < 3 else skew(x, bias=False)
    self._check_stat_op('skew', alt)
def test_stats_mixed_type(self):
    """Row-wise std/var/mean/skew on the mixed-type frame must not raise."""
    # don't blow up
    for reduction in ('std', 'var', 'mean', 'skew'):
        getattr(self.mixed_frame, reduction)(1)
def test_how_compat(self):
    """Legacy ``pd.rolling_*`` / ``pd.expanding_*`` calls with ``how=`` must
    match the method-based API and emit a FutureWarning."""
    # in prior versions, we would allow how to be used in the resample
    # now that its deprecated, we need to handle this in the actual
    # aggregation functions
    index = pd.date_range('1/1/2000', periods=20, freq='12H')
    s = pd.Series(np.random.randn(20), index=index)

    hows = ['min', 'max', 'median']
    ops = ['mean', 'sum', 'std', 'var', 'kurt', 'skew']
    kinds = ['rolling', 'expanding']

    for how in hows:
        for op in ops:
            for kind in kinds:
                with tm.assert_produces_warning(FutureWarning,
                                                check_stacklevel=False):
                    legacy = getattr(pd, "{0}_{1}".format(kind, op))
                    if legacy is None:
                        continue

                    # only the rolling variants take a window
                    kwargs = {'window': 5} if kind == 'rolling' else {}
                    result = legacy(s, freq='D', how=how, **kwargs)

                    windowed = getattr(s, kind)(freq='D', **kwargs)
                    expected = getattr(windowed, op)(how=how)
                    assert_series_equal(result, expected)
def test_rolling_skew(self):
    """Check ``rolling_skew`` against SciPy's bias-corrected skew."""
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest('no scipy')

    def unbiased_skew(window):
        return skew(window, bias=False)

    self._check_moment_func(mom.rolling_skew, unbiased_skew, name='skew')
def test_nanskew(self):
    """Check ``nanops.nanskew`` against the wrapped SciPy ``skew``."""
    tm.skip_if_no_package('scipy.stats')
    tm._skip_if_scipy_0_17()
    from scipy.stats import skew

    reference = partial(self._skew_kurt_wrap, func=skew)
    disallowed = dict(allow_complex=False, allow_str=False,
                      allow_date=False, allow_tdelta=False)
    self.check_funs(nanops.nanskew, reference, **disallowed)
def setUp(self):
    """Create the sample array and its reference skewness."""
    # Test data: sin over 200 evenly spaced points on [0, 1]
    grid = np.linspace(0, 1, 200)
    self.samples = np.sin(grid)
    # skewness value (computed with scipy.stats.skew)
    self.actual_skew = -0.1875895205961754
def test_constant_series(self):
    """``nanskew`` of a constant array must be exactly 0."""
    # xref GH 11974
    for constant in (3075.2, 3075.3, 3075.5):
        data = np.ones(300) * constant
        self.assertEqual(nanops.nanskew(data), 0.0)
def test_ground_truth(self):
    """``nanskew`` of the fixture samples matches the stored reference."""
    self.assertAlmostEqual(nanops.nanskew(self.samples), self.actual_skew)
def test_nans(self):
    """With ``skipna=False`` a single NaN makes the result NaN."""
    with_nan = np.append(self.samples, np.nan)
    result = nanops.nanskew(with_nan, skipna=False)
    self.assertTrue(np.isnan(result))
def test_nans_skipna(self):
    """With ``skipna=True`` an appended NaN does not change the result."""
    with_nan = np.append(self.samples, np.nan)
    result = nanops.nanskew(with_nan, skipna=True)
    tm.assert_almost_equal(result, self.actual_skew)
def statistical_metrics(x):
    """Return mean, stdev, skew and kurtosis of the flattened array ``x``.

    The result is a dict with the keys ``'mean'``, ``'stdev'``, ``'skew'``
    and ``'kurtosis'``.
    """
    flat = x.flatten()
    summary = {}
    summary['mean'] = np.mean(flat)
    summary['stdev'] = np.std(flat)
    summary['skew'] = stats.skew(flat)
    summary['kurtosis'] = stats.kurtosis(flat)
    return summary
def test_gen_usr_distrib(n_samples=100000, verbose=False):
    """Moments of the generated 'laplace' and 'exp' samples are as expected
    (within an absolute tolerance of 5e-2)."""
    rng = np.random.RandomState(0)
    tol = 5e-2

    laplace = _gen_usr_distrib(n_samples, ['laplace'], rng)
    assert_allclose(np.mean(laplace), 0, atol=tol)
    assert_allclose(np.std(laplace), 1, atol=tol)
    assert_allclose(skew(laplace)[0], 0, atol=tol)
    assert_allclose(kurtosis(laplace)[0], 3, atol=tol)

    exponential = _gen_usr_distrib(n_samples, ['exp'], rng)
    assert_allclose(np.std(exponential), 1, atol=tol)
def get_features(df_features):
    """Add question-pair similarity features to ``df_features`` in place.

    Reads the ``question1`` / ``question2`` columns and adds, in this order:
    ``z_document_dis``, ``q1_unique``, ``q2_unique``, ``z_dist``,
    ``z_tfidf_cos_sim``, ``z_w2v_unique``, four vector distances
    (``z_w2v_dis_e``, ``z_w2v_dis_mink``, ``z_w2v_dis_cityblock``,
    ``z_w2v_dis_canberra``) and the skew/kurtosis of each question vector.
    The temporary vector columns are removed again, NaNs are replaced by
    0.0, and the same (mutated) frame is returned. Progress labels and
    timestamps are printed along the way.
    """
    def _log_step(label):
        # progress label followed by the current wall-clock time
        print(label)
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def _pairwise(func, left, right):
        # apply ``func(row[left], row[right])`` to every row
        return df_features.apply(lambda row: func(row[left], row[right]), axis=1)

    def _minkowski3(u, v):
        return spatial.distance.minkowski(u, v, 3)

    _log_step('use w2v to document presentation')
    df_features['z_document_dis'] = _pairwise(getDiff_averge_tfidf, 'question1', 'question2')

    _log_step('nones')
    df_features['q1_unique'] = _pairwise(getdiffwords, 'question1', 'question2')
    df_features['q2_unique'] = _pairwise(getdiffwords, 'question2', 'question1')
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))

    _log_step('z_dist')
    df_features['z_dist'] = _pairwise(Levenshtein.ratio, 'question1', 'question2')

    _log_step('z_tfidf_cos_sim')
    df_features['z_tfidf_cos_sim'] = _pairwise(cos_sim, 'question1', 'question2')

    _log_step('z_w2v_nones')
    df_features['z_w2v_unique'] = _pairwise(w2v_cos_sim_tfidf, 'q1_unique', 'q2_unique')
    df_features['z_w2v_dis_e'] = _pairwise(spatial.distance.euclidean, 'question1_w2v', 'question2_w2v')
    df_features['z_w2v_dis_mink'] = _pairwise(_minkowski3, 'question1_w2v', 'question2_w2v')
    df_features['z_w2v_dis_cityblock'] = _pairwise(spatial.distance.cityblock, 'question1_w2v', 'question2_w2v')
    df_features['z_w2v_dis_canberra'] = _pairwise(spatial.distance.canberra, 'question1_w2v', 'question2_w2v')
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x: skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x: skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x: kurtosis(x))

    # the raw vectors were only needed to derive the features above
    del df_features['question1_w2v']
    del df_features['question2_w2v']

    _log_step('all done')
    # fillna returns a new frame unless inplace=True; the original call
    # discarded its result, so NaNs were never actually replaced.
    df_features.fillna(0.0, inplace=True)
    return df_features
def mungeskewed(train, test, numeric_feats):
    """Stack train and test and Box-Cox transform the skewed numeric columns.

    ``test`` gets a ``loss`` column set to 0 (in place). Skewness is measured
    on ``train`` only (NaNs dropped); every feature with skewness above 0.25
    is shifted by +1 and Box-Cox transformed on the stacked frame.

    Returns the stacked frame (index reset) and the number of training rows.
    """
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)

    skewness = train[numeric_feats].apply(lambda col: skew(col.dropna()))
    for name in skewness[skewness > 0.25].index:
        # the fitted Box-Cox lambda is not needed afterwards
        transformed, _ = boxcox(train_test[name] + 1)
        train_test[name] = transformed

    return train_test, ntrain
def features(self, q1, q2):
    """Return distance and shape features for a question pair.

    Both questions are lower-cased, whitespace-split and stripped of
    ``stopwords``. Returns a 12-tuple: word mover's distance (capped at
    10), seven vector distances (cosine, cityblock, jaccard, canberra,
    euclidean, minkowski-3, braycurtis) and skew/kurtosis of each sentence
    vector. When either sentence vector is ``None`` the distances are -1
    and the skew/kurtosis values are 0.
    """
    tokens1 = [w for w in str(q1).lower().split() if w not in stopwords]
    tokens2 = [w for w in str(q2).lower().split() if w not in stopwords]

    wmd = min(self.model.wmdistance(tokens1, tokens2), 10)

    vec1 = self.sent2vec(tokens1)
    vec2 = self.sent2vec(tokens2)
    if vec1 is None or vec2 is None:
        # no usable sentence vector: sentinel values
        return wmd, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0

    return (
        wmd,
        cosine(vec1, vec2),
        cityblock(vec1, vec2),
        jaccard(vec1, vec2),
        canberra(vec1, vec2),
        euclidean(vec1, vec2),
        minkowski(vec1, vec2, 3),
        braycurtis(vec1, vec2),
        skew(vec1),
        skew(vec2),
        kurtosis(vec1),
        kurtosis(vec2),
    )
def features(self, q1, q2):
    """Return distance and shape features for a question pair.

    Both questions are lower-cased, whitespace-split and stripped of
    ``stopwords``. Returns a 13-tuple: word mover's distance from
    ``self.model`` and from ``self.model_norm`` (each capped at 10), seven
    vector distances (cosine, cityblock, jaccard, canberra, euclidean,
    minkowski-3, braycurtis) and skew/kurtosis of each sentence vector.
    When either sentence vector is ``None`` the distances are -1 and the
    skew/kurtosis values are 0.
    """
    tokens1 = [w for w in str(q1).lower().split() if w not in stopwords]
    tokens2 = [w for w in str(q2).lower().split() if w not in stopwords]

    wmd = min(self.model.wmdistance(tokens1, tokens2), 10)
    wmd_norm = min(self.model_norm.wmdistance(tokens1, tokens2), 10)

    vec1 = self.sent2vec(tokens1)
    vec2 = self.sent2vec(tokens2)
    if vec1 is None or vec2 is None:
        # no usable sentence vector: sentinel values
        return wmd, wmd_norm, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0

    return (
        wmd,
        wmd_norm,
        cosine(vec1, vec2),
        cityblock(vec1, vec2),
        jaccard(vec1, vec2),
        canberra(vec1, vec2),
        euclidean(vec1, vec2),
        minkowski(vec1, vec2, 3),
        braycurtis(vec1, vec2),
        skew(vec1),
        skew(vec2),
        kurtosis(vec1),
        kurtosis(vec2),
    )
def lightcurve_moments(ftimes, fmags, ferrs):
    '''Calculate summary moments of a magnitude series.

    Returns a dict with the median, inverse-variance weighted mean, MAD,
    stdev (1.483 * MAD), skew, kurtosis, the fraction of points more than
    one stdev from the median, a set of magnitude percentiles and the IQR
    (75th minus 25th percentile).

    More than 9 detections are required; otherwise an error is logged and
    None is returned. ``ftimes`` is not used in the calculation.
    '''
    ndet = len(fmags)

    if ndet <= 9:
        LOGERROR('not enough detections in this magseries '
                 'to calculate light curve moments')
        return None

    inv_var = 1.0/(ferrs*ferrs)

    series_median = npmedian(fmags)
    series_wmean = npsum(fmags*inv_var)/npsum(1.0/(ferrs*ferrs))
    series_mad = npmedian(npabs(fmags - series_median))
    series_stdev = 1.483*series_mad
    series_skew = spskew(fmags)
    series_kurtosis = spkurtosis(fmags)

    # count the points beyond one stdev on either side of the median
    upper = series_median + series_stdev
    lower = series_median - series_stdev
    n_above = len(fmags[fmags > upper])
    n_below = len(fmags[fmags < lower])
    series_beyond1std = (n_above + n_below)/float(ndet)

    # magnitude percentiles; index 3 is the 25th and index 8 the 75th
    percentile_levels = [5.0,10,17.5,25,32.5,40,60,67.5,75,82.5,90,95]
    series_mag_percentiles = nppercentile(fmags, percentile_levels)

    moments = {}
    moments['median'] = series_median
    moments['wmean'] = series_wmean
    moments['mad'] = series_mad
    moments['stdev'] = series_stdev
    moments['skew'] = series_skew
    moments['kurtosis'] = series_kurtosis
    moments['beyond1std'] = series_beyond1std
    moments['mag_percentiles'] = series_mag_percentiles
    moments['mag_iqr'] = series_mag_percentiles[8] - series_mag_percentiles[3]
    return moments
def _signal_to_noise(a, axis=0, ddof=0):
    """Mean divided by standard deviation along ``axis``; 0 where the std is 0.

    Local replacement for ``scipy.stats.signaltonoise``, which is no longer
    available in current SciPy releases.
    """
    a = np.asanyarray(a)
    mean = a.mean(axis)
    std = a.std(axis=axis, ddof=ddof)
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = mean / std
    return np.where(std == 0, 0, ratio)


def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features: nine columns (geometric mean,
        kurtosis, kurtosis test statistic, normal test statistic, skew,
        skew test statistic, variation, signal-to-noise ratio, standard
        error of the mean), imputed and then standardized.

    """

    logger.info("Creating SciPy Features")

    # Generate scipy features; the p-values of the tests are not used

    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    row_stn = _signal_to_noise(base_features, axis=1)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features

    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    return sp_features


#
# Function create_clusters
#
def create_images_for_labeling(pars):
    """Create projection images for the movie file ``pars``, next to that file.

    Saves, in the movie's directory: ``correlation_image.tif``,
    ``std_projection.tif``, ``median_projection.tif``, ``MOV_BL.tif``
    (resized movie minus its median image), ``MOV_BL_BIL.tif`` (bilateral
    blur of the former), ``max_projection.tif`` and ``skew_projection.tif``.

    Returns the file name on success. Any exception is caught and
    *returned* (not raised), so callers must inspect the return value.
    """
    import os
    import numpy as np
    import scipy.stats as st
    import calblitz as cb

    try:
        f_name = pars
        cdir = os.path.dirname(f_name)

        print('loading')
        m = cb.load(f_name)

        print('corr image')
        img = m.local_correlations(eight_neighbours=True)
        im = cb.movie(img, fr=1)
        im.save(os.path.join(cdir, 'correlation_image.tif'))

        print('std image')
        img = np.std(m, 0)
        im = cb.movie(np.array(img), fr=1)
        im.save(os.path.join(cdir, 'std_projection.tif'))

        m1 = m.resize(1, 1, 1./m.fr)

        print('median image')
        img = np.median(m1, 0)
        im = cb.movie(np.array(img), fr=1)
        im.save(os.path.join(cdir, 'median_projection.tif'))

        print('save BL')
        # subtract the median image from the resized movie
        m1 = m1 - img
        m1.save(os.path.join(cdir, 'MOV_BL.tif'))
        m1 = m1.bilateral_blur_2D()
        m1.save(os.path.join(cdir, 'MOV_BL_BIL.tif'))
        m = np.array(m1)

        print('max image')
        img = np.max(m, 0)
        im = cb.movie(np.array(img), fr=1)
        im.save(os.path.join(cdir, 'max_projection.tif'))

        print('skew image')
        img = st.skew(m, 0)
        im = cb.movie(img, fr=1)
        im.save(os.path.join(cdir, 'skew_projection.tif'))

        del m
        del m1
    except Exception as e:
        # the exception object is the return value; see docstring
        return e

    return f_name
def test_rolling_functions_window_non_shrinkage(self):
    """Rolling ops with ``min_periods`` larger than the data length return
    all-NaN results that keep the input's index/columns."""
    # GH 7764
    s = Series(range(4))
    s_expected = Series(np.nan, index=s.index)
    df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B'])
    df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
    df_expected_panel = Panel(items=df.index, major_axis=df.columns,
                              minor_axis=df.columns)

    def roll(obj, **extra):
        # window (10) and min_periods (5) both exceed the 4 available rows
        return obj.rolling(window=10, min_periods=5, **extra)

    functions = [lambda x: roll(x).cov(x, pairwise=False),
                 lambda x: roll(x).corr(x, pairwise=False),
                 lambda x: roll(x).max(),
                 lambda x: roll(x).min(),
                 lambda x: roll(x).sum(),
                 lambda x: roll(x).mean(),
                 lambda x: roll(x).std(),
                 lambda x: roll(x).var(),
                 lambda x: roll(x).skew(),
                 lambda x: roll(x).kurt(),
                 lambda x: roll(x).quantile(quantile=0.5),
                 lambda x: roll(x).median(),
                 lambda x: roll(x).apply(sum),
                 lambda x: roll(x, win_type='boxcar').mean()]
    for func in functions:
        try:
            assert_series_equal(func(s), s_expected)
            assert_frame_equal(func(df), df_expected)
        except ImportError:
            # scipy needed for rolling_window
            continue

    pairwise_functions = [lambda x: roll(x).cov(x, pairwise=True),
                          lambda x: roll(x).corr(x, pairwise=True)]
    for func in pairwise_functions:
        assert_panel_equal(func(df), df_expected_panel)
def pre_process(df):
    """Clean and transform the housing frame ``df``.

    Fills known missing values, converts ``YrSold`` to ``2016 - YrSold``,
    binarises ``PoolArea``, merges three porch columns into ``Porch``,
    merges rare categories, fills the remaining gaps (mean for numeric
    columns, mode or a fixed label for categorical ones), prints NaN
    diagnostics and log1p-transforms numeric columns with skewness above
    0.75. ``df`` is modified in place by several steps; the processed frame
    is returned.
    """
    # LotFrontage's N/A is assigned zero, will it cause problem?
    df.fillna(value={'MasVnrType': 'None', 'MasVnrArea': 0,
                     'Electrical': 'SBrkr', 'FireplaceQu': 'NoFP',
                     'GarageType': 'Noga', 'GarageFinish': 'Noga',
                     'GarageQual': 'Noga', 'Fence': 'NoFence',
                     'BsmtFinSF1': 0, 'BsmtFinSF2': 0, 'BsmtUnfSF': 0,
                     'TotalBsmtSF': 0, 'BsmtFullBath': 0, 'BsmtHalfBath': 0,
                     'LotFrontage': 0},
              inplace=True)

    df.loc[:, 'YrSold'] = 2016 - df.loc[:, 'YrSold']
    # PoolArea becomes a has-pool flag
    df.loc[df.loc[:, 'PoolArea'] != 0, 'PoolArea'] = 1
    df.loc[:, 'Porch'] = np.sum(df.loc[:, ['EnclosedPorch', '3SsnPorch', 'ScreenPorch']], axis=1)
    df.drop(['EnclosedPorch', '3SsnPorch', 'ScreenPorch'], axis=1, inplace=True)
    df.replace({'BsmtFullBath': {3: 2}, 'LotShape': {'IR3': 'IR2'}}, inplace=True)

    # fill missing values in bsmt
    df = fill_bsmt_missing(df)

    def fill_na(df, col_name, value=None):
        # default fill value is the column mean; ``is None`` avoids an
        # element-wise comparison when an array-like value is passed
        if value is None:
            value = df[col_name].mean()
        df.loc[df[col_name].isnull(), col_name] = value

    fill_na(df, 'Fence', 'WD')
    fill_na(df, 'GarageArea')
    fill_na(df, 'GarageCars')
    fill_na(df, 'SaleType', df['SaleType'].mode().values[0])
    fill_na(df, 'KitchenQual', df['KitchenQual'].mode().values[0])
    fill_na(df, 'Functional', df['Functional'].mode().values[0])
    fill_na(df, 'Exterior1st', df['Exterior1st'].mode().values[0])
    fill_na(df, 'Exterior2nd', df['Exterior2nd'].mode().values[0])
    fill_na(df, 'MSZoning', 'RL')

    # one boolean row per column, so axis=0 reduces over columns (per data row)
    bool_cols = np.array([df[col_name].isnull() for col_name in df.columns])
    print('rows containing na:',np.sum(bool_cols.any(axis=0)))
    print('rows all na:',np.sum(bool_cols.all(axis=0)))

    # log1p skewed_feats
    numeric_feats = df.dtypes[df.dtypes != "object"].index
    skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    df[skewed_feats] = np.log1p(df[skewed_feats])
    return df

#%%
#log transform the target: ignore for test data
#
#train_data = pre_process(train_df.copy())
#test_data = pre_process(test_df.copy())
def fit(self, X):
    """Fit the scaling parameters for ``X`` according to ``self.algo``.

    Columns that are constant (equal to the first row everywhere) are
    excluded; the kept column indices are stored in ``self.data["cols"]``.
    The remaining entries written to ``self.data`` depend on the algorithm:

    - ``"min-max"``: ``min`` and ``max`` per column.
    - ``"normal"`` / ``"inv-normal"``: ``mu`` and ``std`` per column.
    - ``"auto-normal"`` / ``"auto-inv-normal"``: ``min``, ``max``, a
      per-column power ``boxcox`` chosen to minimise the squared skewness
      of the min-max scaled, power-transformed column (fixed to 1 for
      columns with fewer than 10 distinct values), and ``mu`` / ``std`` of
      the transformed data.
    """
    constant_cols = np.where(np.all(X == X[0, :], axis=0))[0]
    self.data["cols"] = list(set(range(X.shape[1])).difference(constant_cols))
    tX = X[:, self.data["cols"]]

    def signed_power(x, lm):
        # Box-Cox style power transform that keeps the sign of x
        return (np.sign(x) * np.abs(x) ** lm - 1) / lm

    def to_positive(lm):
        # softplus: maps the unconstrained optimiser variable to lm > 0
        return np.log(np.exp(lm[0]) + 1)

    def fit_auto(tX):
        # shared by "auto-normal" and "auto-inv-normal"
        self.data['min'] = np.min(tX, axis=0)
        self.data['max'] = np.max(tX, axis=0)
        tX = (tX - self.data["min"]) / (self.data["max"] - self.data["min"])

        self.data['boxcox'] = np.zeros(tX.shape[1])
        for d in range(tX.shape[1]):
            Xd = tX[:, d]
            if np.unique(Xd).shape[0] < 10:
                # too few distinct values to tune: power 1
                self.data['boxcox'][d] = 1
                continue

            def squared_skew(lm, Xd=Xd):
                return skew(signed_power(Xd, to_positive(lm)), bias=False) ** 2

            lm = minimize(squared_skew, [0.], method='SLSQP',
                          bounds=[(-5, 5)],
                          options={'ftol': 1e-8, 'maxiter': 100,
                                   'disp': False})['x']
            self.data['boxcox'][d] = to_positive(lm)

        tX = signed_power(tX, self.data['boxcox'][None, :])
        self.data['mu'] = np.mean(tX, axis=0)
        self.data['std'] = np.std(tX, axis=0)

    if self.algo == "min-max":
        self.data['min'] = np.min(tX, axis=0)
        self.data['max'] = np.max(tX, axis=0)
    elif self.algo in ("normal", "inv-normal"):
        self.data['mu'] = np.mean(tX, axis=0)
        self.data['std'] = np.std(tX, axis=0)
    elif self.algo in ("auto-normal", "auto-inv-normal"):
        fit_auto(tX)
def get_features(df_features):
    """Add features built from the words unique to each question, in place.

    Reads ``question1`` / ``question2``; adds ``q1_unique`` / ``q2_unique``,
    eight vector distances between the two questions' unique-word vectors
    (euclidean, minkowski-3, cityblock, canberra -- each for both vector
    variants) and the skew/kurtosis of every vector. The temporary vector
    columns are removed again, NaNs are replaced by 0.0, and the same
    (mutated) frame is returned. Progress labels and timestamps are printed
    along the way.
    """
    def _log_step(label):
        # progress label followed by the current wall-clock time
        print(label)
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def _pairwise(func, left, right):
        # apply ``func(row[left], row[right])`` to every row
        return df_features.apply(lambda row: func(row[left], row[right]), axis=1)

    def _minkowski3(u, v):
        return spatial.distance.minkowski(u, v, 3)

    _log_step('use w2v to document presentation')
    #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)

    _log_step('get_w2v')
    df_features['q1_unique'] = _pairwise(getdiffwords, 'question1', 'question2')
    df_features['q2_unique'] = _pairwise(getdiffwords, 'question2', 'question1')

    # NOTE(review): the "*_w2v_weight" columns come from get_vector and the
    # plain "*_w2v" columns from get_weight_vector -- the names look swapped;
    # confirm before relying on them.
    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))

    _log_step('z_dist')
    #df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)

    _log_step('z_tfidf_cos_sim')
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)

    _log_step('z_w2v_calc')
    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    weighted = ('q1_unique_w2v_weight', 'q2_unique_w2v_weight')
    plain = ('q1_unique_w2v', 'q2_unique_w2v')
    df_features['z_w2v_unique_dis_e_weight'] = _pairwise(spatial.distance.euclidean, *weighted)
    df_features['z_w2v_unique_dis_e'] = _pairwise(spatial.distance.euclidean, *plain)
    df_features['z_w2v_unique_dis_mink_w'] = _pairwise(_minkowski3, *weighted)
    df_features['z_w2v_unique_dis_cityblock_w'] = _pairwise(spatial.distance.cityblock, *weighted)
    df_features['z_w2v_unique_dis_canberra_w'] = _pairwise(spatial.distance.canberra, *weighted)
    df_features['z_w2v_unique_dis_mink'] = _pairwise(_minkowski3, *plain)
    df_features['z_w2v_unique_dis_cityblock'] = _pairwise(spatial.distance.cityblock, *plain)
    df_features['z_w2v_unique_dis_canberra'] = _pairwise(spatial.distance.canberra, *plain)

    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x: skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x: skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x: kurtosis(x))

    # the raw vectors were only needed to derive the features above
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']

    _log_step('all done')
    # fillna returns a new frame unless inplace=True; the original call
    # discarded its result, so NaNs were never actually replaced.
    df_features.fillna(0.0, inplace=True)
    return df_features
def _detect_artifacts(ica, raw, start_find, stop_find, ecg_ch, ecg_score_func,
                      ecg_criterion, eog_ch, eog_score_func, eog_criterion,
                      skew_criterion, kurt_criterion, var_criterion,
                      add_nodes):
    """Score ICA sources with a series of nodes and extend ``ica.exclude``.

    Nodes are built, in order, for ECG, each EOG channel, skewness,
    kurtosis and variance (each only when its channel/criterion is given),
    followed by ``add_nodes``. For a float criterion the sources whose
    absolute score exceeds it are selected; otherwise the criterion indexes
    the argsort of the absolute scores. Duplicates in ``ica.exclude`` are
    removed at the end.
    """
    from scipy import stats

    nodes = []
    if ecg_ch is not None:
        nodes.append(_ica_node('ECG', ecg_ch, ecg_score_func, ecg_criterion))

    if eog_ch not in [None, []]:
        eog_channels = eog_ch if isinstance(eog_ch, list) else [eog_ch]
        for idx, ch in enumerate(eog_channels):
            nodes.append(_ica_node('EOG %02d' % idx, ch, eog_score_func,
                                   eog_criterion))

    # statistic-based nodes have no target channel
    statistic_nodes = (('skewness', stats.skew, skew_criterion),
                       ('kurtosis', stats.kurtosis, kurt_criterion),
                       ('variance', np.var, var_criterion))
    for name, score_func, criterion in statistic_nodes:
        if criterion is not None:
            nodes.append(_ica_node(name, None, score_func, criterion))

    if add_nodes is not None:
        nodes.extend(add_nodes)

    for node in nodes:
        scores = ica.score_sources(raw, start=start_find, stop=stop_find,
                                   target=node.target,
                                   score_func=node.score_func)
        if not isinstance(node.criterion, float):
            ranked = abs(scores).argsort()
            found = list(np.atleast_1d(ranked[node.criterion]))
        else:
            found = list(np.where(np.abs(scores) > node.criterion)[0])

        n_found = len(found)
        plural = 's' if n_found > 1 else ''
        logger.info(' found %s artifact%s by %s' % (n_found, plural, node.name))
        ica.exclude += found

    logger.info('Artifact indices found:\n ' + str(ica.exclude).strip('[]'))
    if len(set(ica.exclude)) != len(ica.exclude):
        logger.info(' Removing duplicate indices...')
        ica.exclude = list(set(ica.exclude))

    logger.info('Ready.')
def get_feature_stats(self):
    """Show summary statistics for the feature named in the input widget.

    The name must look like ``X<n>``, ``LD<n>`` or ``PC<n>`` (1-based);
    column ``n - 1`` of ``self.X`` is analysed. Any problem is reported in a
    message box and the method returns without updating the display;
    otherwise the formatted statistics are written to ``self.feature_stats``.
    """
    def _inform(title, text):
        QtWidgets.QMessageBox.information(self, title, text)

    # get input feature
    feature_input = self.feature_input.currentText()

    # Split the name into its prefix and numeric part. Indexing an empty or
    # too-short string raises IndexError, reported as "Data Not Found".
    try:
        if feature_input[0] == 'X':
            digits = feature_input[1:]
            format_hint = "Please enter a feature name in the format: X%d."
        elif feature_input[0] + feature_input[1] in ('LD', 'PC'):
            digits = feature_input[2:]
            format_hint = "Please enter a feature name in the format: X||LD||PC%d."
        else:
            _inform("Wrong Format", "Feature names must be in the format: X%d.")
            return
    except (IndexError, TypeError):
        _inform("Data Not Found", "Please load a dataset first.")
        return

    try:
        # names are 1-based, array columns 0-based
        feature_index = int(digits) - 1
    except ValueError:
        _inform("Wrong Format", format_hint)
        return

    try:
        column = self.X[:, feature_index]
        max_value = column.max()
        min_value = column.min()
        mean_value = column.mean()
        std_value = column.std()
        var_value = column.var()
        skewness = stats.skew(column)
        kurtosis = stats.kurtosis(column, fisher=True)
        chi2, chi_p_val = chi2_feature_test(self.X, self.y, int(feature_index))
        H_kw, kw_p_val = kw_feature_test(self.X, self.y, int(feature_index))
        info_gain = information_gain(self.X, self.y, int(feature_index))
        gain_rt = gain_ratio(self.X, self.y, int(feature_index))
    except Exception:
        # every failure here is reported as an index problem, but unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit propagate
        _inform("Wrong Index", "Feature Index Out Of Bounds.")
        return

    entries = [
        ("Minimum Value", min_value),
        ("Maximum Value", max_value),
        ("Mean", mean_value),
        ("Standard Deviation", std_value),
        ("Variance", var_value),
        ("Skewness", skewness),
        ("Kurtosis", kurtosis),
        ("Chi Squared Test", chi2[0]),
        ("Kruskal-Wallis Test", H_kw),
        ("Information Gain", info_gain),
        ("Gain Ratio", gain_rt),
    ]
    feature_stats = "Statistics:" + "".join(
        "\n\n" + label + ": " + str(value) for label, value in entries)

    self.feature_stats.setText(feature_stats)