def anova(data):
    """Return True if at least one group mean differs from the others.

    Runs a one-way ANOVA over all groups in ``data`` and compares the
    p-value against a fixed 0.05 significance level.
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html

    Parameters
    ----------
    data : sequence of array-like
        One entry per group; any number of groups >= 2 is accepted.

    Returns
    -------
    bool
        True when the p-value is below 0.05, False otherwise.

    Raises
    ------
    ValueError
        If fewer than two groups are supplied.
    """
    if len(data) < 2:
        raise ValueError("ANOVA requires at least two groups of observations")

    # Unpacking handles any number of groups instead of one branch per count.
    statistic, pvalue = stats.f_oneway(*data)
    print("ANOVA Statistic " + str(statistic) + " and p-value " + str(pvalue))
    # bool() keeps the return type a plain Python bool rather than numpy.bool_.
    return bool(pvalue < 0.05)
def _p_test(self, v, grouped_data, is_continuous, is_categorical,
            is_normal, min_observed, catlevels,
            pval=np.nan, ptest='Not tested'):
    """Compute a p-value for variable ``v`` and name the test that was used.

    Parameters
    ----------
    v : name of the variable, used only in warning messages.
    grouped_data : for continuous variables, a sequence with one sample per
        group; for categorical variables, a contingency table.
    is_continuous, is_categorical, is_normal : bool
        Flags that select the test: one-way ANOVA (continuous and normal),
        Kruskal-Wallis (continuous, not normal) or chi-squared (categorical).
    min_observed : smallest number of observations in any sub-group; when it
        is 0 no test is run.
    catlevels : not used by this method.
    pval, ptest : values returned unchanged when no test is run.

    Returns
    -------
    (pval, ptest) : the p-value and a label describing the test.
    """
    # do not test if any sub-group has no observations
    if min_observed == 0:
        warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
        return pval, ptest

    if is_continuous and is_normal:
        # normally distributed
        ptest = 'One-way ANOVA'
        _, pval = stats.f_oneway(*grouped_data)
    elif is_continuous:
        # non-normally distributed
        ptest = 'Kruskal-Wallis'
        _, pval = stats.kruskal(*grouped_data)
    elif is_categorical:
        # default to chi-squared
        ptest = 'Chi-squared'
        _, pval, _, expected = stats.chi2_contingency(grouped_data)
        # if any expected cell counts are < 5, chi2 may not be valid
        # if this is a 2x2, switch to fisher exact
        if expected.min() < 5:
            # np.shape also accepts nested lists, not only arrays/DataFrames
            if np.shape(grouped_data) == (2, 2):
                # Double quotes: 'Fisher''s exact' silently dropped the
                # apostrophe through adjacent-literal concatenation.
                ptest = "Fisher's exact"
                _, pval = stats.fisher_exact(grouped_data)
            else:
                ptest = 'Chi-squared (warning: expected count < 5)'
                # A p-value IS returned here, so warn about its reliability
                # instead of claiming that none was computed.
                warnings.warn('Chi-squared p-value for {} may be unreliable: at least one expected cell count is below 5.'.format(v))

    return pval, ptest
def anova(data):
    """Run a one-way ANOVA per column, grouping rows by index level 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index has a second level (level 1) identifying the
        group of each row.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with columns ``'f'`` and ``'p'``
        holding the values returned by ``f_oneway``.

    Raises
    ------
    Exception
        If the secondary index level has two or fewer distinct groups.
    """
    level_count = len(data.groupby(level=1))
    if level_count < 3:
        raise Exception('ANOVA requires a secondary index with three or more values')

    results = []
    for column in data.columns:
        # One series of observations per value of the secondary index level.
        per_group = [series for _, series in data[column].groupby(level=1)]
        results.append(f_oneway(*per_group))

    return pd.DataFrame(results, columns=['f', 'p'], index=data.columns)
def main():
    """Compare two fixed samples and print the resulting p-values.

    Runs the Wilcoxon rank-sum test and a one-way ANOVA on the hard-coded
    "2nd phase" samples ``x`` and ``y`` and prints both p-values on one line.

    1st phase samples, kept for reference:
    top1 = [70.0, 71.1, 72.5, 70.8, 68.1, 71.9, 71.1, 71.3, 68.4, 70.2]
    top3 = [75.8, 78.4, 77.8, 77.7, 80.0, 77.8, 78.7, 76.4, 79.1, 77.3]
    """
    # 2nd phase
    x = [53.6, 54.5, 53.7, 52.7, 53.1, 55.5, 55.5, 52.8, 53.7, 52.7]
    y = [89.7, 89.1, 89.5, 88.7, 89.4, 88.6, 89.8, 89.5, 89.2, 89.7]

    # Compute the Wilcoxon rank-sum statistic for two samples.
    wilcoxon = stats.ranksums(x, y)
    # Named anova_result so it does not shadow the anova() function.
    anova_result = stats.f_oneway(x, y)

    # Index 1 of each result is the p-value.  A single-argument print() call
    # behaves the same on Python 2 and Python 3, unlike the print statement.
    print("Wilcoxon: " + str(wilcoxon[1]) + "; ANOVA: " + str(anova_result[1]))
def anovaTest(nAlgorithms, hyperVolumeList):
    """Run a one-way ANOVA on every unordered pair of algorithms.

    Parameters
    ----------
    nAlgorithms : int
        Number of leading entries of ``hyperVolumeList`` to compare.
    hyperVolumeList : sequence of array-like
        ``hyperVolumeList[i]`` holds the sample for algorithm ``i``.

    Returns
    -------
    list
        The ``stats.f_oneway`` result for each pair ``(i, j)`` with ``i < j``,
        ordered by ``i`` and then ``j``.  The list is also printed.
    """
    # Convert each sample once instead of once per comparison.
    samples = [np.array(hyperVolumeList[i]) for i in range(nAlgorithms)]

    results = []
    for i in range(nAlgorithms):
        for j in range(i + 1, nAlgorithms):
            results.append(stats.f_oneway(samples[i], samples[j]))

    # Single-argument print() calls work on both Python 2 and Python 3.
    print('esto es anova')
    print(results)
    return results
def test_f_oneway_vs_scipy_stats():
    # Our f_oneway must agree with the scipy.stats implementation on the
    # same pair of random samples (the second one is shifted by 1).
    random_state = np.random.RandomState(0)
    sample_a = random_state.randn(10, 3)
    sample_b = 1 + random_state.randn(10, 3)

    expected_f, expected_p = stats.f_oneway(sample_a, sample_b)
    actual_f, actual_p = f_oneway(sample_a, sample_b)

    assert_true(np.allclose(expected_f, actual_f))
    assert_true(np.allclose(expected_p, actual_p))
def test_f_oneway_ints():
    # Smoke test f_oneway on integers: check that it does not raise casting
    # errors with recent numpys
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 10))
    y = np.arange(10)
    fint, pint = f_oneway(X, y)

    # test that it gives the same result as with float.  The builtin float is
    # used because the np.float alias was removed in NumPy 1.24.
    f, p = f_oneway(X.astype(float), y)
    assert_array_almost_equal(f, fint, decimal=4)
    assert_array_almost_equal(p, pint, decimal=4)
def smart_hypothesis_testing(*samples, **options): """Test whether the given samples share a mean and write a text report.

    Keyword options read from ``options`` (defaults in parentheses):
    ``fancy`` (True), ``out`` (``sys.stdout``), ``alpha`` (0.05),
    ``equal_var`` (True) and ``latex`` (True).

    Every sample is converted to a float array and passed to
    ``samples_are_normal``; the first item of each of its results is then
    checked with ``all``.  The visible code calls ``f_oneway(*samples)``,
    writes an explanatory report into an in-memory buffer when ``fancy`` is
    set, hands that buffer to ``_flush_output(out, out_buffer, latex)`` and
    returns the tuple ``(statistic, pvalue, f_oneway)``.

    NOTE(review): whitespace is collapsed in this view, so the nesting of
    the branches cannot be confirmed, and no branch is visible for
    non-normal samples or for ``equal_var=False`` -- check the full file.
    NOTE(review): the "not possible to find evidence" message formats
    ``{rho}`` from ``locals()``, but no local named ``rho`` is assigned in
    the visible code -- confirm this does not raise ``KeyError``.
    """ fancy = options.get('fancy', True) out = options.get('out', sys.stdout) alpha = options.get('alpha', 0.05) equal_var = options.get('equal_var', True) latex = options.get('latex', True) samples = [np.array(sample, dtype='float') for sample in samples] len_samples = len(samples) out_buffer = StringIO() normality_results = samples_are_normal(*samples) if all(map(itemgetter(0), normality_results)): # all our samples are normal if equal_var: if fancy: out_buffer.write(Template( u"Hypothesis testing:\n\n" "\t$H0: ${mu}1 = ${mu}2{ellipsis} = $mu{len_samples}. " "The means for all groups are equal.\n" "\t$H1: $exists a,b $elementof Samples: ${mu}a $neq ${mu}b. " "At least two of the means are not equal.\n\n" "The significance test one-way analysis of variance (ANOVA) " "was used with a significance level of $alpha={alpha:.2f}.\n" "This test requires that the following " "assumptions are satisfied:\n\n" "1. Samples are independent.\n" "2. Samples are drawn from a normally distributed population.\n" "3. All populations have equal standard deviation.\n\n" "For the assumption of normal distribution two tests were " "performed ($alpha={alpha}): Shapiro Wilk's test " "and D'Agostino and Pearson's test.\n" "None of these tests reject the null hypothesis with " "significance level of $alpha={alpha}, thus it is assumed that data " "follows a normal distribution.\n\n" "").substitute(GREEK_ALPHABET).format( ellipsis=" = ..." 
if len_samples > 3 else "", **locals() )) statistic, pvalue = f_oneway(*samples) if fancy: if pvalue < alpha: out_buffer.write( u"One can say that samples come from populations " "with different means, since ANOVA rejects the " "null hypothesis " "(statistic={statistic:.2f}, {pvalue_str}).\n" "".format(pvalue_str=_pvalue_to_str(pvalue), **locals()) ) else: out_buffer.write( u"Thus, it was not possible to find evidence that" " the means of populations are different " "(statistic={statistic:.2f},{rho}={pvalue:.2f}).\n" "".format(**locals()) ) _flush_output(out, out_buffer, latex) return statistic, pvalue, f_oneway
def feature_importance_regression(features, target, n_neighbors=3, random_state=None):
    """Score how informative each feature is about a regression ``target``.

    Floating-point columns are treated as continuous and scored with the
    Pearson correlation and mutual information.  Integer and boolean columns
    are treated as discrete and scored with a one-way ANOVA F-test (target
    values grouped by feature value) and mutual information.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate features, one per column.
    target : array-like
        Target values; indexed with the row labels of ``features`` when the
        F-test groups are built.
    n_neighbors : int, default 3
        Forwarded to ``feature_selection.mutual_info_regression``.
    random_state : optional
        Forwarded to ``feature_selection.mutual_info_regression``.

    Returns
    -------
    (cont_imp, disc_imp) : tuple of pandas.DataFrame
        Indexed by feature name.  ``cont_imp`` has the columns ``pearson_r``,
        ``pearson_r_p_value`` and ``mutual_information``; ``disc_imp`` has
        ``f_statistic``, ``f_p_value`` and ``mutual_information``.  A frame
        has no columns when there are no features of its kind.
    """
    cont = features.select_dtypes(include=[np.floating])
    # ``np.bool`` was only an alias of the builtin ``bool`` and is missing
    # from NumPy 1.24-1.26; the builtin selects the same columns everywhere.
    disc = features.select_dtypes(include=[np.integer, bool])
    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:
        # Pearson correlation
        # ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` is the
        # equivalent (name, column) iterator.
        pearson = np.array([stats.pearsonr(column, target)
                            for _, column in cont.items()])
        cont_imp['pearson_r'] = pearson[:, 0]
        cont_imp['pearson_r_p_value'] = pearson[:, 1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(
            cont, target, discrete_features=False,
            n_neighbors=n_neighbors, random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:
        # F-test: one group of target values per distinct feature value
        f_tests = {}
        for feature in disc.columns:
            groups = [target[idxs]
                      for idxs in disc.groupby(feature).groups.values()]
            statistic, p_value = stats.f_oneway(*groups)
            f_tests[feature] = {'f_statistic': statistic,
                                'f_p_value': p_value}
        f_tests_df = pd.DataFrame.from_dict(f_tests, orient='index')
        # Column assignment aligns on the feature-name index.
        disc_imp['f_statistic'] = f_tests_df['f_statistic']
        disc_imp['f_p_value'] = f_tests_df['f_p_value']

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(
            disc, target, discrete_features=True,
            n_neighbors=n_neighbors, random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp