def rank_test(covariates, groups):
    """
    Wilcoxon rank sum test for the distribution of treatment and control
    covariates.

    Parameters
    ----------
    covariates : DataFrame
        Dataframe with one covariate per column.
        If matches are with replacement, then duplicates should be
        included as additional rows.
    groups : array-like
        Treatment assignments, must be 2 groups. Rows where ``groups == 1``
        are compared against rows where ``groups == 0``.

    Returns
    -------
    numpy.ndarray
        One p-value per column in ``covariates``, in column order.
    """
    # A plain list/tuple compared to a scalar yields a single bool
    # (``[0, 1] == 1`` is ``False``), which cannot serve as a row mask.
    # Inputs that already carry a dtype (ndarray, Series) are left untouched
    # so their existing comparison/indexing behaviour is preserved.
    if not hasattr(groups, "dtype"):
        groups = np.asarray(groups)

    # The masks do not depend on the column, so build them once.
    treated = groups == 1
    control = groups == 0

    pvalues = np.zeros(len(covariates.columns))
    for j, name in enumerate(covariates.columns):
        var = covariates[name]
        pvalues[j] = ranksums(var[treated], var[control]).pvalue
    return pvalues
def main():
    """
    Compare two hard-coded samples with a Wilcoxon rank-sum test and a
    one-way ANOVA, and print both p-values on a single line.
    """
    # 1st phase (kept for reference; not used by the computation below)
    # top1 = [70.0, 71.1, 72.5, 70.8, 68.1, 71.9, 71.1, 71.3, 68.4, 70.2]
    # top3 = [75.8, 78.4, 77.8, 77.7, 80.0, 77.8, 78.7, 76.4, 79.1, 77.3]

    # 2nd phase
    x = [53.6, 54.5, 53.7, 52.7, 53.1, 55.5, 55.5, 52.8, 53.7, 52.7]
    y = [89.7, 89.1, 89.5, 88.7, 89.4, 88.6, 89.8, 89.5, 89.2, 89.7]

    # Compute the Wilcoxon rank-sum statistic for two samples.
    wilcoxon = stats.ranksums(x, y)
    anova = stats.f_oneway(x, y)

    # Element 1 of each result is the p-value (element 0 is the statistic).
    # The parenthesised single-argument form prints the same text whether it
    # is parsed as a Python 2 print statement or a Python 3 print() call.
    print("Wilcoxon: " + str(wilcoxon[1]) + "; ANOVA: " + str(anova[1]))
def get_layered_pvals(df, groupcol, valuecol, subset_by, pval_method='kruskalwallis'):
    """
    Compute pairwise p-values between the groups in `groupcol`, separately
    for every subset of `df` defined by the `subset_by` column.

    This is a thin wrapper: groupby(subset_by) followed by get_all_pvals()
    on each resulting sub-dataframe.

    Parameters
    ----------
    df : pandas dataframe
        tidy dataframe with labels in `groupcol` and values in `valuecol`
    groupcol, valuecol : str
        columns in df
    subset_by : str
        column to group by
    pval_method : str {'kruskalwallis', 'ranksums', 'wilcoxon', 'ttest_ind'}
        statistical method for comparison, forwarded to get_all_pvals()
        as `method`. Default is 'kruskalwallis'

    Returns
    -------
    pvals : dict
        two-level dictionary: outer keys are the unique values in
        df[subset_by]; each inner value is whatever get_all_pvals()
        returns for that subset
    """
    return {
        label: get_all_pvals(chunk, groupcol, valuecol, method=pval_method)
        for label, chunk in df.groupby(subset_by)
    }
def compare_otus_teststat(df, Xsmpls, Ysmpls, method='kruskal-wallis', multi_comp=None):
    """
    Compare every column of df between the Xsmpls rows and the Ysmpls rows
    using the statistical test named by `method`.

    Parameters
    ----------
    df : pandas dataframe
        samples are in rows and OTUs in columns
    Xsmpls, Ysmpls : list
        row labels (samples) of the two groups to compare
    method : str {'kruskal-wallis', 'wilcoxon', 'ranksums', 'mann-whitney'}
        statistical method to use for comparison. 'wilcoxon' and 'ranksums'
        both select the rank-sum test. Default is 'kruskal-wallis'
    multi_comp : str or None
        type of multiple comparison correction to do. Currently accepts
        'fdr' (Benjamini-Hochberg) or None

    Returns
    -------
    results : pandas dataframe
        OTUs in rows, with the test statistic in 'test-stat' and the p-value
        in 'p'. When multi_comp == 'fdr' a 'q' column with the corrected
        values is added. An OTU whose test raises is reported with
        p = 1 and test-stat = 0.

    Raises
    ------
    ValueError
        if `method` is not one of the recognised names.
    """
    if method == 'kruskal-wallis':
        pfun = kruskalwallis
    elif method == 'wilcoxon' or method == 'ranksums':
        pfun = ranksums
    elif method == 'mann-whitney':
        pfun = mannwhitneyu
        # Note: prob wanna add some kwargs here to say whether 2sided or not
    else:
        # Without this branch pfun stays unbound and the per-OTU fallback
        # below would silently report p = 1 for every column.
        raise ValueError(
            "Unknown method {!r}; expected 'kruskal-wallis', 'wilcoxon', "
            "'ranksums' or 'mann-whitney'".format(method))

    results = pd.DataFrame(index=df.columns, columns=['test-stat', 'p'])
    for o in df.columns:
        try:
            h, p = pfun(df.loc[Xsmpls, o], df.loc[Ysmpls, o])
        except Exception:
            # Best-effort per OTU: a test that cannot be computed is reported
            # as "no difference" instead of aborting the whole comparison.
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            p = 1
            h = 0
        results.loc[o, 'p'] = p
        results.loc[o, 'test-stat'] = h

    if multi_comp == 'fdr':
        # 'p' is an object-dtype column (the frame was created empty), so
        # hand the correction routine plain floats. Element 1 of the returned
        # tuple holds the corrected p-values.
        results['q'] = multipletests(results['p'].astype(float), method='fdr_bh')[1]

    return results
def computeRankSumZvalsPvals(errRates, lowIsBetter=True):
    """
    Run a rank-sum test between every pair of columns of errRates.

    For each pair of columns (i, j) with i < j, only the rows where neither
    column is NaN are used. The statistic from ranksums(column j, column i)
    is stored at [i, j] and its negation at [j, i]; the p-value is stored
    at both positions. The diagonal holds 0 (statistic) and 1 (p-value).

    Parameters
    ----------
    errRates : pandas dataframe
        one column per classifier (columns are accessed positionally)
    lowIsBetter : bool
        accepted but not referenced anywhere in this function body

    Returns
    -------
    zvals, pvals : pandas dataframes
        square frames whose index and columns are both the column labels
        of computeRanks(errRates, onlyFullRows=False)
    """
    # computeRanks is used here only as the source of the output labels.
    ranks = computeRanks(errRates, onlyFullRows=False)
    classifierNames = ranks.columns.values

    numClassifiers = errRates.shape[1]
    shape = (numClassifiers, numClassifiers)
    # Start from the diagonal values; every off-diagonal cell is overwritten
    # in the pairwise loop below.
    zvals = np.zeros(shape)
    pvals = np.ones(shape)

    for i in range(numClassifiers):
        first = errRates.iloc[:, i]
        for j in range(i + 1, numClassifiers):
            second = errRates.iloc[:, j]

            # compare using all datasets they have in common
            # (adding the two boolean masks marks rows where either is NaN)
            usable = np.logical_not(np.isnan(first) + np.isnan(second))

            # cols are indep var
            z, p = ranksums(second[usable], first[usable])
            zvals[i, j] = z
            pvals[i, j] = p
            zvals[j, i] = -zvals[i, j]
            pvals[j, i] = pvals[i, j]

    zvals = pd.DataFrame(data=zvals, index=classifierNames,
                         columns=classifierNames)
    pvals = pd.DataFrame(data=pvals, index=classifierNames,
                         columns=classifierNames)
    return zvals, pvals
def get_all_pvals(df, groupcol, valuecol, method='kruskalwallis'):
    """
    Returns pairwise p-values between all groups in the column `groupcol`.

    Parameters
    ----------
    df : pandas dataframe
        tidy dataframe with labels in `groupcol` and values in `valuecol`
    groupcol, valuecol : str
        columns in df
    method : str {'kruskalwallis', 'ranksums', 'wilcoxon', 'ttest_ind'}
        statistical method for comparison. 'ranksums' and 'wilcoxon' both
        select the rank-sum test; any value other than those and
        'ttest_ind' falls back to Kruskal-Wallis. Default is 'kruskalwallis'

    Returns
    -------
    pvals : dict
        dictionary with 'group1_vs_group2' as the keys and p-value as the
        values. Groups are paired in order of first appearance in
        df[groupcol], so the keys are the same on every run. A pair whose
        test raises gets np.nan.
    """
    ## Pick the test once; it does not depend on the pair being compared
    if method == 'ranksums' or method == 'wilcoxon':
        pfun = ranksums
    elif method == 'ttest_ind':
        pfun = ttest_ind
    else:
        pfun = kruskalwallis

    ## Get all pairwise combinations
    # unique() keeps first-appearance order; iterating a set made the pair
    # order (and therefore the key names) vary between interpreter runs.
    grps = list(df[groupcol].unique())

    pvals = {}
    for i, g1 in enumerate(grps):
        ## Grab values
        x = df.loc[df[groupcol] == g1, valuecol]
        for g2 in grps[i + 1:]:
            y = df.loc[df[groupcol] == g2, valuecol]
            ## Calculate p value
            try:
                _, p = pfun(x, y)
            except Exception:
                # Best-effort: a comparison that cannot be computed is
                # recorded as NaN. Exception (not a bare except) lets
                # Ctrl-C / SystemExit propagate.
                p = np.nan
            ## Store p value
            # format() yields the same key as string concatenation for str
            # labels and also accepts non-string labels (e.g. integers).
            pvals['{}_vs_{}'.format(g1, g2)] = p

    return pvals