我们从 Python 开源项目中，提取了以下 9 个代码示例，用于说明如何使用 scipy.stats.chi2_contingency()。
def _p_test(self, v, grouped_data, is_continuous, is_categorical,
            is_normal, min_observed, catlevels,
            pval=np.nan, ptest='Not tested'):
    """
    Compute a p-value for variable ``v`` and name the test that produced it.

    Parameters
    ----------
    v : name of the variable, used only in warning messages.
    grouped_data : for continuous variables, an iterable of per-group samples
        (unpacked into the SciPy test); for categorical variables, a
        contingency table accepted by ``stats.chi2_contingency``.
    is_continuous, is_categorical, is_normal : flags selecting the test.
    min_observed : smallest sub-group size; ``0`` disables testing.
    catlevels : unused by this method.
    pval, ptest : values returned unchanged when no test is run.

    Returns
    -------
    (pval, ptest) : the p-value and a human-readable test name.
    """
    # do not test if any sub-group has no observations
    if min_observed == 0:
        warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
        return pval, ptest

    if is_continuous and is_normal:
        # normally distributed
        ptest = 'One-way ANOVA'
        _, pval = stats.f_oneway(*grouped_data)
    elif is_continuous:
        # non-normally distributed
        ptest = 'Kruskal-Wallis'
        _, pval = stats.kruskal(*grouped_data)
    elif is_categorical:
        # default to chi-squared
        ptest = 'Chi-squared'
        _, pval, _, expected = stats.chi2_contingency(grouped_data)
        # if any expected cell counts are < 5, chi2 may not be valid
        if expected.min() < 5:
            # `expected` has the same shape as the observed table, so this
            # check also works when grouped_data is a plain nested list.
            if expected.shape == (2, 2):
                # 2x2 table: switch to Fisher's exact test
                ptest = "Fisher's exact"
                _, pval = stats.fisher_exact(grouped_data)
            else:
                # The chi-squared p-value is still returned; flag it as
                # potentially unreliable instead of claiming none was computed.
                ptest = 'Chi-squared (warning: expected count < 5)'
                warnings.warn('The chi-squared p-value for {} may be unreliable because an expected cell count is below 5.'.format(v))

    return pval, ptest
def fsel(data=()):  # (feature selection, using chi2)
    """ Returns a {feature: p-value} dict
        for the given set of (vector, label)-tuples.

        Each vector is iterated to obtain its features. For every feature a
        2 x (number of labels) table is built: the first row holds, per label,
        the label count minus the feature/label count, the second row the
        feature/label count. Zero cells are replaced by 0.1 before the table is
        passed to scipy.stats.chi2_contingency, whose p-value is stored.

        `data` is consumed in a single pass, so any iterable (including a
        generator) is accepted.
    """
    from scipy.stats import chi2_contingency as chi2
    label_counts = collections.defaultdict(float)    # {label: count}
    feature_counts = collections.defaultdict(float)  # {feature: count}
    joint_counts = collections.defaultdict(float)    # {feature, label: count}
    for v, label in data:
        label_counts[label] += 1
        for f in v:
            feature_counts[f] += 1
            joint_counts[f, label] += 1
    labels = list(label_counts)
    p = {}
    for f in feature_counts:
        # .get() avoids inserting zero entries for unseen (feature, label) pairs.
        present = [joint_counts.get((f, label), 0.0) for label in labels]
        absent = [label_counts[label] - hits for label, hits in zip(labels, present)]
        p[f] = chi2([[cell or 0.1 for cell in absent],
                     [cell or 0.1 for cell in present]])[1]
    return p
def _func(self, x, size, ext_size, width):
    """
    Return the log-likelihood chi-squared statistic for one 2x2 table.

    The table's first row is ``[x.freq1, x.freq2]`` and its second row is
    ``[size - x.freq1 * width, ext_size - x.freq2 * width]``. It is passed to
    ``stats.chi2_contingency`` with ``lambda_="log-likelihood"``.

    Returns the test statistic, or NaN (after printing the error) when SciPy
    rejects the table with a ``ValueError``.
    """
    # `pd.np` was removed from pandas; import numpy directly instead.
    import numpy as np

    obs = np.array(
        [[x.freq1, x.freq2],
         [size - x.freq1 * width, ext_size - x.freq2 * width]])
    try:
        tmp = stats.chi2_contingency(obs, lambda_="log-likelihood")
    except ValueError as e:
        print(e)
        return np.nan
    return tmp[0]
def test_random_circuits(self):
    """
    Run every generated random circuit on both local simulators and check,
    with a chi-squared contingency test, that their count distributions are
    not significantly different (p-value above 0.01).
    """
    shots = 100
    min_cnts = int(shots / 10)
    local_simulator = qasm_simulator.QasmSimulator()

    def frequent_states(counts):
        # filter states with few counts
        return {state: hits for state, hits in counts.items()
                if hits > min_cnts}

    for circuit in self.rqg.get_circuits(format='QuantumCircuit'):
        self.log.info(circuit.qasm())
        compiled_circuit = openquantumcompiler.compile(circuit.qasm())
        job_pq = QuantumJob(compiled_circuit,
                            backend='local_projectq_simulator',
                            seed=1, shots=shots)
        job_py = QuantumJob(compiled_circuit,
                            backend='local_qasm_simulator',
                            seed=1, shots=shots)
        result_pq = pq_simulator.run(job_pq)
        result_py = local_simulator.run(job_py)
        name_pq = result_pq.get_names()[0]
        raw_pq = result_pq.get_counts(name_pq)
        name_py = result_py.get_names()[0]
        raw_py = result_py.get_counts(name_py)
        counts_pq = frequent_states(raw_pq)
        counts_py = frequent_states(raw_py)
        self.log.info('local_projectq_simulator: ' + str(counts_pq))
        self.log.info('local_qasm_simulator: ' + str(counts_py))
        self.assertTrue(counts_pq.keys() == counts_py.keys())
        # contingency table: one row per simulator, one column per state
        states = counts_py.keys()
        row_pq = [counts_pq[state] for state in states]
        row_py = [counts_py[state] for state in states]
        ctable = numpy.array([row_pq, row_py])
        result = chi2_contingency(ctable)
        self.log.info('chi2_contingency: ' + str(result))
        with self.subTest():
            self.assertGreater(result[1], 0.01)
def run_test(q1_pos, q2_pos, q1_neg, q2_neg):
    '''
    Run one significance test per 2x2 contingency table.

    The four arguments are parallel sequences; index ``i`` across them
    describes the table ``[[q1_pos[i], q2_pos[i]], [q1_neg[i], q2_neg[i]]]``.
    The number of tests equals the length of the sequences. For each table
    either Fisher's exact test or a chi-squared test is run, depending on
    what ``useFisherExact`` returns for that table.

    A Bonferroni correction is then applied: every p-value is multiplied by
    the number of tests and capped at 1.

    Returns a list of two parallel lists: the adjusted p-values, and the test
    values (the chi-squared statistic or the Fisher-exact odds ratio).

    Raises ValueError when the four sequences differ in length.
    '''
    columns = [q1_pos, q2_pos, q1_neg, q2_neg]
    n = len(columns[0])
    if not all(len(column) == n for column in columns):
        raise ValueError("length of input lists must be of same length")

    pvalues = []
    test_values = []
    for pos1, pos2, neg1, neg2 in zip(*columns):
        obs = np.array([[pos1, pos2], [neg1, neg2]])
        # Run the chosen test once and read both the statistic and p-value.
        if useFisherExact(obs):
            outcome = fisher_exact(obs)
        else:
            outcome = chi2_contingency(obs)
        test_values.append(outcome[0])
        pvalues.append(outcome[1])

    # applying Bonferroni correction: scale each p-value up by the number of
    # tests (dividing would make every result look more significant).
    adjustedPValues = [min(float(p) * n, 1.0) for p in pvalues]
    return [adjustedPValues, test_values]
def get_p_vals(role1, champ1, single_counts=True, span=3):
    """
    Chi-squared p-values comparing each of the recommendations at positions
    1-3 against the ``span`` recommendations that follow it.

    Reads the module-level ``recs``, ``tier``, ``champ2id`` and
    ``sliding_count_recs``. Returns ``{role2: {idx: [p-value, ...]}}``; the
    'TOTAL' and 'DATA' entries map to an empty dict. When the recorded ``N``
    is not greater than 10, the value 1 is stored instead of a p-value.
    With ``single_counts`` set, positive counts are clipped to 1 first.
    """
    champ1 = str(champ2id.get(champ1, champ1))
    p_vals = {}
    for role2 in recs[tier][role1][champ1]:
        p_vals[role2] = {}
        if role2 in ('TOTAL', 'DATA'):
            continue
        for idx in range(1, 4):
            values = []
            for pos_to_compare in range(idx + 1, idx + 1 + span):
                role_recs = recs[tier][role1][champ1][role2]
                # Get ids from recs:
                pair_ids = [str(champ2id[role_recs[position]['champ']])
                            for position in (idx, pos_to_compare)]
                # Get data:
                N = role_recs['N']
                if not N > 10:
                    values.append(1)
                    continue
                data = sliding_count_recs[tier][role1][champ1][role2]
                rows = []
                for champ_id in pair_ids:
                    series = data['DATA'][champ_id]
                    # pad with zeros up to N entries
                    padded = np.array(series + [0] * (N - len(series)))
                    if single_counts:
                        padded[padded > 0] = 1
                    hits = sum(padded)
                    rows.append([hits, N - hits])
                contingency_mat = np.array(rows)
                values.append(chi2_contingency(contingency_mat)[1])
            p_vals[role2][idx] = values
    return p_vals
def cramers_v_stat(confusion_matrix):
    """Return Cramér's V for a two-way contingency table.

    The chi-squared statistic from ``stats.chi2_contingency`` is divided by
    the table total and by ``min(rows - 1, columns - 1)``; the square root of
    that quotient is returned. ``confusion_matrix`` must expose ``.sum()``
    and a two-element ``.shape``.
    """
    chi2_statistic = stats.chi2_contingency(confusion_matrix)[0]
    mean_square_contingency = chi2_statistic / confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape
    smaller_dim = min(n_rows - 1, n_cols - 1)
    return math.sqrt(mean_square_contingency / smaller_dim)
def cramers_v_corrected_stat(confusion_matrix):
    """Return the bias-corrected Cramér's V for a two-way contingency table.

    Applies the correction from Wicher Bergsma, Journal of the Korean
    Statistical Society 42 (2013): 323-328: phi-squared and both table
    dimensions are shrunk by terms in ``1 / (n - 1)`` before the usual
    ratio is taken. ``confusion_matrix`` must expose ``.sum()`` and a
    two-element ``.shape``.
    """
    chi2_statistic = stats.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    phi2 = chi2_statistic / total
    n_rows, n_cols = confusion_matrix.shape
    shrink = total - 1
    # phi-squared is floored at zero after subtracting its bias term
    phi2_corr = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / shrink)
    rows_corr = n_rows - ((n_rows - 1) ** 2) / shrink
    cols_corr = n_cols - ((n_cols - 1) ** 2) / shrink
    smaller_dim = min(rows_corr - 1, cols_corr - 1)
    return math.sqrt(phi2_corr / smaller_dim)
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):
    """
    Score each feature's association with a classification target.

    Floating-point columns of ``features`` are treated as continuous and get
    an F-test (``f_statistic``, ``f_p_value``) and ``mutual_information``.
    Integer and boolean columns are treated as discrete and get a chi-squared
    test (``chi2_statistic``, ``chi2_p_value``), corrected Cramér's V
    (``cramers_v``) and ``mutual_information``. Columns of any other dtype are
    ignored.

    ``n_neighbors`` and ``random_state`` are forwarded to
    ``feature_selection.mutual_info_classif``.

    Returns ``(cont_imp, disc_imp)``: two DataFrames indexed by feature name.
    A frame has no columns when no feature of that kind is present.
    """
    cont = features.select_dtypes(include=[np.floating])
    # np.bool_ instead of the removed `np.bool` alias.
    disc = features.select_dtypes(include=[np.integer, np.bool_])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:
        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:
        chi2_statistics = []
        chi2_p_values = []
        cramers_v = []
        # .items() replaces the removed DataFrame.iteritems(); it yields the
        # columns in the same order as disc.columns, i.e. disc_imp's index.
        for _, column in disc.items():
            # One crosstab per feature serves both the chi-squared test and
            # the corrected Cramér's V.
            table = pd.crosstab(column, target)
            statistic, p_value, _, _ = stats.chi2_contingency(table)
            chi2_statistics.append(statistic)
            chi2_p_values.append(p_value)
            cramers_v.append(cramers_v_corrected_stat(table.values))

        # Chi²-test
        disc_imp['chi2_statistic'] = chi2_statistics
        disc_imp['chi2_p_value'] = chi2_p_values

        # Cramér's V (corrected)
        disc_imp['cramers_v'] = cramers_v

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp