The following 38 code examples, extracted from open-source Python projects, illustrate how to use pandas.crosstab().
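Before diving into the project code, here is a minimal, self-contained sketch of what pd.crosstab() computes; the DataFrame and column names are illustrative only, not taken from any project below:

    import pandas as pd

    # Toy data: two categorical columns (names are hypothetical).
    df = pd.DataFrame({
        'city': ['A', 'A', 'B', 'B', 'B'],
        'level': ['junior', 'senior', 'junior', 'junior', 'senior'],
    })

    # Count how often each (city, level) pair occurs.
    # margins=True appends an 'All' row and column of totals;
    # normalize='index' would return row-wise proportions instead of counts.
    print(pd.crosstab(df['city'], df['level'], margins=True))

Most of the examples below follow this pattern: two aligned Series go in, a contingency table comes out, and options such as rownames/colnames, margins and normalize shape the result.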
def get_python_guangzhou():
    frame2 = frame[(frame.kd == 'Python') & (frame.city == u'广州')]  # Guangzhou
    cframe = [v for k, v in frame2.to_dict(orient='index').items()]
    pattern = r'\d{4}-\d{2}-\d{2}'
    for c in cframe:
        # fall back to today's date when 'published' is not a valid date string
        if not re.match(pattern, c['published']):
            c['published'] = datetime.datetime.utcnow().strftime("%Y-%m-%d")
    df = DataFrame(cframe)
    df['published'] = pd.to_datetime(df['published'])
    mask = (df['published'] > '2016-04-01') & (df['published'] <= '2016-05-02')
    dataframe = df.loc[mask]
    jobframe = pd.crosstab(dataframe.experience, dataframe.salary,
                           margins=True).sort_values(by='All', ascending=False)
    jobframe = jobframe.drop('All', axis=0).drop('All', axis=1)
    pie_chart = pygal.StackedBar()
    pie_chart.title = u'Python salary by experience in Guangzhou'
    pie_chart.x_labels = jobframe.index
    for cit, num in jobframe.iteritems():
        pie_chart.add("%s" % (cit), num)
    pie_chart.render_to_file(os.path.dirname(__file__) + '/chart/guangzhou_salary.svg')
def flavor_profile(df, ingr, comp, ingr_comp):
    sorted_ingredients = df.columns
    underscore_ingredients = []
    for item in sorted_ingredients:
        underscore_ingredients.append(item.replace(' ', '_'))
    print len(underscore_ingredients), len(sorted_ingredients)
    ingr_total = ingr_comp.join(ingr, how='right', on='# ingredient id')
    ingr_total = ingr_total.join(comp, how='right', on='compound id')
    ingr_pivot = pd.crosstab(ingr_total['ingredient name'], ingr_total['compound id'])
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]
    df_flavor = df.values.dot(ingr_flavor.values)
    print df.shape, df_flavor.shape
    return df_flavor

# normalize flavor matrix with tfidf method
def get_crosstab(self, X, y):
    '''
    Build cross-tabulations of the features in X against the target y.

    X: DataFrame, 2-D array, or a single Series of feature values.
    y: Series aligned with X, holding 0/1 target labels.

    Returns a single crosstab DataFrame when X is one-dimensional;
    otherwise a dict mapping each feature name to its crosstab DataFrame.
    '''
    if len(X.shape) == 1:
        result = pd.crosstab(X, y)
    else:
        result = {}
        if self.feature_names is None:
            if isinstance(X, pd.DataFrame):
                feature_names = list(X.columns)
            else:
                feature_names = [i for i in range(X.shape[1])]
        else:
            feature_names = self.feature_names
        if isinstance(X, pd.DataFrame):
            for feature in feature_names:
                result[feature] = pd.crosstab(X[feature], y)
        else:
            for feature in feature_names:
                result[feature] = pd.crosstab(X[:, feature], y)
    return result
def make(T):
    log_tr = log[log.order_number_rev > T]

    # day of week
    dow = pd.crosstab(log_tr.user_id, log_tr.order_dow).add_prefix('user_dow_freq_')
    dow_ = pd.crosstab(log_tr.user_id, log_tr.order_dow, normalize='index').add_prefix('user_dow_norm_')

    # timezone
    timezone = pd.crosstab(log_tr.user_id, log_tr.timezone).add_prefix('user_timezone_freq_')
    timezone_ = pd.crosstab(log_tr.user_id, log_tr.timezone, normalize='index').add_prefix('user_timezone_norm_')

    # dow * timezone
    dow_tz = pd.crosstab(log_tr.user_id, log_tr.dow_tz).add_prefix('user_dow-tz_freq_')
    dow_tz_ = pd.crosstab(log_tr.user_id, log_tr.dow_tz, normalize='index').add_prefix('user_dow-tz_norm_')

    tab = pd.concat([dow, dow_, timezone, timezone_, dow_tz, dow_tz_], axis=1)
    tab.reset_index().to_pickle('../feature/trainT-{}/f103_user.p'.format(T))
def ptr_stats(df):
    df = df[['CASE DISPOSED STATUS', 'HCJ Booked', 'MADE Y / N',
             'PRETRIAL STATUS AT DISPOSITION', 'bail type made simple']]
    crosstab = pd.crosstab(
        [df['CASE DISPOSED STATUS'], df['HCJ Booked'],
         df['MADE Y / N'], df['PRETRIAL STATUS AT DISPOSITION']],
        df['bail type made simple'],
        margins=True)
    print(crosstab)
    crosstab.to_csv('ptr_stats.csv')
def train_model(split=.25):
    """Train a model based on the iris dataset.

    This will split the iris dataset into train and test sets, train a
    Random Forest Classifier and fit the trained model to the test dataset.
    In addition the confusion matrix and feature importances will be
    calculated.

    Args:
        split (float): Fraction of observations in the test dataset.

    Returns:
        RandomForestClassifier: Trained model.
        pandas.DataFrame: Confusion matrix.
        dictionary: Features importance
    """
    iris = load_iris()
    all_data = pd.DataFrame(iris.data, columns=iris.feature_names)
    features = all_data.columns.str.replace(r'\s+', '_').str.replace(r'\W+', '')
    all_data['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
    train, test = train_test_split(all_data, test_size=split)
    clf = RandomForestClassifier(n_jobs=1)
    clf.fit(train.drop('species', axis=1), train.species)
    preds = clf.predict(test.drop('species', axis=1))
    conf_matrix = pd.crosstab(test['species'], preds,
                              rownames=['Actual Species'],
                              colnames=['Predicted Species'])
    f_importances = list(zip(train.drop('species', axis=1).columns,
                             clf.feature_importances_))
    return clf, conf_matrix, f_importances, features
def output_confusion_matrix(self, y, y_pred):
    assert y.size == y_pred.size
    print("Actual IDV")
    print(y.value_counts())
    print("Predicted IDV")
    print(y_pred.value_counts())
    print()
    print("Confusion matrix:")
    cmat = pd.crosstab(y_pred, y, rownames=['predictions'], colnames=['actual'])
    print(cmat)
    sys.stdout.flush()
    return cmat

#-----------------------------------------------------------------------------
def plot_facet(self, data, color, **kwargs):
    x = kwargs.get("x")
    y = kwargs.get("y")
    levels_x = kwargs.get("levels_x")
    levels_y = kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
        #data[self._time_column] = num
    #else:
        #data[self._time_column] = date
    #data.dropna(inplace=True)
    #if len(self._groupby) == 2:
        #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
        #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
        #ct = ct[pd.notnull(ct.index)]
    #else:
        #ct = pd.crosstab(
            #data[self._time_column],
            #pd.Series([""] * len(self._table[self._time_column]), name=""))

    ## Line plot:
    #self.vmax = max(self.vmax, ct.values.max())
    #ct.plot(ax=plt.gca(), color=self.get_palette())
def plot_facet(self, data, color, **kwargs):
    x = kwargs.get("x")
    y = kwargs.get("y")
    levels_x = kwargs.get("levels_x")
    levels_y = kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
        #data[self._time_column] = num
    #else:
        #data[self._time_column] = date
    #data.dropna(inplace=True)
    #if len(self._groupby) == 2:
        #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
        #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
        #ct = ct[pd.notnull(ct.index)]
    #else:
        #ct = pd.crosstab(
            #data[self._time_column],
            #pd.Series([""] * len(self._table[self._time_column]), name=""))

    ## Stacked area plot:
    #if len(self._groupby) == 2:
        #self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
    #ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
def plot_facet(self, data, color, **kwargs):
    x = kwargs.get("x")
    y = kwargs.get("y")
    levels_x = kwargs.get("levels_x")
    levels_y = kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
        #data[self._time_column] = num
    #else:
        #data[self._time_column] = date
    #data.dropna(inplace=True)
    #if len(self._groupby) == 2:
        #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
        #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
        #ct = ct[pd.notnull(ct.index)]
    #else:
        #ct = pd.crosstab(
            #data[self._time_column],
            #pd.Series([""] * len(self._table[self._time_column]), name=""))

    ## percentage area plot:
    ## if there is only one grouping variable (the time column),
    ## the cross table produces a Series, not a data frame. It
    ## isn't really very informative to plot it, but we provide
    ## for this special case anyway.
    #if type(ct) == pd.Series:
        #ct = ct.apply(lambda x: 100)
    #else:
        #ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
    #ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
def cal_prob(crosstab):
    '''
    Compute the smoothed positive-class probability for each category c:
    (N(x=c, y=1) + p) / (N(x=c) + 1), where p is the overall positive rate.
    crosstab: DataFrame whose index holds the feature categories and whose
    columns are the 0/1 values of y.
    Returns a dict mapping each category to its probability.
    '''
    total = crosstab.sum(axis=0)
    p = total.loc[1] / total.sum()
    N = crosstab.sum(axis=1) + 1
    N1 = crosstab[1] + p
    N.name = ''
    N.index.name = ''
    N1.name = ''
    N1.index.name = ''
    return dict(N1 / N)
def cal_woe(crosstab):
    '''
    Compute the Weight of Evidence (WOE) for each category c:
    WOE(c) = log(r(x=c, y=1) / r(x=c, y=0)),
    where r(x=c, y=1) = N(x=c, y=1) / N(y=1) and
    r(x=c, y=0) = N(x=c, y=0) / N(y=0).
    crosstab: DataFrame whose index holds the feature categories and whose
    columns are the 0/1 values of y.
    Returns a dict mapping each category to its WOE value.
    '''
    tmp = crosstab.copy()
    # replace zero counts with 1 to avoid division by zero and log(0)
    tmp[tmp == 0] = 1
    r = tmp / tmp.sum(axis=0)
    result = np.log(r[1] / r[0])
    return dict(result)
def cal_ks(y, y_prob, pos_label=1, return_split=False, decimals=0):
    '''
    Compute the KS (Kolmogorov-Smirnov) statistic, and optionally the
    score at which the maximum gap between the two cumulative
    distributions is reached.

    y: array-like of true labels, e.g. {0, 1} or {-1, 1}.
    y_prob: predicted scores; either a one-dimensional Series of scores
        or a two-column DataFrame of class probabilities, in which case
        the second column is taken as the positive-class probability.
    pos_label: int, the label treated as positive.
    return_split: whether to return the split score instead of the KS value.
    decimals: number of decimals the scores are rounded to when
        searching for the split point.

    Returns the KS statistic (computed with scipy's ks_2samp), or the
    split score when return_split is True.
    '''
    y = pd.Series(pd.Series(y).values)
    if len(y_prob.shape) == 1:
        y_pred = pd.Series(pd.Series(y_prob).values)
    else:
        y_pred = pd.Series(pd.DataFrame(y_prob).iloc[:, 1].values)
    Bad = y_pred[y == pos_label]
    Good = y_pred[y != pos_label]
    ks, pvalue = stats.ks_2samp(Bad.values, Good.values)
    if not return_split:
        return ks
    crossfreq = pd.crosstab(y_pred.round(decimals), y)
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    crossdens['gap'] = abs(crossdens[0] - crossdens[1])
    score_split = crossdens[crossdens['gap'] == crossdens['gap'].max()].index[0]
    return score_split
def get_city_experience():
    city_experience = pd.crosstab(frame.city, frame.experience,
                                  margins=True).sort_values(by='All', ascending=False)[:11]
    city_experience = city_experience.drop('All', axis=0).drop('All', axis=1)
    ce_chart = pygal.Bar()
    ce_chart.title = u'Experience requirements by city (top 10)'
    ce_chart.x_labels = city_experience.index
    for i in range(len(list(city_experience.T.index))):
        ce_chart.add(city_experience.T.index[i], city_experience.T.values[i])
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_experience.svg')
def get_city_phase():
    city_phase = pd.crosstab(frame.city, frame.phase,
                             margins=True).sort_values(by='All', ascending=False)[:11]
    city_phase = city_phase.drop('All', axis=0).drop('All', axis=1)
    funnel_chart = pygal.StackedBar()
    funnel_chart.title = u'Company funding phase by city (top 10)'
    funnel_chart.x_labels = city_phase.index
    for i in range(len(list(city_phase.T.index))):
        funnel_chart.add(city_phase.T.index[i], city_phase.T.values[i])
    funnel_chart.render_to_file(os.path.dirname(__file__) + '/chart/phase.svg')
def get_city_education():
    city_education = pd.crosstab(frame.city, frame.education,
                                 margins=True).sort_values(by='All', ascending=False)[:11]
    city_education = city_education.drop('All', axis=0).drop('All', axis=1)
    ce_chart = pygal.Bar()
    ce_chart.title = u'Education requirements by city (top 10)'
    ce_chart.x_labels = city_education.index
    for i in range(len(list(city_education.T.index))):
        ce_chart.add(city_education.T.index[i], city_education.T.values[i])
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_edu.svg')
def multi(uid):
    tmp = log[log.user_id == uid]
    ct = pd.crosstab(tmp.order_number, tmp.product_id).reset_index().set_index('order_number')
    li = []
    for pid in ct.columns:
        streak = 0
        sw_odr = False
        for onb, odr in enumerate(ct[pid].values):
            onb += 1
            if sw_odr == False and odr == 1:
                sw_odr = True
                streak = 1
                li.append([uid, pid, onb, streak])
                continue
            if sw_odr == True:
                if odr == 1 and streak > 0:
                    streak += 1
                    li.append([uid, pid, onb, streak])
                elif odr == 1 and streak <= 0:
                    streak = 1
                    li.append([uid, pid, onb, streak])
                elif odr == 0 and streak > 0:
                    streak = 0
                    li.append([uid, pid, onb, streak])
                elif odr == 0 and streak <= 0:
                    streak -= 1
                    li.append([uid, pid, onb, streak])
    return pd.DataFrame(li, columns=['user_id', 'product_id', 'order_number', 'streak'])
def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_true, axis=1)])
    Y_pred = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_pred, axis=1)])
    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):
    cont = features.select_dtypes(include=[np.floating])
    disc = features.select_dtypes(include=[np.integer, np.bool])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:
        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:
        # Chi²-test
        chi2_tests = defaultdict(dict)
        for feature in disc.columns:
            contingency = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(contingency)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value
        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected)
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.iteritems()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
def run_knn(trainx, trainy, testx, testy):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainx, trainy)
    pred_y = knn.predict(testx)
    print(pd.crosstab(testy, pred_y, rownames=['Actual'], colnames=['Predicted']))
    print('\nAccuracy: ' + str(accuracy_score(testy, pred_y)))
def calc_model_characteristics(self, performCV=True):
    # Determine key metrics to analyze the classification model. These
    # are stored in the classification_output series object belonging to
    # this class.
    for metric in [self.scoring_metric] + self.additional_display_metrics:
        # Determine for both test and train, except predict:
        for key, data in self.dp.items():
            if key != 'predict':
                name = '%s_%s' % (metric, key)
                # Case where probabilities are to be passed as arguments
                if base_classification.metrics_map[metric][2]:
                    self.classification_output[name] = \
                        base_classification.metrics_map[metric][0](
                            data[self.datablock.target],
                            self.predictions_probabilities[key])
                # Case where class predictions are to be passed as arguments
                else:
                    self.classification_output[name] = \
                        base_classification.metrics_map[metric][0](
                            data[self.datablock.target],
                            self.predictions_class[key])
                # Determine confusion matrix:
                name = 'ConfusionMatrix_%s' % key
                self.classification_output[name] = pd.crosstab(
                    data[self.datablock.target],
                    self.predictions_class[key]
                ).to_string()

    if performCV:
        cv_score = self.KFold_CrossValidation(
            scoring_metric=self.scoring_metric)
    else:
        cv_score = {'mean_error': 0.0, 'std_error': 0.0}

    self.classification_output['CVMethod'] = \
        'KFold - ' + str(self.cv_folds)
    self.classification_output['CVScore_mean'] = cv_score['mean_error']
    self.classification_output['CVScore_std'] = cv_score['std_error']
    self.classification_output['Predictors'] = str(self.predictors)
def printReport(self, printConfusionMatrix, printModelParameters):
    # Print the metrics determined in the previous function.
    print("\nModel Report")

    # Output the parameters used for modeling
    if printModelParameters:
        print('\nModel being built with the following parameters:')
        print(self.alg.get_params())

    if printConfusionMatrix:
        for key, data in self.dp.items():
            if key != 'predict':
                print("\nConfusion Matrix for %s data:" % key)
                print(pd.crosstab(
                    data[self.datablock.target],
                    self.predictions_class[key])
                )
        print('Note: rows - actual; col - predicted')

    print("\nScoring Metric:")
    for key, data in self.dp.items():
        if key != 'predict':
            name = '%s_%s' % (self.scoring_metric, key)
            print("\t%s (%s): %s" % (
                self.scoring_metric,
                key,
                "{0:.3%}".format(self.classification_output[name])
            ))

    print("\nCV Score for Scoring Metric (%s):" % self.scoring_metric)
    print("\tMean - %f | Std - %f" % (
        self.classification_output['CVScore_mean'],
        self.classification_output['CVScore_std'])
    )

    if self.additional_display_metrics:
        print("\nAdditional Scoring Metrics:")
        for metric in self.additional_display_metrics:
            for key, data in self.dp.items():
                if key != 'predict':
                    name = '%s_%s' % (metric, key)
                    print("\t%s (%s): %s" % (
                        metric,
                        key,
                        "{0:.3%}".format(self.classification_output[name])
                    ))
def advanced_scoring_classifiers(probas, actuals, name=None):
    # pandas Series don't play nice here. Make sure our actuals list is indeed a list
    actuals = list(actuals)
    predictions = list(probas)

    print('Here is our brier-score-loss, which is the default value we optimized for while training, '
          'and is the value returned from .score() unless you requested a custom scoring metric')
    print('It is a measure of how close the PROBABILITY predictions are.')
    if name is not None:
        print(name)

    # Sometimes we will be given "flattened" probabilities (only the probability of our positive label),
    # while other times we might be given "nested" probabilities (probabilities of both positive and
    # negative, in a list, for each item).
    try:
        probas = [proba[1] for proba in probas]
    except:
        pass

    print(format(brier_score_loss(actuals, probas), '.4f'))

    print('\nHere is the trained estimator\'s overall accuracy (when it predicts a label, '
          'how frequently is that the correct label?)')
    predicted_labels = []
    for pred in probas:
        if pred >= 0.5:
            predicted_labels.append(1)
        else:
            predicted_labels.append(0)
    print(format(accuracy_score(y_true=actuals, y_pred=predicted_labels) * 100, '.1f') + '%')

    print('\nHere is a confusion matrix showing predictions and actuals by label')
    # It would make sense to use sklearn's confusion_matrix here but it apparently has no labels.
    # Took this idea instead from: http://stats.stackexchange.com/a/109015
    conf = pd.crosstab(pd.Series(actuals), pd.Series(predicted_labels),
                       rownames=['v Actual v'], colnames=['Predicted >'], margins=True)
    print(conf)

    print('Here is the accuracy of our trained estimator at each level of predicted probabilities')
    # create summary dict
    summary_dict = OrderedDict()
    for num in range(0, 110, 10):
        summary_dict[num] = []

    for idx, proba in enumerate(probas):
        proba = math.floor(int(proba * 100) / 10) * 10
        summary_dict[proba].append(actuals[idx])

    for k, v in summary_dict.items():
        if len(v) > 0:
            print('Predicted probability: ' + str(k) + '%')
            actual = sum(v) * 1.0 / len(v)
            # Format into a prettier number
            actual = round(actual * 100, 0)
            print('Actual: ' + str(actual) + '%')
            print('# preds: ' + str(len(v)) + '\n')

    print('\n\n')
def test_alex(self):
    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        # query-feature
        X = self.read_imagelist(filelist_path + filename + extension)
        test_num = np.shape(X)[0]
        out = self.forward_all(data=X)
        predicts = out[self.outputs[0]]
        predicts = np.reshape(predicts, (test_num, 10))
        confusion_array = np.zeros((class_size), dtype=np.int)
        for i in range(test_num):
            actual.append(class_index)
            for j in range(class_size):
                if np.max(predicts[i]) == predicts[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1
        #print(confusion_array)
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1
    print 'total:%d' % (round(total_count))
    print 'accept:%d' % (accept_sum)
    print 'reject:%d' % (round(total_count) - accept_sum)
    print 'accuracy:%.4f' % (accept_sum / total_count)
    #conf_mat = confusion_matrix(actual, predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual, y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

# process a text file
def evaluate(self, metric='cosine'):
    # sample-feature
    X = self.read_imagelist(filelist_sample)
    sample_num = np.shape(X)[0]
    out = self.forward_all(data=X)
    feature1 = np.float64(out['deepid'])
    feature1 = np.reshape(feature1, (sample_num, feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')
    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        # query-feature
        X = self.read_imagelist(filelist_path + filename + extension)
        test_num = np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2 = np.float64(out['deepid'])
        feature2 = np.reshape(feature2, (test_num, feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt = pw.pairwise_distances(feature2, feature1, metric=metric)
        mt = pw.cosine_similarity(feature2, feature1)
        false = 0
        confusion_array = np.zeros((sample_num), dtype=np.int)  # per-class best-match counts
        for i in range(test_num):
            actual.append(class_index)
            for j in range(sample_num):
                if np.max(mt[i]) == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1
    print 'total:%d' % (round(total_count))
    print 'accept:%d' % (accept_sum)
    print 'reject:%d' % (round(total_count) - accept_sum)
    print 'accuracy:%.4f' % (accept_sum / total_count)
    #conf_mat = confusion_matrix(actual, predict)
    #print(conf_mat)
    actual = np.array(actual)
    predict = np.array(predict)
    y_actual = pd.Series(actual, name='Actual')
    y_predict = pd.Series(predict, name='Predicted')
    df_confusion = pd.crosstab(y_actual, y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
    plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

# process a text file
def evaluate2(self, metric='cosine'):
    feature1 = np.fromfile('./features/' + model_name + '-features.dat', dtype=np.float64)
    feature1 = np.reshape(feature1, (class_size, feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')
    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        # query-feature
        X = self.read_imagelist(filelist_path + filename + extension)
        test_num = np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2 = np.float64(out['deepid'])
        feature2 = np.reshape(feature2, (test_num, feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt = pw.pairwise_distances(feature2, feature1, metric=metric)
        mt = pw.cosine_similarity(feature2, feature1)
        false = 0
        confusion_array = np.zeros((class_size), dtype=np.int)  # per-class best-match counts
        for i in range(test_num):
            actual.append(class_index)
            for j in range(class_size):
                if np.max(mt[i]) == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1
    print 'total:%d' % (round(total_count))
    print 'accept:%d' % (accept_sum)
    print 'reject:%d' % (round(total_count) - accept_sum)
    print 'accuracy:%.4f' % (accept_sum / total_count)
    #conf_mat = confusion_matrix(actual, predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual, y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

# process a text file
def _create_significance_table(self, data):
    """
    Create a table containing p values for significance tests. Add features
    of the distributions and the p values to the dataframe.
    """
    # list features of the variable e.g. matched, paired, n_expected
    df = pd.DataFrame(index=self.continuous + self.categorical,
                      columns=['continuous', 'nonnormal', 'min_observed', 'pval', 'ptest'])
    df.index.rename('variable', inplace=True)
    df['continuous'] = np.where(df.index.isin(self.continuous), True, False)
    df['nonnormal'] = np.where(df.index.isin(self.nonnormal), True, False)

    # list values for each variable, grouped by groupby levels
    for v in df.index:
        # compute p value
        is_continuous = df.loc[v]['continuous']
        is_categorical = ~df.loc[v]['continuous']
        is_normal = ~df.loc[v]['nonnormal']

        # if continuous, group data into list of lists
        if is_continuous:
            catlevels = None
            grouped_data = []
            for s in self.groupbylvls:
                lvl_data = data[data[self.groupby] == s].dropna(subset=[v])[v]
                grouped_data.append(lvl_data.values)
            min_observed = len(min(grouped_data, key=len))
        # if categorical, create contingency table
        elif is_categorical:
            catlevels = sorted(data[v].astype('category').cat.categories)
            grouped_data = pd.crosstab(data[self.groupby], data[v])
            # minimum number of observations across all levels
            min_observed = grouped_data.sum(axis=1).min()

        df.loc[v, 'min_observed'] = min_observed

        # compute pvalues
        df.loc[v, 'pval'], df.loc[v, 'ptest'] = self._p_test(
            v, grouped_data, is_continuous, is_categorical,
            is_normal, min_observed, catlevels)

    return df
def draw(self, **kwargs):
    """ Draw time series. """

    def plot_facet(data, color, **kwargs):
        num = []
        date = []
        time = data[self._time_column]
        num = data[self._time_column].apply(self.convert_to_datetime)
        date = data[self._time_column].apply(self.convert_to_timeseries)
        if pd.isnull(num).sum() <= pd.isnull(date).sum():
            data[self._time_column] = num
        else:
            data[self._time_column] = date
        data.dropna(inplace=True)
        if len(self._groupby) == 2:
            ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            ct = ct[pd.notnull(ct.index)]
        else:
            ct = pd.crosstab(
                data[self._time_column],
                pd.Series([""] * len(self._table[self._time_column]), name=""))

        # percentage area plot:
        if self.percentage:
            # if there is only one grouping variable (the time column),
            # the cross table produces a Series, not a data frame. It
            # isn't really very informative to plot it, but we provide
            # for this special case anyway.
            if type(ct) == pd.Series:
                ct = ct.apply(lambda x: 100)
            else:
                ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
            ct.plot(kind="area", ax=plt.gca(), stacked=True,
                    color=self.get_palette(), **kwargs)
        else:
            if self.area:
                # Stacked area plot:
                if len(self._groupby) == 2:
                    self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
                ct.plot(ax=plt.gca(), kind="area", stacked=True,
                        color=self.get_palette(), **kwargs)
            else:
                # Line plot:
                self.vmax = max(self.vmax, ct.values.max())
                ct.plot(ax=plt.gca(), color=self.get_palette())

    self.map_data(plot_facet)

    if self.percentage:
        self.g.set(ylim=(0, 100))
    else:
        self.g.set(ylim=(0, self.vmax))
    self.g.set_axis_labels(self.options["label_x_axis"], self.options["label_y_axis"])

    if len(self._groupby) == 2:
        self.add_legend()
def draw(self):
    """ Draw a heat map. """

    def get_crosstab(data, row_fact, col_fact, row_names, col_names):
        ct = pd.crosstab(data[row_fact], data[col_fact])
        ct = ct.reindex_axis(row_names, axis=0).fillna(0)
        ct = ct.reindex_axis(col_names, axis=1).fillna(0)
        return ct

    def plot(data, color):
        ct = get_crosstab(
            data,
            self._groupby[0],
            self._groupby[1],
            self._levels[0],
            self._levels[1])
        sns.heatmap(ct,
                    robust=True,
                    annot=True,
                    cbar=False,
                    cmap=cmap,
                    fmt="g",
                    vmax=vmax,
                    #ax=plt.gca(),
                    linewidths=1)

    if len(self._groupby) < 2:
        # create a dummy cross tab with one dimension containing empty
        # values:
        data_column = self._table[self._groupby[0]].reset_index(drop=True)
        tab = pd.crosstab(
            pd.Series([""] * len(data_column), name=""),
            data_column)
        plot_facet = lambda data, color: sns.heatmap(
            tab,
            robust=True,
            annot=True,
            cbar=False,
            cmap=cmap,
            fmt="g",
            linewidths=1)
    else:
        plot_facet = plot
        vmax = pd.crosstab(
            [self._table[x] for x in [self._row_factor, self._groupby[0]] if x != None],
            [self._table[x] for x in [self._col_factor, self._groupby[1]] if x != None]).values.max()

    cmap = ListedColormap(self.options["color_palette_values"])
    self.map_data(plot_facet)
def plot_ks_cdf(y_true, y_score, pos_label=1, label_map=None, color_map=None, decimals=0,
                xlabel='Score', ylabel='CumSum', fontsize=12, figsize=(18, 8), close=True):
    '''
    Purpose:
        Plot the cumulative distribution curves of the positive and negative
        classes and report the KS statistic together with its split score.

    Parameters:
        y_true: array-like of true labels, e.g. {0, 1} or {-1, 1}.
        y_score: array-like of predicted scores or probabilities.
        pos_label: int, the label treated as positive.
        label_map: dict mapping labels to display names, e.g. {0: 'Good', 1: 'Bad'}.
        color_map: dict mapping labels to plot colors, e.g. {0: 'g', 1: 'r'}.
        decimals: number of decimals the scores are rounded to.
        xlabel: xlabel of the figure.
        ylabel: ylabel of the figure.
        fontsize: int, font size.
        figsize: tuple, size of the figure.
        close: whether to close the figure before returning.

    Returns:
        dict of {'ks': KS value, 'split': split score at the KS point,
        'fig': the matplotlib figure}.
    '''
    if label_map is None:
        label_map = {0: 'Good', 1: 'Bad'}
    if color_map is None:
        color_map = {0: 'g', 1: 'r'}  # default colors, matching the docstring example
    ks_dict = {}
    y_true = pd.Series(y_true)
    y_score = pd.Series(y_score)
    y_score_dataframe = pd.concat([y_true, y_score], axis=1)
    ks = cal_ks(y_true, y_score_dataframe, pos_label=pos_label, return_split=False, decimals=decimals)
    score_split = cal_ks(y_true, y_score_dataframe, pos_label=pos_label, return_split=True, decimals=decimals)
    crossfreq = pd.crosstab(y_score.round(decimals), y_true)
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    color = crossdens.columns.map(lambda xx: color_map.get(xx, None))
    crossdens = crossdens.rename(columns=label_map)
    crossdens.columns.name = ''
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    crossdens.plot(kind='line', ax=ax, fontsize=fontsize, color=color)
    ax.set_xlabel(xlabel, fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    ax.set_title('CDF Curve (KS=%.2f, SPLIT=%.*f)' % (ks, decimals, score_split), fontsize=fontsize)
    if close:
        plt.close('all')
    ks_dict['ks'] = ks
    ks_dict['split'] = score_split
    ks_dict['fig'] = fig
    return ks_dict
def create_crosstabs(model):
    r"""Create cross-tabulations for categorical variables.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the data.

    Returns
    -------
    model : alphapy.Model
        The model object with the updated feature map.

    """

    logger.info("Creating Cross-Tabulations")

    # Extract model data
    X = model.X_train
    y = model.y_train

    # Extract model parameters
    factors = model.specs['factors']
    target_value = model.specs['target_value']

    # Iterate through columns, dispatching and transforming each feature.
    crosstabs = {}
    for fname in X:
        if fname in factors:
            logger.info("Creating crosstabs for feature %s", fname)
            ct = pd.crosstab(X[fname], y).apply(lambda r: r / r.sum(), axis=1)
            crosstabs[fname] = ct

    # Save crosstabs to the feature map
    model.feature_map['crosstabs'] = crosstabs
    return model


#
# Function get_factors
#
def concordance(series1, series2, method, nreps=1000):
    """
    Measures the concordance between two pandas Series and returns a pvalue
    and measure of concordance.

    Parameters
    ----------
    series1, series2 : pandas Series
        Series with matching indexes.
    method : str
        ['fisher', 'spearman', 'kendalltau', 'empirical', 'cohen']
    nreps : int
        number of repetitions to build the null. Only needed if method is
        'empirical'

    Returns
    -------
    measure : float
        some sort of measure of concordance (e.g. r for the correlation
        methods, n_observed - mean(n_expected) for empirical, etc)
    p : float
        p value of observed concordance between series1 and series2
    """
    if method == 'fisher':
        # Note: this automatically ignores any bugs which were not present
        # in both series.
        mat = pd.crosstab(series1, series2)
        return fisher_exact(mat)
    elif method == 'spearman':
        return spearmanr(series1, series2)
    elif method == 'kendalltau':
        return kendalltau(series1, series2, nan_policy='omit')
    elif method == 'empirical':
        return empirical_pval(series1, series2, nreps)
    elif method == 'cohen':
        tmp = pd.concat((series1, series2), axis=1).dropna()
        return cohen_kappa_score(tmp.iloc[:, 0], tmp.iloc[:, 1]), np.nan
    else:
        raise ValueError('Unknown concordance method.')
def process_clustering(self):
    print("K-Means Clustering in progress...")
    dataset_choice = self.prediction_config.DATASET_LOCATION[self.prediction_config.DATASET_CHOICE]

    if not "affiliation_column" in dataset_choice or not dataset_choice["affiliation_column"]:
        return

    # Explore loaded data
    df = self.prediction_data
    target_column = dataset_choice["target_column"]
    affiliation_column = dataset_choice["affiliation_column"]
    centroids_quantity = self.prediction_config.CENTROIDS_QUANTITY

    # Initialise the K-Means Clustering Model using the specified quantity of clusters (centroids)
    # for training the model using the whole dataset.
    kmeans_model = KMeans(n_clusters=centroids_quantity, random_state=1)

    df_numeric = df.select_dtypes(include=['int', 'int64', 'float64', 'floating'], exclude=['O'])
    print("Excluding non-numeric columns from K-Means Clustering: ", df.select_dtypes(include=['O']).columns.tolist())
    print("All dtypes: ", dict(df.dtypes))
    print("Any rows null?: ", df.isnull().values.any())
    print("Columns/rows with NaN values: ", df[df.isnull().any(axis=1)])

    # Fit the K-Means Model to the DataFrame to calculate the Euclidean Distance of each row
    # to each cluster (centroid) and return a Numpy array with n_columns. Each column represents a
    # cluster (centroid) and indicates how far each row is from the nearest cluster (centroid).
    # Important Note: Pass only numeric dataframe columns
    clustered_row_distances = kmeans_model.fit_transform(df_numeric)

    # Explore clusters by computing a cross-tabulation of the quantity of rows in each
    # clustered_row_distance column and checking how they correspond to unique row values
    # of the Affiliation column (i.e. 'party')
    labels = kmeans_model.labels_

    # Show how many are grouped into say Cluster 0
    # print(labels.tolist().count(0))

    # Count quantity of unique Clusters
    print("Clusters total count: %r" % (len(labels.tolist())))
    print("Clusters unique count: %r" % (len(set(labels.tolist()))))

    cluster_names = list(map(lambda cluster_name: ("Cluster " + str(cluster_name)) if cluster_name else None, labels))
    print("Cross Tabulation between Clustered Labels and Affiliation i.e. 'party' column: \n%r" %
          (pd.crosstab(index=labels, columns=df[affiliation_column])))

    if self.prediction_config.PLOT_KMEANS_OUTLIERS == True:
        self.example_plot_outliers(df, affiliation_column, labels, cluster_names, clustered_row_distances)

    # Generate new DataFrame column to be used as Target Column for Prediction Algorithms
    # (i.e. to detect which roll call votes were most likely to cause extremism such
    # that Senators would not vote along their own party lines)
    extremism = (clustered_row_distances ** 3).sum(axis=1)
    df["extremism"] = extremism
    df.sort_values("extremism", inplace=True, ascending=False)
    print("Top 10 observations ranked in order of 'extremism': %r" % (df.head(10)))
    self.prediction_data.df_listings = df
def fishers_exact_plot(data, condition1, condition2, ax=None,
                       condition1_value=None,
                       alternative="two-sided", **kwargs):
    """
    Perform a Fisher's exact test to compare two binary columns

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from

    condition1: str
        First binary column to compare (and used for test sidedness)

    condition2: str
        Second binary column to compare

    ax : Axes, default None
        Axes to plot on

    condition1_value:
        If `condition1` is not a binary column, split on =/!= to condition1_value

    alternative:
        Specify the sidedness of the test: "two-sided", "less" or "greater"
    """
    plot = sb.barplot(
        x=condition1,
        y=condition2,
        ax=ax,
        data=data,
        **kwargs
    )

    plot.set_ylabel("Percent %s" % condition2)
    condition1_mask = get_condition_mask(data, condition1, condition1_value)

    count_table = pd.crosstab(data[condition1], data[condition2])
    print(count_table)
    oddsratio, p_value = fisher_exact(count_table, alternative=alternative)
    add_significance_indicator(plot=plot, significant=p_value <= 0.05)
    only_percentage_ticks(plot)

    if alternative != "two-sided":
        raise ValueError("We need to better understand the one-sided Fisher's Exact test")
    sided_str = "two-sided"
    print("Fisher's Exact Test: OR: {}, p-value={} ({})".format(oddsratio, p_value, sided_str))

    return FishersExactResults(oddsratio=oddsratio,
                               p_value=p_value,
                               sided_str=sided_str,
                               with_condition1_series=data[condition1_mask][condition2],
                               without_condition1_series=data[~condition1_mask][condition2],
                               plot=plot)
def rfFitScore(clf, dftrain, dftrain_y, dftest, dftest_y):
    '''random forest classifier fit and score.
       clf=RandomForestClassifier, dftrain=train data,
       dftrain_y=train data Y, dftest=test data,
       dftest_y=test data Y'''

    clfit = clf.fit(dftrain, dftrain_y['Y'])  # clf.fit(X, y)

    imp = clfit.feature_importances_  # ndarray of 562

    # clfit.fit_transform(X, y=None)  # returns X_new

    new_y = clfit.predict(dftest)  # returns predicted Y

    test_score = clfit.score(dftest, dftest_y['Y'])
    print("test score:", test_score)

    # clfit.oob_score_
    if (clf.oob_score):
        print("oob score", clfit.oob_score_)

    # calculate test score by other means
    print("predict True %.3f percent, %d out of %d" %
          ((100 * sum(dftest_y['Y'] == new_y) / dftest_y.shape[0]),
           sum(dftest_y['Y'] == new_y), dftest_y.shape[0]))
    print("predict False %.3f percent, %d out of %d" %
          ((100 * sum(dftest_y['Y'] != new_y) / dftest_y.shape[0]),
           sum(dftest_y['Y'] != new_y), dftest_y.shape[0]))

    # new_p = clfit.predict_proba(dftest)
    # # probability of each X variable to predict each y class
    # print("test predict probabilities head:\n", new_p[:5])

    # cross table of variable predictions
    ptab = pd.crosstab(dftest_y['Y'], new_y,
                       rownames=['actual'], colnames=['predicted'])
    print("cross table:\n", ptab)

    # accuracy: percent labeled correctly
    # precision: true positives / (true positives + false positives)
    # recall: true positives / (true positives + false negatives)
    precision, recall, fbeta, support = prfs(dftest_y['Y'], new_y)
    print("precision", precision, "\nrecall", recall,
          "\nfbeta", fbeta, "\nsupport", support)

    if (clf.oob_score):
        return test_score, imp, clfit.oob_score_
    else:
        return test_score, imp
def get_data():
    f_path = "../dataset/logistic_regression/UCLA_dataset.csv"
    df = pd.read_csv(f_path)
    print df.head()
    print df.describe()
    print df.std()
    print pd.crosstab(df['admit'], df['rank'], rownames=['admit'])
    # df.hist()
    # pl.show()
    # dummy_ranks = pd.get_dummies(df['rank'], prefix='rank')
    # print dummy_ranks.head()
    # train_cols = df.columns[1:]
    # lr = sm.Logit(df['admit'], df[train_cols])
    # ret = lr.fit()
    # print ret.summary()
    train, test = train_test_split(df, test_size=0.2)
    train_x, train_y = train[train.columns[1:]], train['admit']
    test_x, test_y = test[test.columns[1:]], test['admit']
    lr = LogisticRegression()
    lr.fit(train_x, train_y)
    y_pred = lr.predict(test_x)
    print accuracy_score(test_y, y_pred)
    rf = RandomForestClassifier(n_jobs=4)
    rf.fit(train_x, train_y)
    Y_pred = rf.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, Y_pred)
    print cnf_matrix
    accuracy_percent = accuracy_score(test_y, Y_pred)
    print "accuracy is: %s%s" % (accuracy_percent, '%')
    recall_percent = recall_score(test_y, Y_pred)
    print "recall is: %s%s" % (recall_percent, '%')