The following 38 code examples, extracted from open-source Python projects, illustrate how to use pandas.cut().
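Before the project snippets, a minimal standalone sketch of the pd.cut() features they rely on; the values here are made up for illustration.

import pandas as pd

ages = pd.Series([3, 17, 25, 42, 67])

# An integer `bins` asks for that many equal-width bins over the data range
print(pd.cut(ages, bins=3))

# Explicit edges plus labels; include_lowest=True keeps the left edge of the first bin
groups = pd.cut(ages, bins=[0, 18, 65, 100],
                labels=['minor', 'adult', 'senior'], include_lowest=True)
print(groups.value_counts())

# labels=False returns integer bin codes; retbins=True also returns the computed edges
codes, edges = pd.cut(ages, bins=3, labels=False, retbins=True)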
def transform(self, x):
    """
    Parameters:
        x (Sequence): the values to bin

    Returns:
        np.array: the bin interval of each value, as a numpy array of strings
    """
    s = pd.cut(x, bins=self.bins)
    d = pd.get_dummies(s)
    z = d.T.to_dict()
    re = []
    for i, v in z.items():
        for j, u in v.items():
            if u == 1:
                re.append(str(j))
    return np.array(re)
def test_facet_wrap_expression():
    p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
    assert p == 'facet_wrap_expression'
def update(attrname, old, new):
    new_selected, new_x_factors, new_y_factors = get_subset(dictionary_selector.value, dictionary_selector.value)
    # there must be one more bin edge than there are labels (len(colorpalette))
    bins = np.linspace(new_selected.counts.min(), new_selected.counts.max(), 10)
    new_selected["color"] = pd.cut(new_selected.counts, bins,
                                   labels=list(reversed(palettes.Blues9)), include_lowest=True)
    new_selected["wikidataID"] = new_selected["x"].map(lambda x: wikidataIDs.get(x))
    fig.xaxis.axis_label = dictionary_selector.value
    fig.yaxis.axis_label = dictionary_selector.value
    fig.title.text = "Top %d fact co-occurrences selected" % top_n.value
    src = ColumnDataSource(dict(
        x=new_selected["x"].astype(object),
        y=new_selected["y"].astype(object),
        color=new_selected["color"].astype(object),
        wikidataID=new_selected["wikidataID"],
        counts=new_selected["counts"].astype(int),
        raw=new_selected["raw"].astype(int)))
    source.data.update(src.data)
    fig.x_range.update(factors=new_x_factors[:top_n.value])
    fig.y_range.update(factors=new_y_factors[:top_n.value])
def plot_tendencies(word_list, pos_dic, bin_size, output_dir, file_name):
    plt.figure()
    dataframe_list = list()
    for word in word_list:
        if word not in pos_dic:
            raise Exception('Word ' + word + ' not found')
        df = pd.DataFrame(pos_dic[word], columns=['pos'])
        df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size),
                            labels=range(0, 100, bin_size))
        df = df.groupby(['bins'])['bins'].count()
        dataframe_list.append(df)
    df_final = pd.DataFrame(pd.concat(dataframe_list, axis=1)).fillna(0)
    df_final.columns = word_list
    ax = df_final.plot()
    ax.set_xlabel("Position (as % of the description length)")
    ax.set_ylabel("Number of occurrences")
    plt.title('Position of the words within the job descriptions', y=1.08)
    plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
def discretize(data, bins=5, quantile=False):
    '''
    Discretizes the data into `bins` bins, using pd.cut by default
    and pd.qcut (quantile-based) when quantile=True.
    '''
    if quantile:
        new_data = pd.qcut(data, bins, labels=list(range(bins)))
    else:
        new_data = pd.cut(data, bins, labels=list(range(bins)))
    return new_data
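A quick illustration of the two branches above (assuming discretize() is in scope): on skewed data, pd.cut yields equal-width bins with very uneven counts, while pd.qcut yields roughly equal-frequency bins.

import numpy as np
import pandas as pd

skewed = pd.Series(np.random.exponential(scale=1.0, size=1000))
print(discretize(skewed, bins=5).value_counts(sort=False))                 # equal width, uneven counts
print(discretize(skewed, bins=5, quantile=True).value_counts(sort=False))  # roughly 200 rows per bin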
def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric,
                ordered_x_values, ordered_y_values, maximal_bubble_size=4000,
                normalization_by_all=False):
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                             pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y],
                             df[z_boolean]], axis=1)
    count_table = count_table.groupby([x, z_boolean])[y].value_counts().unstack().fillna(0)
    count_table = count_table.unstack()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    z_boolean_values = count_table_long[z_boolean].unique()
    ratio = pd.DataFrame({'ratio': count_table_long.set_index([x, y, z_boolean]).unstack()['value'][z_boolean_values[1]] / (
        count_table_long.set_index([x, y, z_boolean]).unstack()['value'].sum(axis=1))})
    count_table_long = count_table_long.set_index([x, y])[['value']].merge(
        ratio, left_index=True, right_index=True).reset_index()
    size_factor = maximal_bubble_size / count_table_long['value'].max()
    x_values_dict = {x: i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx: get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x: i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(len(ordered_x_values)) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(len(ordered_y_values)) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    plt.scatter(count_table_long[x], count_table_long[y],
                s=size_factor * count_table_long['value'],
                c=count_table_long['ratio'], alpha=0.5, cmap='cool')
    return count_table_long, xticks, yticks, xticklabels, yticklabels
def test_probabilities(model: ClassifierMixin, X: np.array, y: pd.Series,
                       bins: int = 10, threshold: float = 0.5):
    """Print confusion matrix based on class probability."""
    probs = [p[1] for p in model.predict_proba(X)]
    print('\tProbabilities')
    df = pd.DataFrame({'prob': probs, 'label': y})
    step = 1 / bins
    cut_labels = [round(step * f, 1) for f in range(bins)]  # one label per bin
    by_prob = (df.groupby(pd.cut(df['prob'], bins, labels=cut_labels))
               .agg(['sum', 'count'])['label'])
    print('\t\tprobs\t1\t0\tacc')
    for index, row in by_prob.iloc[::-1].iterrows():
        ones = row['sum']
        if math.isnan(ones):
            ones = 0
        else:
            ones = int(ones)
        count = row['count']
        zeros = int(count) - ones
        if count > 0:
            acc = zeros / count if index < threshold else ones / count
        else:
            acc = 0.0
        print(f'\t\t{index}\t{ones}\t{zeros}\t{acc:.3f}')
def _discretize_by_width(col, num_bins, labels):
    maxvalue = col.max()
    minvalue = col.min()
    width = float(maxvalue - minvalue) / num_bins
    bins = [minvalue + x * width for x in range(num_bins)] + [maxvalue]
    if labels:
        if len(labels) != num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins, labels=group_names, include_lowest=True)
def _discretize_by_frequency(col, num_bins, labels):
    percent = 1.0 / num_bins
    bins = sorted(list(set(col.quantile([x * percent for x in range(num_bins + 1)]))))
    if len(bins) - 1 < num_bins:
        num_bins = len(bins) - 1
        print('...Only %d bins (unbalanced) generated due to overlapping percentile boundaries.' % num_bins)
    if labels:
        if len(labels) != num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins, labels=group_names, include_lowest=True)
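An illustrative run of the two helpers above (assuming both are in scope) on a column with many repeated values; the frequency-based variant collapses duplicate quantile edges and falls back to fewer bins:

import pandas as pd

col = pd.Series([0, 0, 0, 0, 0, 1, 2, 3, 9, 10])
print(_discretize_by_width(col, num_bins=5, labels=None).value_counts(sort=False))
# the quantile edges for 5 bins overlap here, so only 3 (unbalanced) bins come back
print(_discretize_by_frequency(col, num_bins=5, labels=None))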
def compute_group(cls, data, scales, **params):
    bins = params['bins']
    breaks = params['breaks']
    binwidth = params['binwidth']
    boundary = params['boundary']

    func = make_summary_fun(params['fun_data'], params['fun_y'],
                            params['fun_ymin'], params['fun_ymax'],
                            params['fun_args'])

    breaks = fuzzybreaks(scales.x, breaks, boundary, binwidth, bins)
    data['bin'] = pd.cut(data['x'], bins=breaks, labels=False,
                         include_lowest=True)

    def func_wrapper(data):
        """
        Add `bin` column to each summary result.
        """
        result = func(data)
        result['bin'] = data['bin'].iloc[0]
        return result

    # This is a plyr::ddply
    out = groupby_apply(data, 'bin', func_wrapper)
    centers = (breaks[:-1] + breaks[1:]) * 0.5
    bin_centers = centers[out['bin'].values]
    out['x'] = bin_centers
    out['bin'] += 1
    if isinstance(scales.x, scale_discrete):
        out['width'] = 0.9
    else:
        out['width'] = np.diff(breaks)[bins - 1]
    return out
def test_facet_grid_expression():
    p = g + facet_grid(
        ['var2', 'pd.cut(var1, (0, 2, 4), include_lowest=True)'])
    assert p == 'facet_grid_expression'
def plot_tendency(word, pos_dic, bin_size, output_dir, file_name):
    plt.figure()
    if word not in pos_dic:
        raise Exception('Word ' + word + ' not found')
    df = pd.DataFrame(pos_dic[word], columns=['pos'])  # .groupby(['pos'])['pos'].count()
    df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size),
                        labels=range(0, 100, bin_size))
    df = df.groupby(['bins'])['bins'].count()
    ax = df.plot(title="Position of the word '" + word + "' within the job descriptions")
    ax.set_xlabel("Position (as % of the description length)")
    ax.set_ylabel("Number of occurrences")
    plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
def symbolize(self, xs):
    """
    Symbolize a PAA (Piecewise Aggregate Approximation) sequence
    """
    alphabet_sz = len(self.alphabet)
    cutpoints = self.cutpoints[alphabet_sz]
    return pd.cut(xs, bins=cutpoints, labels=self.alphabet)
def is_not_uniform(idx, nbins=10, allowed_gap=0.75):
    idx_bins = pd.cut(idx, bins=nbins, labels=False)
    idx_bin_size = np.bincount(idx_bins)
    diff = idx_bin_size[:-1] - idx_bin_size[1:]
    monotonic = (diff < 0).all() or (diff > 0).all()
    huge_gap = (idx_bin_size.min() * 1.0 / idx_bin_size.max()) < allowed_gap
    return monotonic or huge_gap
def make_object_map(data, field, **kwargs):
    linear = False
    for key, value in kwargs.items():
        if key == 'linear':
            linear = value
    print(linear)
    if linear == False:
        colors, rangelist = make_distributed_range(data, field)
    else:
        colors = get_heatmap51()
        colors2 = colors
        maxvalue = data[field].max()
        if maxvalue < 51:
            totallist = range(maxvalue)
            colors = reduce_color_list_size(totallist, colors)
            colors, rangelist = make_gradient_range(data[field].min(), maxvalue, colors)
        else:
            colors = reduce_color_list_size(range(len(data)), colors)
            colors, rangelist = make_gradient_range(data[field].min(), maxvalue, colors)
        if not rangelist[0] == 0:
            rangelist = [0] + rangelist[1:]
        data['COLORKEY'] = pd.cut(data[field], bins=rangelist + [1000000000], labels=colors)
        return data
    colors2 = get_heatmap51()
    if not rangelist[0] == 0:
        rangelist = [0] + rangelist[1:]
    data['COLORKEY'] = pd.cut(data[field], bins=rangelist, labels=colors[1:])
    return data
def process_dataset():
    data_dir = os.path.dirname(__file__)
    df = pd.read_csv(os.path.join(data_dir, 'data/frisk/frisk_with_noise.dat'),
                     skiprows=6, delim_whitespace=True)

    # compute proportion black in precinct, black = 1
    # first aggregate by precinct/ethnicity, and sum over populations
    popdf = df[['pop', 'precinct', 'eth']]. \
        groupby(['precinct', 'eth'])['pop'].apply(sum)
    percent_black = np.array([popdf[i][1] / float(popdf[i].sum())
                              for i in range(1, 76)])
    precinct_type = pd.cut(percent_black, [0, .1, .4, 1.])
    # df['precinct_type'] = precinct_type.codes[df.precinct.values-1]
    return df
def busmap_by_rectangular_grid(buses, divisions=10):
    busmap = pd.Series(0, index=buses.index)
    if isinstance(divisions, tuple):
        divisions_x, divisions_y = divisions
    else:
        divisions_x = divisions_y = divisions
    gb = buses.groupby([pd.cut(buses.x, divisions_x), pd.cut(buses.y, divisions_y)])
    for nk, oks in enumerate(itervalues(gb.groups)):
        busmap.loc[oks] = nk
    return busmap
def test_groupby_categorical_unequal_len(self):
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    # The raises only happens with categorical, not with series of types
    # category
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
def setUp(self):
    self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])

    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False,
                               labels=labels)
    self.cat = df
def test_series_functions_no_warnings(self):
    df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
    labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
    with tm.assert_produces_warning(False):
        df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
                             labels=labels)
def test_assignment_to_dataframe(self):
    # assignment
    df = DataFrame({'value': np.array(
        np.random.randint(0, 10000, 100), dtype='int32')})
    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]

    df = df.sort_values(by=['value'], ascending=True)
    s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)

    d = s.values
    df['D'] = d
    str(df)

    result = df.dtypes
    expected = Series(
        [np.dtype('int32'), com.CategoricalDtype()], index=['value', 'D'])
    tm.assert_series_equal(result, expected)

    df['E'] = s
    str(df)

    result = df.dtypes
    expected = Series([np.dtype('int32'), com.CategoricalDtype(),
                       com.CategoricalDtype()],
                      index=['value', 'D', 'E'])
    tm.assert_series_equal(result, expected)

    result1 = df['D']
    result2 = df['E']
    self.assertTrue(result1._data._block.values.equals(d))

    # sorting
    s.name = 'E'
    self.assertTrue(result2.sort_index().equals(s.sort_index()))

    cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
    df = pd.DataFrame(pd.Series(cat))
def bubble_plot(df, x, y, z_boolean=None, ordered_x_values=None, ordered_y_values=None,
                bins_x=10, bins_y=10, fontsize=16, figsize=(10, 5),
                maximal_bubble_size=4000, normalization_by_all=False, log=False):
    """
    :param df: dataframe
    :param x: name of the first numerical/categorical field (string) (for the x-axis)
    :param y: name of the second numerical/categorical field (string) (for the y-axis)
    :param z_boolean: name of a categorical field with two categories / boolean field (for coloring)
    :param ordered_x_values: the values we would like to map from the x categorical variable,
           in the order we would like to present them
    :param ordered_y_values: the values we would like to map from the y categorical variable,
           in the order we would like to present them
    :param bins_x: the bins for the x values if x is numeric
    :param bins_y: the bins for the y values if y is numeric
    :param normalization_by_all: True - shows the joint distribution p(x,y);
           False - shows the conditional distribution p(y|x)
    :param maximal_bubble_size: if the bubbles are too big or too small, this is the parameter to change!
    :param log: whether to apply log on the count (influences the size of the bubbles)
    :return: a nice bubble plot; bubble size is proportional to the frequency of the bucket :)
    """
    plt.figure(figsize=figsize)
    x_is_numeric = df[x].dtype in (float, int) and ordered_x_values is None
    y_is_numeric = df[y].dtype in (float, int) and ordered_y_values is None
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                             pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]], axis=1)
    count_table = count_table.groupby(x)[y].value_counts().unstack().fillna(0)
    ordered_x_values = count_table.index.values if ordered_x_values is None else ordered_x_values
    ordered_y_values = count_table.columns if ordered_y_values is None else ordered_y_values
    if z_boolean is not None:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_with_z(
            df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric,
            ordered_x_values, ordered_y_values, maximal_bubble_size,
            normalization_by_all=normalization_by_all)
    else:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_without_z(
            df, x, y, z_boolean, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric,
            ordered_x_values, ordered_y_values, normalization_by_all=normalization_by_all,
            log=log, maximal_bubble_size=maximal_bubble_size)
    plt.xticks(xticks, xticklabels, fontsize=fontsize)
    plt.yticks(yticks, yticklabels, fontsize=fontsize)
    plt.xlabel(x, fontsize=fontsize)
    plt.ylabel(y, fontsize=fontsize)
    if z_boolean is None:
        plt.title("{} vs {} ".format(y, x), fontsize=fontsize + 4)
    else:
        plt.title("{} vs {} and {} (in colors)".format(y, x, z_boolean), fontsize=fontsize + 4)
def transform_with_woe(model_data):
    cut_point = model_config.logistic_cut
    for key in cut_point.keys():
        cutss = cut_point[key]['cut_point']
        wwoe = cut_point[key]['woe']
        model_data[key] = pd.cut(model_data[key], bins=cutss,
                                 labels=range(len(cutss) - 1)).map(lambda x: wwoe[x])
    return model_data
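The configuration shape that transform_with_woe() assumes can be read off its accesses; a hypothetical example (feature name, edges, and WOE values are all made up):

# Hypothetical model_config.logistic_cut entry: bin edges per feature, plus the
# WOE value substituted for each bin label (labels are range(len(edges) - 1)).
logistic_cut = {
    'age': {
        'cut_point': [0, 25, 40, 60, 120],             # 5 edges -> 4 bins
        'woe': {0: -0.35, 1: 0.10, 2: 0.42, 3: 0.05},  # one WOE per bin label
    },
}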
def create_categorical_features(df, label_list, random_state=None):
    """
    Creates random categorical variables

    :param df: data frame we're operating on
    :param label_list: A list of lists, each list is the labels for one categorical variable
    :param random_state: the numpy RandomState
    :return: A modified dataframe

    Example:

    create_categorical_features(df, [['a', 'b'], ['red', 'blue']])
    """
    random_state = get_random_state(random_state)

    df = df.copy()
    n_categorical = len(label_list)

    # get numeric columns ONCE so we don't have to do it every time we loop:
    numer_cols = [col for col in df.select_dtypes(include=['number']).columns if col != 'y']

    for i in range(0, n_categorical):
        # we might be out of numerical columns!
        if not numer_cols:
            break

        # choose a random numeric column that isn't y
        chosen_col = random_state.choice(numer_cols)
        # pop the chosen_col out of the numer_cols
        numer_cols.pop(numer_cols.index(chosen_col))

        # use cut to convert that column to categorical
        df[chosen_col] = pd.cut(df[chosen_col], bins=len(label_list[i]), labels=label_list[i])

    return df
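A possible call, assuming get_random_state() accepts None or an int and returns a numpy RandomState, as the docstring suggests:

import numpy as np
import pandas as pd

df = pd.DataFrame({'f1': np.random.randn(100),
                   'f2': np.random.randn(100),
                   'y': np.random.randint(0, 2, 100)})
out = create_categorical_features(df, [['a', 'b'], ['red', 'blue', 'green']])
print(out.dtypes)  # f1 and f2 become 2- and 3-level category columns; 'y' is left alone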
def binColumns(inputDF, bins=DEFAULT_BIN_COUNT):
    columns = inputDF.columns
    binned = pd.DataFrame(columns=columns)

    for col in columns:
        s = inputDF[col]
        binned[col] = pd.cut(s, bins, labels=False)

    return binned

# TBD: Finish refactoring this
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
    data = pd.read_pickle(file_in)['close']
    data = data.reshape(-1, 24)
    data = np.array([data[i:i + 24] for i in range(data.shape[0] - 24 + 1)])
    data_s = {
        'open_price': np.array([data[i][0][0] for i in range(data.shape[0] - 1)]),
        'close_price': np.array([data[i][int(NUM_PIX / 24) - 1][23] for i in range(data.shape[0] - 1)]),
        'max_price': np.array([data[i].max() for i in range(data.shape[0] - 1)]),
        'min_price': np.array([data[i].min() for i in range(data.shape[0] - 1)]),
        'mean_price': np.array([data[i].mean() for i in range(data.shape[0] - 1)]),
        'median_price': np.array([np.median(data[i]) for i in range(data.shape[0] - 1)]),
        'buy_or_sell': np.array(
            [int(data[i + 1][int(NUM_PIX / 24) - 1][23] > data[i + 1][0][0]) for i in range(data.shape[0] - 1)]),
        'change': np.array(
            [(data[i + 1][int(NUM_PIX / 24) - 1][23] - data[i + 1][0][0]) / data[i + 1][int(NUM_PIX / 24) - 1][23] * 100
             for i in range(data.shape[0] - 1)])}
    data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
    labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
    data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
    bins = [-100, -5, -2, 0, 2, 5, 100]
    labels = [-3, -2, -1, 1, 2, 3]
    data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
    data = data.reshape(len(data), NUM_PIX)
    np.save(file_out[0], data[:len(data) - 1])
    data_s.to_pickle(file_out[1])
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
    data = pd.read_pickle(file_in)['close']
    data = np.array([data[i:i + 576] for i in range(data.shape[0] - 576 + 1)])
    data = data.reshape(-1, 576)
    data_s = {
        'open_price': np.array([data[i][0] for i in range(data.shape[0] - 576)]),
        'close_price': np.array([data[i][575] for i in range(data.shape[0] - 576)]),
        'max_price': np.array([data[i].max() for i in range(data.shape[0] - 576)]),
        'min_price': np.array([data[i].min() for i in range(data.shape[0] - 576)]),
        'mean_price': np.array([data[i].mean() for i in range(data.shape[0] - 576)]),
        'median_price': np.array([np.median(data[i]) for i in range(data.shape[0] - 576)]),
        'buy_or_sell': np.array(
            [int(data[i + 576][575] > data[i + 576][0]) for i in range(data.shape[0] - 576)]),
        'change': np.array(
            [(data[i + 576][575] - data[i + 576][0]) / data[i + 576][575] * 100
             for i in range(data.shape[0] - 576)])}
    data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
    bins = [0.01 * x for x in bins]
    labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
    data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
    bins = [-100, -5, -2, 0, 2, 5, 100]
    bins = [0.01 * x for x in bins]
    labels = [-3, -2, -1, 1, 2, 3]
    data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
    np.save(file_out[0], data[:len(data) - 576])
    data_s.to_pickle(file_out[1])
def period_by_hours(x, separation):
    '''
    Aggregates x into hour intervals.
    The computation would be simple if intervals were not allowed
    to span across midnight.
    '''
    print(separation)
    assert isinstance(separation, list)
    assert all([sep < 24 for sep in separation])
    separation.sort()

    if 0 in separation:
        separation.append(24)
        hour_categ = pd.cut(x.dt.hour, separation, right=False)
        date_categ = x.dt.date
        return date_categ.astype(str) + ' ' + hour_categ.astype(str)
    else:
        hour = x.dt.hour
        hour_categ = pd.cut(hour, separation, right=False).astype(str)
        night_categ = '[' + str(separation[-1]) + ', ' + str(separation[0]) + ')'
        hour_categ[(hour < separation[0]) | (hour >= separation[-1])] = night_categ
        assert hour_categ.nunique(dropna=False) == len(separation)
        date_categ = x.dt.date.astype(str)
        # shift the early hours of the day back to the previous date
        decale = x.dt.date[x.dt.hour < separation[1]] + pd.DateOffset(days=-1)
        date_categ[x.dt.hour < separation[1]] = decale.astype(str)
        assert all(date_categ.str.len() == 10)
        return date_categ + ' ' + hour_categ
def predictions_vs_actual_classification(model_results, model_name, n_bins, figsize=(7, 3)):
    holdout = model_results.holdout_data
    target = model_results.target
    bins = np.arange(0, 1.001, 1 / n_bins)
    bin_mids = (bins[:-1] + bins[1:]) / 2
    binned = pd.cut(holdout['prediction'], bins=bins)
    bin_counts = holdout.groupby(binned)[target].count()
    bin_means = holdout.groupby(binned)[target].mean()
    fig = plt.figure(figsize=figsize)
    plt.suptitle('{0}: Predictions vs Actual'.format(model_name), fontsize=14)
    ax1 = plt.gca()
    ax1.grid(False)
    ax1.bar(bin_mids, bin_counts, width=1 / n_bins, color=sns.light_palette('green')[1],
            label='row count', edgecolor='black')
    ax1.set_xlabel('predicted probability')
    ax1.set_ylabel('row count')
    ax2 = ax1.twinx()
    ax2.plot(bin_mids, bin_means, linewidth=3, marker='.', markersize=16, label='actual rate')
    ax2.plot(bins, bins, color=sns.color_palette()[2], label='main diagonal')
    ax2.set_ylabel('actual rate')
    handles, labels = ax1.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()
    legend = plt.legend(handles + handles2, labels + labels2, loc='best',
                        frameon=True, framealpha=0.7)
    frame = legend.get_frame()
    frame.set_facecolor('white')
    return fig
def prepare_input_data(self, input_data, name="", category_map=None):
    '''
    Prepare input data dicts
    '''
    print("-" * 40 + " Preparing %s" % name)
    X = input_data[self.continuous_columns].values.astype(np.float32)
    Y = input_data[self.label_column].values.astype(np.float32)
    Y = Y.reshape([-1, 1])
    if self.verbose:
        print(" Y shape=%s, X shape=%s" % (Y.shape, X.shape))

    X_dict = {"wide_X": X}

    if 'deep' in self.model_type:
        # map categorical value strings to integers
        td = input_data
        if category_map is None:
            category_map = {}
            for cc in self.categorical_columns:
                if not cc in td.columns:
                    continue
                cc_values = sorted(td[cc].unique())
                cc_max = 1 + len(cc_values)
                cc_map = dict(zip(cc_values, range(1, cc_max)))  # start from 1 to avoid 0:0 mapping (save 0 for missing)
                if self.verbose:
                    print(" category %s max=%s, map=%s" % (cc, cc_max, cc_map))
                category_map[cc] = cc_map

        td = td.replace(category_map)

        # bin ages (cuts off extreme values)
        age_bins = [0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535]
        td['age_binned'] = pd.cut(td['age'], age_bins, labels=False)
        td = td.replace({'age_binned': {np.nan: 0}})
        print(" %d age bins: age bins = %s" % (len(age_bins), age_bins))

        X_dict.update({("%s_in" % cc): td[cc].values.astype(np.int32).reshape([-1, 1])
                       for cc in self.categorical_columns})

    Y_dict = {"Y": Y}
    if self.verbose:
        print("-" * 40)
    return X_dict, Y_dict, category_map
def discretize(data, vars_to_discretize, n_bins):
    '''
    Accepts data, a dictionary containing the discretization type for selected variables,
    and a dictionary containing the number of bins for selected variables.

    Returns data after the selected variables have been discretized,
    together with the binning definition for each variable.
    '''
    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:

        out = None
        binning = None

        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal':
            out, binning = ps.cut(data_subset.ix[:, i], bins=n_bins[i],
                                  labels=False, retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.ix[:, i], q=nb,
                                           labels=False, retbins=True)
                    break
                except:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.ix[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]

        data_subset.ix[:, i] = out
        # replace NA values with a special index (1 + max) -
        # if it has not been done automatically, as in np.digitize
        data_subset.ix[:, i][data_subset.ix[:, i].isnull()] = data_subset.ix[:, i].max() + 1
        bins[i] = binning

    return data_subset, bins
def test_groupby_categorical_two_columns(self):
    # https://github.com/pydata/pandas/issues/8138
    d = {'cat': pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
                               ordered=True),
         'ints': [1, 1, 2, 2],
         'val': [10, 20, 30, 40]}
    test = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = test.groupby("cat")
    res = groups_single_key.agg('mean')
    exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
                    index=pd.CategoricalIndex(["a", "b", "c"], name="cat"))
    tm.assert_frame_equal(res, exp)

    # Grouping on two columns
    groups_double_key = test.groupby(["cat", "ints"])
    res = groups_double_key.agg('mean')
    exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan],
                     "cat": ["a", "a", "b", "b", "c", "c"],
                     "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints"])
    tm.assert_frame_equal(res, exp)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = test[(test.cat == c) & (test.ints == i)]
        assert_frame_equal(result, expected)

    d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
    test = pd.DataFrame(d)
    values = pd.cut(test['C1'], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = test.groupby([values, 'C2'])
    res = groups_double_key.agg('mean')
    nan = np.nan
    idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"], [1, 2, 3, 4]],
                                  names=["cat", "C2"])
    exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, nan, nan, nan, nan, 4, 5],
                     "C3": [nan, nan, nan, nan, 10, 100, nan, nan, nan, nan, 200, 34]},
                    index=idx)
    tm.assert_frame_equal(res, exp)
def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model"):
    column_list = training_df.columns.tolist()
    threshold = 5
    red_wine_cleaned = training_df.copy()
    red_wine_cleaned = _outliers(red_wine_cleaned, threshold, column_list[0:-1])

    # Bin the data
    bins = [3, 5, 6, 8]
    red_wine_cleaned['category'] = pd.cut(red_wine_cleaned.quality, bins,
                                          labels=['Bad', 'Average', 'Good'],
                                          include_lowest=True)

    # Only include 'Bad' and 'Good' categories
    red_wine_newcats = red_wine_cleaned[red_wine_cleaned['category'].isin(['Bad', 'Good'])].copy()

    bins = [3, 5, 8]
    red_wine_newcats['category'] = pd.cut(red_wine_newcats.quality, bins,
                                          labels=['Bad', 'Good'], include_lowest=True)

    y_red_wine = red_wine_newcats[['category']].get_values()

    # Removing fixed_acidity and quality
    X_red_wine = red_wine_newcats.iloc[:, 1:-2].get_values()

    y_red_wine_raveled = y_red_wine.ravel()
    y_red_wine_integers = [y.replace('Bad', '1') for y in y_red_wine_raveled]
    y_red_wine_integers = [y.replace('Good', '0') for y in y_red_wine_integers]
    y_red_wine_integers = [np.int(y) for y in y_red_wine_integers]

    y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2)

    X_train, X_test, y_train, y_test = train_test_split(X_red_wine, y_one_hot,
                                                        test_size=0.2, random_state=42)

    # model
    with tf.variable_scope("softmax_regression"):
        X = tf.placeholder("float", [None, 10])
        y, variables = softmax_regression(X)

    # train
    y_ = tf.placeholder("float", [None, 2])
    cost = -tf.reduce_mean(y_ * tf.log(y))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    init = tf.initialize_all_variables()
    self.sess.run(init)
    for i in range(100):
        average_cost = 0
        number_of_batches = int(len(X_train) / batch_size)
        for start, end in zip(range(0, len(X_train), batch_size),
                              range(batch_size, len(X_train), batch_size)):
            self.sess.run(optimizer, feed_dict={X: X_train[start:end], y_: y_train[start:end]})
            # Compute average loss
            average_cost += self.sess.run(cost, feed_dict={X: X_train[start:end],
                                                           y_: y_train[start:end]}) / number_of_batches
        print(self.sess.run(accuracy, feed_dict={X: X_test, y_: y_test}))

    filename = "data/softmax_regression.ckpt"
    path = self.save_locally(filename)
    self.save_to_s3(path, model_name)
    print("Saved:", path)
def plot_yield_by_quality():
    # Close any previous plots
    plt.close('all')
    # Read in seqlength and time from ALL_READS dataframe
    new_yield_data = ALL_READS[['time', "seq_length", "av_qual"]]
    # Bin qualities
    qual_bins = [0] + QUALITY_BINS + [new_yield_data["av_qual"].max()]
    # Cut yield data into quality bins
    new_yield_data["descriptive_quality"] = pd.cut(new_yield_data["av_qual"], qual_bins,
                                                   labels=[description for description
                                                           in reversed(QUALITY_DESCRIPTIONS)])
    # Time as index and drop av_qual column
    new_yield_data.set_index(pd.DatetimeIndex(new_yield_data['time']), inplace=True)
    new_yield_data.drop('av_qual', axis=1, inplace=True)
    # Obtain cumulative sum by quality bin in each minute.
    yield_data_grouped = new_yield_data.groupby("descriptive_quality").apply(
        lambda d: d.resample("1T").sum().fillna(0))['seq_length']
    # Create a dict of dataframes based on groups.
    yield_data_by_quality = {description: yield_data_grouped[description].to_frame().reset_index()
                             for description in QUALITY_DESCRIPTIONS}

    for description, yield_df in yield_data_by_quality.items():
        yield_df.reset_index(inplace=True)
        yield_df.set_index("time", inplace=True)
        yield_df = yield_df.reindex(index=YIELD_DATA.time, fill_value=0)
        yield_df.reset_index(inplace=True)
        # Generate a cumulative sum of sequence data
        yield_df['cumsum_bp'] = yield_df['seq_length'].cumsum()
        # Convert time to timedelta format and then to float format, in hours.
        yield_df['duration_tdelta'] = yield_df['time'].apply(lambda t: t - yield_df['time'].min())
        yield_df['duration_float'] = yield_df['duration_tdelta'].apply(lambda t: t.total_seconds() / 3600)
        yield_data_by_quality[description] = yield_df

    # Set subplots.
    fig, ax = plt.subplots(1)
    # Create ticks using numpy linspace. Ideally will create 6 points between 0 and 48 hours.
    num_points = 7  # Need to include zero point
    x_ticks = np.linspace(YIELD_DATA['duration_float'].min(),
                          YIELD_DATA['duration_float'].max(), num_points)
    ax.set_xticks(x_ticks)
    # Define axis formatters
    ax.yaxis.set_major_formatter(FuncFormatter(y_yield_to_human_readable))
    ax.xaxis.set_major_formatter(FuncFormatter(x_yield_to_human_readable))
    # Set x and y labels and title.
    ax.set_xlabel("Duration (HH:MM)")
    ax.set_ylabel("Yield")
    ax.set_title(f"Yield for {SAMPLE_NAME} over time by quality")
    ax.stackplot(YIELD_DATA['duration_float'],
                 [yield_data_by_quality[description]['cumsum_bp']
                  for description in QUALITY_DESCRIPTIONS],
                 colors=QUALITY_COLOURS)
    # Limits must be set after the plot is created
    ax.set_xlim(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max())
    ax.set_ylim(ymin=0)
    # Add legend to plot.
    ax.legend([mpatches.Patch(color=colour) for colour in QUALITY_COLOURS],
              QUALITY_DESCRIPTIONS, loc=2)
    # Ensure labels are not missed.
    fig.tight_layout()
    savefig(os.path.join(PLOTS_DIR, f"{SAMPLE_NAME.replace(' ', '_')}_yield_plot_by_quality.png"))
def titanic_1():
    titanic = sns.load_dataset('titanic')
    print(titanic.head())
    #    survived  pclass     sex   age ......
    # 0         0       3    male  22.0
    # 1         1       1  female  38.0
    # 2         1       3  female  26.0
    # 3         1       1  female  35.0
    # 4         0       3    male  35.0

    print(titanic.groupby('sex')[['survived']].mean())
    #         survived
    # sex
    # female  0.742038
    # male    0.188908

    print(titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack())
    # class      First    Second     Third
    # sex
    # female  0.968085  0.921053  0.500000
    # male    0.368852  0.157407  0.135447

    print(titanic.pivot_table('survived', index='sex', columns='class'))
    # class      First    Second     Third
    # sex
    # female  0.968085  0.921053  0.500000
    # male    0.368852  0.157407  0.135447

    age = pd.cut(titanic['age'], [0, 18, 80])
    print(titanic.pivot_table('survived', ['sex', age], 'class'))
    # class               First    Second     Third
    # sex    age
    # female (0, 18]   0.909091  1.000000  0.511628
    #        (18, 80]  0.972973  0.900000  0.423729
    # male   (0, 18]   0.800000  0.600000  0.215686
    #        (18, 80]  0.375000  0.071429  0.133663

    print(titanic.pivot_table(index='sex', columns='class',
                              aggfunc={'survived': sum, 'fare': 'mean'}))

    print(titanic.pivot_table('survived', index='sex', columns='class', margins=True))
    # class      First    Second     Third       All
    # sex
    # female  0.968085  0.921053  0.500000  0.742038
    # male    0.368852  0.157407  0.135447  0.188908
    # All     0.629630  0.472826  0.242363  0.383838