The following 16 code examples, taken from open-source Python projects, illustrate how to use pandas.qcut().
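Before the examples, a minimal sketch of what pd.qcut itself does; the series and bin count below are invented for illustration:

import pandas as pd

# qcut bins values so each bin holds (roughly) the same number of
# observations, using sample quantiles as the bin edges.
s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8])
print(pd.qcut(s, 4))                # interval labels such as (0.999, 2.75]
print(pd.qcut(s, 4, labels=False))  # integer bin indices 0..3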
def _compute_stats(self, pred, expo, loss, prem):
    n_samples, n_groups = pred.shape[0], self.n_groups
    pred_ser = pd.Series(pred)
    loss_to_returns = np.sum(loss) / np.sum(prem)

    rank = pd.qcut(pred_ser, n_groups, labels=False)
    n_groups = np.amax(rank) + 1
    groups = np.arange(n_groups)  # if we ever go back to using n_groups...

    tab = pd.DataFrame({
        'rank': rank,
        'pred': pred,
        'prem': prem,
        'loss': loss,
        'expo': expo
    })

    grouped = tab[['rank', 'pred', 'prem', 'loss', 'expo']].groupby('rank')
    agg_rlr = (grouped['loss'].agg(np.sum) / grouped['prem'].agg(np.sum)) / loss_to_returns

    return tab, agg_rlr, n_groups
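The pattern in _compute_stats is standard lift-chart construction: qcut ranks predictions into quantile groups, and each group's loss ratio is compared against the portfolio-wide loss ratio. A self-contained sketch of the same idea, with invented arrays:

import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
pred = rng.uniform(size=100)
loss = pred * 10 + rng.uniform(size=100)  # losses loosely follow the predictions
prem = np.full(100, 10.0)

rank = pd.qcut(pd.Series(pred), 10, labels=False)
tab = pd.DataFrame({'rank': rank, 'loss': loss, 'prem': prem})
grouped = tab.groupby('rank')
# each decile's loss ratio, relative to the overall loss ratio
rel_loss_ratio = (grouped['loss'].sum() / grouped['prem'].sum()) / (loss.sum() / prem.sum())
print(rel_loss_ratio)  # rises across the deciles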
def discretize(data, bins=5, quantile=False):
    '''
    Creates 'bins' number of bins and discretizes the data.
    Uses the cut function by default, the qcut function otherwise.
    '''
    if quantile:
        new_data = pd.qcut(data, bins, labels=list(range(bins)))
    else:
        new_data = pd.cut(data, bins, labels=list(range(bins)))
    return new_data
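A hypothetical call, assuming a numeric Series (the data is invented):

import numpy as np
import pandas as pd

values = pd.Series(np.random.default_rng(0).normal(size=100))
equal_width = discretize(values)                # pd.cut: five equal-width bins
equal_freq = discretize(values, quantile=True)  # pd.qcut: ~20 values per bin
print(equal_freq.value_counts())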
def _recursive_category_gen(col, num_bins):
    """
    Generate number of bins recursively

    Parameters
    ----------
    col : pandas.Series
        the continuous variable to bin into quantiles
    num_bins : int
        how many quantiles

    Returns
    -------
    num_bins : int
    categories : list
    """
    bin_labels = range(num_bins)

    # base case catch
    if num_bins == 0:
        raise ValueError('Unable to perform qcut to 0 bins.')

    # we assume the num_bins count will work
    try:
        categories = pd.qcut(x=col, q=num_bins, labels=bin_labels)
        return num_bins, categories
    # if it does not, then we need to go down 1 number of bins
    except ValueError:
        new_bin_count = num_bins - 1
        return _recursive_category_gen(col, new_bin_count)
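The try/except fallback matters because qcut raises ValueError when the requested quantile edges are not unique; a sketch with a heavily tied series (values invented):

import pandas as pd

col = pd.Series([0, 0, 0, 0, 0, 1, 2, 3])
# five quantile edges on this series are not unique, so qcut raises
# ValueError and the helper retries with one bin fewer until a cut succeeds
num_bins, categories = _recursive_category_gen(col, 5)
print(num_bins)  # fewer than 5 bins for this series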
def categorizeCI2(inputDF, subsampleFactor=10, title=None):
    #inputDF = normalize(inputDF)
    binLabels = ['Low', 'Medium', 'High']
    indices = range(0, inputDF.shape[0], subsampleFactor)
    plotDF = inputDF.iloc[indices].copy()
    plotDF['bin'] = pd.qcut(inputDF['ci'], len(binLabels), labels=binLabels)
    plotDF.drop(['ci'], axis=1, inplace=True)
    alpha = 0.3
    g = parallel_coordinates(plotDF, 'bin', color=[
        [0.8, 0.0, 0.1, alpha],
        [0.0, 0.8, 0.1, alpha],
        [0.1, 0.1, 0.8, alpha],
    ])
    plt.xticks(rotation=270)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    if title:
        title += ' (factor=%d)' % subsampleFactor
        g.set_title(title)
    return g
def IV_calc(data, var):
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
    else:
        data['bin_var'] = pd.qcut(data[var].rank(method='first'), 10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count', 'sum'])
        dataf.columns = ["Total", "bad"]
        dataf["good"] = dataf["Total"] - dataf["bad"]
        dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
        dataf["good_per"] = dataf["good"] / dataf["good"].sum()
        dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
        return dataf
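The rank(method='first') call is the key trick here: it breaks ties so that qcut always finds ten unique decile edges, even when the raw variable repeats heavily. The difference on a toy series:

import pandas as pd

x = pd.Series([1, 1, 1, 1, 2, 3, 4, 5, 6, 7])
# pd.qcut(x, 4) would raise ValueError because the lowest edges coincide;
# ranking first makes every value distinct, so the cut always succeeds
binned = pd.qcut(x.rank(method='first'), 4, labels=False)
print(binned.tolist())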
def y_transform(Y, data, flatten):
    df_y = data[Y]

    # if user input 'int' then function will be "greater than value"
    # if user input 'float' then function will be IQR range

    # below is for the case where the prediction is true or false
    # but the y-feature is in a different format (e.g. continuous)
    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))

    # below is for the case where the y-feature is converted
    # into a categorical, whether it's a number or a string
    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
    elif flatten == 'cat_numeric':
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    # for cases when the y-feature is already in the format
    # the prediction output will be in
    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)

    return df_y
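The 'cat_numeric' branch combines two idioms: duplicates='drop' keeps qcut from failing on non-unique quantile edges (at the cost of fewer bins), and .cat.codes turns the resulting intervals into integers. In isolation, on an invented series:

import pandas as pd

y = pd.Series([10, 10, 10, 10, 20, 30, 40, 50])
binned = pd.qcut(y, 5, duplicates='drop')  # fewer than 5 bins survive the drop
codes = pd.Series(binned).cat.codes        # integer class labels, 0..n-1
print(codes.tolist())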
def compute(self, today, assets, out, factor, bins):
    out[:] = pd.qcut(factor, bins, labels=False)
def add_returns_in_place(df):  # modifies df
    close_prices_returns = compute_returns(df)
    num_bins = 10
    returns_bins = pd.qcut(close_prices_returns, num_bins)
    bins_categories = returns_bins.values.categories
    returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)

    df['close_price_returns'] = close_prices_returns
    df['close_price_returns_bins'] = returns_bins
    df['close_price_returns_labels'] = returns_labels

    return df, bins_categories
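Here qcut is called twice, once for the interval categories and once with labels=False for integer codes; a single call followed by .cat.codes would do the same. The two-call idiom on a stand-in returns series (compute_returns is defined elsewhere in that project):

import numpy as np
import pandas as pd

returns = pd.Series(np.random.default_rng(1).normal(0, 0.01, size=250))
bins = pd.qcut(returns, 10)
labels = pd.qcut(returns, 10, labels=False)
print(bins.values.categories)  # the ten decile intervals
print(labels.head())           # the integer bin of each observation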
def test_apply_use_categorical_name(self):
    from pandas import qcut
    cats = qcut(self.df.C, 4)

    def get_stats(group):
        return {'min': group.min(),
                'max': group.max(),
                'count': group.count(),
                'mean': group.mean()}

    result = self.df.groupby(cats).D.apply(get_stats)
    self.assertEqual(result.index.names[0], 'C')
def get_node_colors_by_attr(G, attr, num_bins=None, cmap='viridis', start=0, stop=1):
    """
    Get a list of node colors by binning some continuous-variable attribute into quantiles.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the attribute
    num_bins : int
        how many quantiles (default None assigns each node to its own bin)
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
    """
    if num_bins is None:
        num_bins = len(G.nodes())
    bin_labels = range(num_bins)
    attr_values = pd.Series([data[attr] for node, data in G.nodes(data=True)])
    cats = pd.qcut(x=attr_values, q=num_bins, labels=bin_labels)
    colors = get_colors(num_bins, cmap, start, stop)
    node_colors = [colors[cat] for cat in cats]
    return node_colors
def get_edge_colors_by_attr(G, attr, num_bins=5, cmap='viridis', start=0, stop=1):
    """
    Get a list of edge colors by binning some continuous-variable attribute into quantiles.

    Parameters
    ----------
    G : networkx multidigraph
    attr : string
        the name of the continuous-variable attribute
    num_bins : int
        how many quantiles
    cmap : string
        name of a colormap
    start : float
        where to start in the colorspace
    stop : float
        where to end in the colorspace

    Returns
    -------
    list
    """
    if num_bins is None:
        num_bins = len(G.edges())
    bin_labels = range(num_bins)
    attr_values = pd.Series([data[attr] for u, v, key, data in G.edges(keys=True, data=True)])
    cats = pd.qcut(x=attr_values, q=num_bins, labels=bin_labels)
    colors = get_colors(num_bins, cmap, start, stop)
    edge_colors = [colors[cat] for cat in cats]
    return edge_colors
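Both helpers share the same core move: qcut assigns every attribute value an integer quantile label, which then indexes into a list of colors. A minimal sketch of that move alone, using a matplotlib colormap as a stand-in for the project's get_colors helper (the attribute values are invented):

import pandas as pd
from matplotlib import cm

attr_values = pd.Series([3.2, 1.1, 9.8, 4.4, 7.0, 2.5])
num_bins = 3
cats = pd.qcut(x=attr_values, q=num_bins, labels=range(num_bins))
colors = [cm.viridis(i / (num_bins - 1)) for i in range(num_bins)]  # stand-in for get_colors
node_colors = [colors[cat] for cat in cats]  # one RGBA color per value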
def quartileSplit(wine_set):
    print("This is the quartile split of the wines' quality. The first column contains the intervals of wine quality;")
    print("the second contains the number of wine samples whose quality falls in the corresponding interval.")
    wine_set["quality_quart"] = pd.qcut(wine_set["quality"], 3)
    print(wine_set.groupby("quality_quart").size())
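A hypothetical call (the quality values are invented); note that despite the name, q=3 produces tertiles rather than quartiles:

import pandas as pd

wine_set = pd.DataFrame({'quality': [3, 4, 5, 5, 6, 6, 6, 7, 8]})
quartileSplit(wine_set)  # prints the size of each of the three quantile groups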
def bin_data(path, write_path, num_chunks, binning):
    """Bins the continuous features through bucket or quantile binning

    Parameters
    ----------
    path : str
        The path where the dataset to be binned is located.
    write_path : str
        The path where to save the binned dataset.
    num_chunks : int
        The number of file splits to perform on the binned dataset.
    binning : int
        The type of binning to perform on the dataset:
        0 if bucket binning, 1 if quantile binning.
    """

    # get the list of files found in PATH
    files = nd.list_files(path=path)

    df = pd.DataFrame()

    for file in files:
        # append the data from CSV files to the dataframe
        df = df.append(pd.read_csv(filepath_or_buffer=file, names=column_names))
        print('appending : {}'.format(file))

    # remove dst_ip_add and src_ip_add features
    df = df.drop(labels=['dst_ip_add', 'src_ip_add'], axis=1)

    for index in range(len(cols_to_std)):
        if int(binning) == 0:
            # bucket binning
            bins = np.linspace(df[cols_to_std[index]].min(), df[cols_to_std[index]].max(), 10)
            df[cols_to_std[index]] = np.digitize(df[cols_to_std[index]], bins, right=True)
            print('min : {}, max : {}'.format(df[cols_to_std[index]].min(), df[cols_to_std[index]].max()))
        if int(binning) == 1:
            # decile binning
            df[cols_to_std[index]] = pd.qcut(df[cols_to_std[index]], 10, labels=False, duplicates='drop')
            print('min : {}, max : {}'.format(df[cols_to_std[index]].min(), df[cols_to_std[index]].max()))

    for id, df_i in enumerate(np.array_split(df, num_chunks)):
        # split and save the dataframe to CSV files
        df_i.to_csv(path_or_buf=os.path.join(write_path, '{id}.csv'.format(id=id)),
                    columns=columns_to_save, header=None, index=False)
        print('Saving CSV file : {path}'.format(path=os.path.join(write_path, '{id}'.format(id=id))))
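The two branches choose edges differently: np.linspace spreads ten edges evenly across the value range (bucket binning), while qcut places them at the deciles (quantile binning). On skewed data the difference is stark; a sketch with invented exponential values:

import numpy as np
import pandas as pd

col = pd.Series(np.random.default_rng(2).exponential(size=1000))
bucket = np.digitize(col, np.linspace(col.min(), col.max(), 10), right=True)
quantile = pd.qcut(col, 10, labels=False, duplicates='drop')
print(pd.Series(bucket).value_counts().sort_index())  # most observations land in the low bins
print(quantile.value_counts().sort_index())           # roughly 100 observations per bin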
def discretize(data, vars_to_discretize, n_bins):
    '''
    Accepts data, a dictionary containing the discretization type for
    selected variables, and a dictionary containing the number of bins
    for selected variables.

    Returns data after selected variables have been discretized,
    together with the binning definition for each variable.
    '''
    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        out = None
        binning = None

        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal':
            out, binning = ps.cut(data_subset.loc[:, i], bins=n_bins[i], labels=False, retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.loc[:, i], q=nb, labels=False, retbins=True)
                    break
                except ValueError:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.loc[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]

        data_subset.loc[:, i] = out

        # replace NA values with a special index (1 + max), if this has not
        # already been done automatically, as it is in np.digitize
        data_subset.loc[data_subset[i].isnull(), i] = data_subset[i].max() + 1
        bins[i] = binning

    return data_subset, bins
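A hypothetical call, assuming the snippet's ps alias comes from import pandas as ps (column names and data invented):

import numpy as np
import pandas as ps

data = ps.DataFrame({'a': np.random.default_rng(3).normal(size=50),
                     'b': np.random.default_rng(4).integers(0, 3, size=50)})
# 'a' gets four equal-width bins; 'b' is heavily tied, so the Freq branch
# steps the bin count down until qcut finds unique quantile edges
binned, bins = discretize(data, {'a': 'Equal', 'b': 'Freq'}, {'a': 4, 'b': 4})
print(bins['a'])  # the edges of the four equal-width bins for column 'a'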
def create_figure(df, x, y, discrete, quantileable, continuous, size, color, controls):
    xs = df[x.value].values
    ys = df[y.value].values
    # x_title = x.value.title()
    # y_title = y.value.title()
    x_title = "Marginal Effective Tax Rate"
    y_title = "Asset Category"

    source = ColumnDataSource(ColumnDataSource.from_df(df))

    kw = dict()
    if x.value in discrete:
        kw['x_range'] = sorted(set(xs))
    if y.value in discrete:
        kw['y_range'] = sorted(set(ys))
    # kw['title'] = "%s vs %s" % (x_title, y_title)
    # kw['title'] = "Marginal Effective Tax Rates on Typically Financed Corporate Investments, 2016 Law"
    # kw['title'] = "Marginal Effective Tax Rates on Corporate Investments, 2016 Law"
    kw['title'] = "METRs on Corporate Investments, 2016 Law"

    p = figure(plot_height=400, plot_width=600, tools='pan,box_zoom,reset,hover', **kw)
    p.xaxis.axis_label = x_title
    p.yaxis.axis_label = y_title

    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [('Asset', '@Asset')]

    if x.value in discrete:
        p.xaxis.major_label_orientation = pd.np.pi / 4

    sz = 9
    if size.value != 'None':
        groups = pd.qcut(df[size.value].values, len(SIZES))
        sz = [SIZES[xx] for xx in groups.codes]

    c = "#73000A"
    if color.value != 'None':
        groups = pd.qcut(df[color.value].values, len(COLORS))
        c = [COLORS[xx] for xx in groups.codes]

    p.circle(x=xs, y=ys, source=source, color=c, size=sz, line_color="white",
             alpha=0.6, hover_color='white', hover_alpha=0.5)

    # p.title.text_color = "black"
    # p.title.text_font = "Georgia"
    return p
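One detail worth noting: qcut applied to a bare NumPy array (as with df[size.value].values here) returns a Categorical, so .codes is available directly, whereas a Series result would need .cat.codes. The mapping idiom in isolation, with an invented size palette:

import numpy as np
import pandas as pd

SIZES = [6, 9, 12, 15, 18]  # invented glyph-size palette
values = np.array([1.0, 3.5, 2.2, 8.8, 5.1, 7.3])
groups = pd.qcut(values, len(SIZES))  # a Categorical with five quantile bins
sz = [SIZES[code] for code in groups.codes]  # one size per observation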
def log_regression(wine_set):
    # # examining the data before recoding
    # print(wine_set["sulphates"].describe())
    # wine_set["sulphates_c"] = pd.qcut(wine_set["sulphates"], 4)
    # print(wine_set.groupby("sulphates_c").size())
    # print()
    #
    # print(wine_set["alcohol"].describe())
    # wine_set["alcohol_c"] = pd.qcut(wine_set["alcohol"], 4)
    # print(wine_set.groupby("alcohol_c").size())
    # print()
    #
    # print(wine_set["quality"].describe())
    # wine_set["quality_c"] = pd.qcut(wine_set["quality"], 3)
    # print(wine_set.groupby("quality_c").size())
    # print()

    # recode quality into 2 groups: 0:{3,4,5,6}, 1:{7,8,9}
    recode = {3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)

    # recode sulphates into 2 groups: 0: <= mean, 1: > mean
    def sulphates_to_cat(x):
        if x['sulphates'] <= wine_set['sulphates'].mean():
            return 0
        else:
            return 1

    wine_set['sulphates_c'] = wine_set.apply(lambda x: sulphates_to_cat(x), axis=1)

    # recode alcohol into 2 groups: 0: <= mean, 1: > mean
    def alcohol_to_cat(x):
        if x['alcohol'] <= wine_set['alcohol'].mean():
            return 0
        else:
            return 1

    wine_set['alcohol_c'] = wine_set.apply(lambda x: alcohol_to_cat(x), axis=1)
    # print(wine_set.head(10))

    # logistic regression for sulphates+alcohol -> quality
    print("Logistic regression model for the association between wine's quality and sulphates & alcohol")
    model1 = smf.logit(formula="quality_c ~ sulphates_c + alcohol_c", data=wine_set)
    results1 = model1.fit()
    print(results1.summary())

    # odds ratios with 95% confidence intervals
    print("\nConfidence intervals")
    conf = results1.conf_int()
    conf['Odds ratio'] = results1.params
    conf.columns = ['Lower conf.int.', 'Upper conf.int.', 'Odds ratio']
    print(numpy.exp(conf))