The following code examples, extracted from open-source Python projects, illustrate how to use pandas.unique().
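Before the extracted examples, here is a minimal sketch (not taken from any of the projects below) of what pandas.unique() does: it returns the distinct values of an array-like in the order of their first appearance, without sorting.

import pandas as pd

values = ['b', 'a', 'b', 'c', 'a']
print(pd.unique(values))                    # ['b' 'a' 'c'] -- first-appearance order
print(pd.unique(pd.Series([2, 1, 2, 3])))   # [2 1 3]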
def get_levels(self, name):
    """
    Return an array containing all distinct values in the column 'name'.
    The values are returned in the order of their first appearance.

    Parameters
    ----------
    name : string
        The column name for which the unique values are requested

    Returns
    -------
    levels : list
        A unique list of all values that are contained in the specified
        data column.
    """
    return pd.unique(self._table[name].values.ravel())
def wrapper_nms(proposal_df, overlap=0.65):
    """Apply non-maximum suppression to a video batch.
    """
    vds_unique = pd.unique(proposal_df['video-name'])
    new_proposal_df = []
    for i, v in enumerate(vds_unique):
        idx = proposal_df['video-name'] == v
        p = proposal_df.loc[idx, ['video-name', 'f-init', 'f-end', 'score',
                                  'video-frames']]
        n_frames = int(p['video-frames'].mean())
        loc = np.stack((p['f-init'], p['f-end']), axis=-1)
        loc, score = nms_detections(loc, np.array(p['score']), overlap)
        n_proposals = score.shape[0]
        n_frames = np.repeat(p['video-frames'].mean(), n_proposals).astype(int)
        this_df = pd.DataFrame({'video-name': np.repeat(v, n_proposals),
                                'f-init': loc[:, 0],
                                'f-end': loc[:, 1],
                                'score': score,
                                'video-frames': n_frames})
        new_proposal_df.append(this_df)
    return pd.concat(new_proposal_df, axis=0)
def get_detected_objects(df, tol=1.0, debug=False):
    """
    Takes a summary dataframe with RV information. Finds the median rv for each star,
    and removes objects that are more than 'tol' km/s from the median value.

    :param df: A summary dataframe, such as created by get_ccf_summary or find_best_pars
    :param tol: The tolerance, in km/s, to accept an observation as detected
    :return: a dataframe containing only detected companions
    """
    secondary_names = pd.unique(df.Secondary)
    secondary_to_rv = defaultdict(float)
    for secondary in secondary_names:
        rv = df.loc[df.Secondary == secondary]['rv'].median()
        secondary_to_rv[secondary] = rv

    if debug:
        for secondary in sorted(secondary_to_rv.keys()):
            print('RV for {}: {:.2f} km/s'.format(secondary, secondary_to_rv[secondary]))

    keys = df.Secondary.values
    good = df.loc[abs(df.rv.values - np.array(itemgetter(*keys)(secondary_to_rv))) < tol]
    return good
def list_stars(self, print2screen=False):
    """
    List all of the stars in all of the CCF interfaces.

    Parameters:
    ===========
    - print2screen:  bool
                     Should we print the stars and dates to screen?

    Returns:
    =========
    - star_list:     list
                     A list of every star in the file, sorted by name.
    """
    stars = []
    for inst in self._interfaces.keys():
        if print2screen:
            print('Stars observed with {}: \n============================\n\n'.format(inst))
        stars.extend(self._interfaces[inst].list_stars(print2screen=print2screen))

    return list(pd.unique(stars))
def test_datetime64_dtype_array_returned(self):
    # GH 9431
    expected = np.array(['2015-01-03T00:00:00.000000000+0000',
                         '2015-01-01T00:00:00.000000000+0000'],
                        dtype='M8[ns]')

    dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000+0000',
                               '2015-01-01T00:00:00.000000000+0000',
                               '2015-01-01T00:00:00.000000000+0000'])
    result = algos.unique(dt_index)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)

    s = pd.Series(dt_index)
    result = algos.unique(s)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)

    arr = s.values
    result = algos.unique(arr)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)
def rename_brands(phone_models):
    """
    Recast all phone brands and models as string identifiers brand_i and model_j.
    """
    brands_table = {}
    i = 0
    for brand in pd.unique(phone_models['phone_brand']):
        brands_table[brand] = 'brand_%s' % i
        i += 1

    models_table = {}
    i = 0
    for model in pd.unique(phone_models['device_model']):
        models_table[model] = 'model_%s' % i
        i += 1

    converted = []
    for item in zip(phone_models['phone_brand'], phone_models['device_model']):
        converted.append((brands_table[item[0]], models_table[item[1]]))

    phone_models['phone_brand'] = [x[0] for x in converted]
    phone_models['device_model'] = [x[1] for x in converted]
    return phone_models
def __init__(self, linksfile, ic=None):
    df = pd.read_csv(linksfile)
    df['link'] = df.i.map(str) + '_' + df.j.map(str) + '_' + df.k.map(str)
    df.set_index('link', inplace=True)
    self.df = df
    # self.T = len(self.df)
    SR_stats = pd.read_csv('calvin/data/SR_stats.csv', index_col=0).to_dict()
    self.min_storage = SR_stats['min']
    self.max_storage = SR_stats['max']

    if ic:
        self.apply_ic(ic)

    # a few network fixes to make things work
    self.add_ag_region_sinks()
    self.fix_hydropower_lbs()

    self.nodes = pd.unique(df[['i', 'j']].values.ravel()).tolist()
    self.links = list(zip(df.i, df.j, df.k))
    self.networkcheck()  # make sure things aren't broken
def make_unique_value_each_column(self, df, node_id):
    """
    Count the unique values of each non-numeric column of the dataframe and
    return the result as JSON.

    Args:
        df : dataframe
        node_id : nnid

    Returns:
        json

    Raises:
    """
    try:
        data_conf = dict()
        column_cate_unique = dict()
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

        for i, v in df.dtypes.iteritems():
            if (str(v) not in numerics):  # maybe need float
                column_cate_unique[i] = df[i].unique().size

        data_conf['unique_cell_feature'] = column_cate_unique
        data_conf_json_str = json.dumps(data_conf)
        data_conf_json = json.loads(data_conf_json_str)
        return data_conf_json
    except Exception as e:
        logging.error("make_unique_value_each_column error : {0}, {1}".format(i, v))
        raise e
def makeTable(df, rowsCol, colsCol, dataCol):
    # df.set_index(rowsCol)
    uniqRowVals = pd.unique(df[rowsCol])
    uniqColVals = pd.unique(df[colsCol])
    # print "rows col = ", df[rowsCol]
    # print "uniq row vals", uniqRowVals
    # print "uniq col vals", uniqColVals
    # print df[[rowsCol, colsCol, dataCol]]

    out = pd.DataFrame(index=uniqRowVals, columns=uniqColVals)
    for rowVal in uniqRowVals:
        for colVal in uniqColVals:
            rowsMatch = df[rowsCol] == rowVal
            colsMatch = df[colsCol] == colVal
            thisIdx = np.where(rowsMatch * colsMatch)[0][0]
            out.loc[rowVal, colVal] = df[dataCol][thisIdx]
    return out
def label_metadata(label_matrix, label_col):
    # Check whether the column value is given as index (number) or name (string)
    try:
        label_col = int(label_col)
        # If given as number, take the name of the column out of it
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass

    import pandas as pd

    # Get the unique classes in the given column, and how many of them are there
    unique_classes = pd.unique(label_matrix[label_col].ravel())
    # num_classes = unique_classes.shape[0]

    # Map the unique n classes with a number from 0 to n
    label_map = pd.DataFrame({label_col: unique_classes,
                              label_col + '_id': range(len(unique_classes))})

    # Replace the given column's values with the mapped equivalent
    mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(),
                                         label_map[[1]].values.tolist())

    # Return the mapped labels as numpy list and the label map
    # (unique classes and number can be obtained from map)
    return np.reshape(mapped_labels[[label_col]].values,
                      (mapped_labels.shape[0],)), np.asarray(label_map)
    # , unique_classes, num_classes
def label_metadata(label_matrix, label_col):
    # Check whether the column value is given as index (number) or name (string)
    try:
        label_col = int(label_col)
        # If given as number, take the name of the column out of it
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass

    # Get the unique classes in the given column, and how many of them are there
    unique_classes = pd.unique(label_matrix[label_col].ravel())

    # Map the unique n classes with a number from 0 to n
    label_map = pd.DataFrame({label_col: unique_classes,
                              label_col + '_id': range(len(unique_classes))})

    # Replace the given column values with the mapped equivalent
    mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(),
                                         label_map[[1]].values.tolist())
    # print("label_matrix", label_matrix)
    # print("mapped_labels", mapped_labels)

    # Return the mapped labels as ndarray and the label map
    # (unique classes and number can be obtained from map)
    # np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],))

    # Return the mapped labels as DataFrame and the label map
    # (unique classes and number can be obtained from map)
    return mapped_labels[[label_col]], np.asarray(label_map)
    # , unique_classes, num_classes
def create_subset(src, dest, n=250):
    "Given a csv file `src`, create a subset `dest` with `n` unique entities"
    df = pd.read_csv(src)
    lics = pd.unique(df["License #"])
    sublics = lics[random.sample(range(0, len(lics)), n)]
    subset = df[df["License #"].isin(sublics)]

    # Make the column names a little more readable
    subset.columns = map(clean_column_name, subset.columns)
    subset.to_csv(dest, index=False)
def convert_categorical(df):
    onecol = df.columns[1]
    onecol_name = df.columns.values.tolist()[1]
    df[onecol] = df[onecol].str.lower()
    categories = pd.unique(df[onecol])
    categories = [x for x in categories if x is not None]
    try:
        categories.remove(' ')
    except:
        pass
    categories = [str(x) for x in categories]
    categories = list(set([str.lower(x).strip() for x in categories]))
    # replaces spaces in middle of word w underscores
    categories = list(set([x.replace(" ", '_') for x in categories]))
    featnames = []
    for i in range(len(categories)):
        if type(categories[i]) is str:
            newfeatstr = onecol_name + '_is_' + categories[i]
            featnames.append(newfeatstr)
            df[newfeatstr] = (df[onecol] == categories[i])
    onecol_null = onecol_name + "_is_null"
    df[onecol_null] = pd.isnull(df[onecol])
    df[onecol_null] = df[onecol_null].astype(float)
    df = df.drop(onecol, axis=1)
    df[featnames] = df[featnames].astype(float)
    df = df.groupby(config_db['id_column'], sort=False, as_index=False)[featnames].max()
    return df, featnames
def _validate_layout(func):
    def func_wrapper(self):
        if self._col_wrap:
            if self._col_wrap > 16:
                raise VisualizationInvalidLayout
            else:
                return func(self)

        if self._col_factor and len(pd.unique(self._table[self._col_factor].values.ravel())) > 16:
            raise VisualizationInvalidLayout
        if self._row_factor and len(pd.unique(self._table[self._row_factor].values.ravel())) > 16:
            raise VisualizationInvalidLayout

        return func(self)
    return func_wrapper
def vectorize(f):
    from functools import wraps
    try:
        from pandas import Series, unique

        @wraps(f)
        def vectorized_f(x):
            # If we're given a scalar value, then simply return it.
            if not hasattr(x, '__len__'):
                return f(x)
            # Get unique values
            inputs = unique(x)
            outputs = map(f, inputs)
            table = dict(zip(inputs, outputs))
            result = Series(x).map(table)
            return result.values
    except ImportError:
        def cached_f(x, cache={}):
            if x not in cache:
                cache[x] = f(x)
            return cache[x]

        @wraps(f)
        def vectorized_f(x):
            # If we're given a scalar value, then simply return it.
            if not hasattr(x, '__len__'):
                return cached_f(x)
            return map(cached_f, x)
    return vectorized_f


# The type of data returned by the Buffer iterator.
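A hypothetical usage sketch for the vectorize decorator above; the normalize function and sample inputs are invented for illustration and are not part of the original project. Each distinct input is evaluated once, and the results are broadcast back over the full input via Series.map.

@vectorize
def normalize(s):
    return s.strip().lower()

# 'A ' appears twice but is only processed once internally
print(normalize(['A ', ' b', 'A ']))   # ['a' 'b' 'a']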
def saveLabel(self):
    if not len(self.labelFile):
        self.labelFile = QtGui.QFileDialog.getSaveFileName(self, 'Save Label File',
                                                           os.path.expanduser('~'), 'Txt (*.txt)')
    if len(self.labelFile):
        self.updateLabelsBuf()
        if self.labelsBuf is not None:
            if self.labels is None:
                self.labels = self.labelsBuf
            self.labels = self.labels[~self.labels.image.isin(pd.unique(self.labelsBuf.image.ravel()))]
            self.labelsBuf = self.labelsBuf[self.labelsBuf.cateid.notnull()]
            self.labels = self.labels.append(self.labelsBuf, ignore_index=True)
            self.labels.to_csv(self.labelFile, index=False)
            self.labelsBuf = self.labelsBuf[self.labelsBuf.image == os.path.basename(self.imgsList[self.ith])]
def add_actual_temperature(df, method='excel', filename='SecondaryStar_Temperatures.xls'):
    """
    Add the actual temperature to a given summary dataframe.

    :param df: The dataframe to which we will add the actual secondary star temperature
    :keyword method: How to get the actual temperature. Options are:
                     - 'spt': Use main-sequence relationships to go from spectral type --> temperature
                     - 'excel': Use tabulated data, available in the file 'SecondaryStar_Temperatures.xls'
    :keyword filename: The filename of the excel spreadsheet containing the literature temperatures.
                       Needs to have the right format! Ignored if method='spt'
    :return: None. The columns 'Tactual' and 'Tact_err' are added to df in place.
    """
    # First, get a list of the secondary stars in the data
    secondary_names = pd.unique(df.Secondary)
    secondary_to_temperature = defaultdict(float)
    secondary_to_error = defaultdict(float)

    if method.lower() == 'spt':
        MS = SpectralTypeRelations.MainSequence()
        for secondary in secondary_names:
            star_data = StarData.GetData(secondary)
            spt = star_data.spectype[0] + re.search('[0-9]\.*[0-9]*', star_data.spectype).group()
            T_sec = MS.Interpolate(MS.Temperature, spt)
            secondary_to_temperature[secondary] = T_sec
    elif method.lower() == 'excel':
        table = pd.read_excel(filename, 0)
        for secondary in secondary_names:
            T_sec = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())]['Literature_Temp'].item()
            T_error = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())][
                'Literature_error'].item()
            secondary_to_temperature[secondary] = T_sec
            secondary_to_error[secondary] = T_error

    df['Tactual'] = df['Secondary'].map(lambda s: secondary_to_temperature[s])
    df['Tact_err'] = df['Secondary'].map(lambda s: secondary_to_error[s])
    return
def fit_sigma(df, i):
    """
    Find the largest allowable standard deviation, given the possible values Tactual can take.
    """
    Tmeasured, Tactual, _, _ = get_values(df)
    Tm = Tmeasured[i]

    # Get the possible values, and bin those with this measured value
    possible_values = sorted(pd.unique(df.Tactual))
    edges = [(possible_values[i] + possible_values[i + 1]) / 2 for i in range(len(possible_values) - 1)]
    bins = [0] + edges + [9e9]
    good = df.loc[df.Temperature == Tm]
    values, _ = np.histogram(good.Tactual.values, bins=bins)

    mean = np.mean(good.Tactual.values)
    std = np.std(good.Tactual.values, ddof=1)
    if std > 0:
        return std

    sigma_test = np.arange(500, 10, -10)  # Just test a bunch of values
    idx = np.searchsorted(bins, mean)
    idx = np.argmin(abs(np.array(bins) - mean))
    x1 = bins[idx - 2] if idx > 2 else -1
    x2 = bins[idx - 1]
    x3 = bins[idx]
    x4 = bins[idx + 1] if idx < len(bins) - 2 else np.inf
    N = len(good)
    probs = [get_probability(x1, x2, x3, x4, N, mean, s) for s in sigma_test]
    for s, p in zip(sigma_test, probs):
        if p > 0.5:
            return s

    # If we get here, just return a guess value
    return 200.0
    # raise ValueError('No probability > 0!')
def read_hdf5(hdf5_file):
    """
    Reads the hdf5 file into a dataframe. Assumes a very specific format!

    Parameters:
    ===========
    - hdf5_file:   string
                   The full path to the hdf5 file.

    Returns
    ========
    A pandas DataFrame containing summary information
    """
    logging.info('Reading HDF5 file {}'.format(hdf5_file))
    hdf5_int = HDF5_Interface(hdf5_file)
    df = hdf5_int.to_df()

    # Get the contrast. Split by group and then merge to limit the amount of calculation needed
    logging.info('Estimating the V-band contrast ratio for each trial')
    test_vsini = df.vsini.unique()[0]
    temp = df.loc[(df.rv == 0) & (df.vsini == test_vsini)].drop_duplicates(subset=['star', 'temperature'])
    temp['contrast'] = temp.apply(lambda r: get_contrast(r, band='V'), axis=1)

    logging.info('Estimating the luminosity ratio for each trial')
    temp['lum_ratio'] = temp.apply(get_luminosity_ratio, axis=1)

    logging.info('Re-merging dataframe')
    df = pd.merge(df, temp[['star', 'temperature', 'contrast', 'lum_ratio']],
                  on=['star', 'temperature'], how='left')
    df['logL'] = np.log10(df.lum_ratio)

    return df
def parse_input(inp, sort_output=True, ensure_unique=True):
    """
    Parse the user input to get a list of integers.

    Parameters:
    ===========
    - inp:             string
                       Can be in the form 'a-b', 'a,b,c', 'a-b,c-d', etc.
                       '-' means an inclusive list of every number between a and b
                       ',' means the numbers a and b

    - sort_output:     boolean
                       Sort the output integers?

    - ensure_unique:   boolean
                       Make sure the final list has no repeats?

    :return: A list of integers
    """
    sublists = inp.split(',')
    final_list = []
    for l in sublists:
        if '-' in l:
            first, last = l.split('-')
            for i in range(int(first), int(last) + 1):
                final_list.append(i)
        else:
            final_list.append(int(l))

    if ensure_unique:
        final_list = pd.unique(final_list)
    if sort_output:
        final_list = sorted(final_list)

    return final_list
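An assumed call pattern for parse_input above, showing how pd.unique removes repeats before the optional sort; the inputs are illustrative only.

print(parse_input('1-3,7,2'))                   # [1, 2, 3, 7]
print(parse_input('5,1,5', sort_output=False))  # duplicates dropped, first-appearance order kept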
def get_ccf(self, params, df=None):
    """
    Get the ccf with the given parameters.

    Parameters:
    ===========
    - params:    dictionary:
                 All the parameters necessary to define a single ccf. This should be
                 a python dictionary with the keys:
                     - 'starname': The name of the star. Try self.list_stars() for the options.
                     - 'date': The UT date of the observations. Try self.list_dates() for the options.
                     - 'T': temperature of the model
                     - 'logg': the log(g) of the model
                     - 'vsini': the vsini by which the model was broadened before correlation
                     - '[Fe/H]': the metallicity of the model
                     - 'addmode': The way the order CCFs were added to make a total one. Can be:
                         - 'simple'
                         - 'ml'
                         - 'weighted'
                         - 'dc'

    - df:        a pandas DataFrame such as outputted by _compile_data

    Returns:
    ========
    - ccf:       pandas DataFrame
                 Holds columns of velocity and CCF power
    """
    if df is None:
        try:
            df = self._compile_data(params['starname'], params['date'])
        except KeyError:
            raise KeyError('Must give get_ccf params with starname and date keywords, if df is not given!')

    Tvals = df['T'].unique()
    T = Tvals[np.argmin(abs(Tvals - params['T']))]
    good = df.loc[(df['T'] == T) & (df.logg == params['logg']) & (df.vsini == params['vsini'])
                  & (df['[Fe/H]'] == params['[Fe/H]']) & (df.addmode == params['addmode'])]

    return pd.DataFrame(data={'velocity': self.velocities, 'CCF': good['ccf'].item()})
def test_ints(self):
    arr = np.random.randint(0, 100, size=50)

    result = algos.unique(arr)
    tm.assertIsInstance(result, np.ndarray)
def test_objects(self):
    arr = np.random.randint(0, 100, size=50).astype('O')

    result = algos.unique(arr)
    tm.assertIsInstance(result, np.ndarray)
def test_object_refcount_bug(self):
    lst = ['A', 'B', 'C', 'D', 'E']
    for i in range(1000):
        len(algos.unique(lst))
def test_on_index_object(self):
    mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5),
                                        np.tile(np.arange(5), 5)])
    expected = mindex.values
    expected.sort()

    mindex = mindex.repeat(2)

    result = pd.unique(mindex)
    result.sort()

    tm.assert_almost_equal(result, expected)
def test_unique_label_indices():
    from pandas.hashtable import unique_label_indices

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right)

    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right)
def __init__(self, data=None, groups=None, **kwargs):
    super().__init__(data=data, **kwargs)
    if groups is not None:
        self.plydata_groups = list(pd.unique(groups))
def _n_distinct(arr):
    """
    Number of unique values in array
    """
    return len(pd.unique(arr))
def test_clean_2017_reimbursements(self):
    copy(os.path.join(self.fixtures_path, 'reimbursements-2017.xz'),
         self.path)
    file_path = os.path.join(self.path, 'reimbursements.xz')

    self.subject.clean()
    assert(os.path.exists(file_path))

    dataset = pd.read_csv(file_path, compression='xz')
    all_subquotas = [subquota[1] for subquota in self.subject.subquotas]
    present_subquotas = pd.unique(dataset['subquota_description'])
    for subquota in present_subquotas:
        with self.subTest():
            assert(subquota in all_subquotas)
def app_activity_features():
    train = pd.read_csv("gender_age_train.csv")
    test = pd.read_csv("gender_age_test.csv")
    train.drop(['gender', 'age', 'group'], axis=1, inplace=True)
    data = train.append(test)

    """ Merge with brand_model table"""
    device_table = pd.read_csv("phone_brand_device_model.csv")
    data = pd.merge(data, device_table, how='left', on='device_id')
    data = data.drop_duplicates()  # drop duplicates
    # note: there is still one device associated with 2 brands/models
    del device_table
    print("data build")

    """ Create dataframe indicating for each device id, which app is present, and how active it is
        - merge events and app_events on event_id
        - group by device_id and app_id, and take the mean of activity
    """
    events = pd.read_csv("events.csv")
    events = events[events['device_id'].isin(list(data['device_id']))]
    apps = pd.read_csv("app_events.csv")
    apps = pd.merge(apps[['event_id', 'app_id', 'is_active']], events[['event_id', 'device_id']], on='event_id')
    apps = apps.groupby(['device_id', 'app_id'], as_index=False)['is_active'].mean()
    del events
    print("events build")

    """Reshape the dataframe so that each app is a new feature"""
    reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])), index=list(pd.unique(apps['device_id'])))
    reshaped[list(pd.unique(apps['app_id']))] = 0
    for app in list(pd.unique(apps['app_id'])):
        sliced = apps[apps['app_id'] == app]
        reshaped[app].loc[list(sliced['device_id'])] = sliced['is_active'].values
    del apps
    return reshaped
def _process_dataset(anno, sample_rate, n_samples, n_threads):
    """Processes and saves the MagnaTagATune dataset using multiple threads.

    Args:
        anno: Annotation DataFrame containing tags, mp3_path, split, and shard.
        sample_rate: Sampling rate of the audios. If it differs from an audio's
            original sampling rate, the audio is re-sampled.
        n_samples: Number of samples one segment contains.
        n_threads: Number of threads to process the dataset.
    """
    args_queue = Queue()
    split_and_shard_sets = pd.unique([tuple(x) for x in anno[['split', 'shard']].values])

    for split, shard in split_and_shard_sets:
        assigned_anno = anno[(anno['split'] == split) & (anno['shard'] == shard)]
        n_shards = anno[anno['split'] == split]['shard'].nunique()
        args = (assigned_anno, sample_rate, n_samples, split, shard, n_shards)
        args_queue.put(args)

    if FLAGS.n_threads > 1:
        threads = []
        for _ in range(FLAGS.n_threads):
            thread = Thread(target=_process_audio_files, args=[args_queue])
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()
    else:
        _process_audio_files(args_queue)
def aggregate_regions(fp):
    # aggregate regions and supply portfolios
    # easier to do this with pandas by just reading the CSVs again
    sc = pd.read_csv(fp + '/shortage_cost.csv', index_col=0, parse_dates=True)
    sv = pd.read_csv(fp + '/shortage_volume.csv', index_col=0, parse_dates=True)
    flow = pd.read_csv(fp + '/flow.csv', index_col=0, parse_dates=True)
    demand_nodes = pd.read_csv('calvin/data/demand_nodes.csv', index_col=0)
    portfolio = pd.read_csv('calvin/data/portfolio.csv', index_col=0)

    for R in demand_nodes.region.unique():
        for t in demand_nodes.type.unique():
            ix = demand_nodes.index[(demand_nodes.region == R) & (demand_nodes.type == t)]
            sc['%s_%s' % (R, t)] = sc[ix].sum(axis=1)
            sv['%s_%s' % (R, t)] = sv[ix].sum(axis=1)

    for P in portfolio.region.unique():
        for k in portfolio.supplytype.unique():
            for t in portfolio.type.unique():
                ix = portfolio.index[(portfolio.region == P) & (portfolio.type == t) & (portfolio.supplytype == k)]
                flow['%s_%s_%s' % (P, k, t)] = flow[ix].sum(axis=1)

    sc.to_csv(fp + '/shortage_cost.csv')
    sv.to_csv(fp + '/shortage_volume.csv')
    flow.to_csv(fp + '/flow.csv')
def remove_debug_links(self):
    df = self.df
    ix = df.index[df.index.str.contains('DBUG')]
    df.drop(ix, inplace=True, axis=0)
    self.nodes = pd.unique(df[['i', 'j']].values.ravel()).tolist()
    self.links = list(zip(df.i, df.j, df.k))
    return df
def nominal_to_numeric(array):
    mapper = {name: i for i, name in enumerate(pd.unique(array))}
    return np.array([mapper[name] for name in array])
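A small illustrative call for nominal_to_numeric above (the sample labels are made up): the integer codes follow first-appearance order because pd.unique does not sort.

codes = nominal_to_numeric(['red', 'blue', 'red', 'green'])
print(codes)   # [0 1 0 2]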
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False):
    super().__init__(data_dir, work_dir)

    self.meta = pd.read_csv(data_dir + 'esc50.csv')

    self.train_folds = train_folds
    self.validation_folds = validation_folds
    self.test_folds = test_folds

    self.class_count = 50
    self.bands = 60
    self.segment_length = 101

    self.esc10 = esc10
    if self.esc10:
        self.class_count = 10
        self.meta = self.meta[self.meta['esc10']]
        self.categories = pd.unique(self.meta.sort_values('target')['category'])
        self.meta['target'] = self.to_targets(self.meta['category'])
    else:
        self.categories = pd.unique(self.meta.sort_values('target')['category'])

    self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
    self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
    self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

    self._validation_size = len(self.validation_data.meta)
    self._test_size = len(self.test_data.meta)

    self._generate_spectrograms()
    self._populate(self.validation_data)
    self._populate(self.test_data)
def dataconf_eval_time_check(self, _wf_data_conf_node, _node_name):
    """
    Check whether the given data conf node is the evaluation data node
    (i.e. its name contains 'evaldata').

    :param data_dfconf_list: (nn00001_1_dataconf_node)
    :return: True if the node is an evaluation data node
    """
    _value = False
    if ('evaldata' in _node_name):
        _value = True
    return _value
def set_dataconf_for_labels(self, df, label):
    """
    Extract the distinct values of the label column from the CSV-derived dataframe.

    :param wf_data_config, df, nnid, ver, node:
    :param conf_data:
    """
    # TODO : set_default_dataconf_from_csv
    label_values = pd.unique(df[label].values.ravel().astype('str')).tolist()
    return label_values
def test_get_events(self, mock_query):
    urlread_sideeffect = ["""1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
"""]
    data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
    # assert only the first two events were successfully saved
    assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
    # AND data to save has length 2:
    assert len(data) == 2

    # now download again, with an url error:
    urlread_sideeffect = [413, """1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
""", URLError('blabla23___')]
    data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
    # assert we got the same result as above:
    assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
    assert len(data) == 2
    # and since the first response is 413, the request was split in two and the
    # second response is our URLError (we could test it better, anyway):
    assert "blabla23___" in self.log_msg()
def sort_eg_attributes(df, attributes=['doh', 'ldate'],
                       reverse_list=[0, 0], add_columns=False):
    '''Sort master list attribute columns by employee group in preparation
    for list construction.
    The overall master list structure and order is unaffected, only the
    selected attribute columns are sorted (normally date-related columns
    such as doh or ldate)

    inputs

        df
            The master data dataframe (does not need to be sorted)
        attributes
            columns to sort by eg (inplace)
        reverse_list
            If an attribute is to be sorted in reverse order (descending),
            use a '1' in the list position corresponding to the position
            of the attribute within the attributes input
        add_columns
            If True, an additional column for each sorted attribute will be
            added to the resultant dataframe, with the suffix '_sort' added
            to it.
    '''
    date_cols = []
    for col in df:
        if (df[col]).dtype == 'datetime64[ns]':
            date_cols.append(col)
    try:
        df.sort_values(['eg', 'eg_number'], inplace=True)
    except LookupError:
        df.sort_values(['eg', 'eg_order'], inplace=True)
    egs = df.eg.values

    i = 0
    for measure in attributes:
        data = df[measure].values
        measure_col = np.empty_like(data)

        for eg in pd.unique(df.eg):
            measure_slice = data[egs == eg]
            measure_slice_index = np.where(egs == eg)[0]
            measure_slice_sorted = np.sort(measure_slice, axis=0)

            if reverse_list[i]:
                measure_slice_invert = measure_slice_sorted[::-1]
                measure_slice_sorted = measure_slice_invert
            np.put(measure_col, measure_slice_index, measure_slice_sorted)

        if add_columns:
            col_name = measure + '_sort'
        else:
            col_name = measure

        df[col_name] = measure_col

        if measure in date_cols:
            df[col_name] = pd.to_datetime(df[col_name].dt.date)
        i += 1

    return df
def unique(lst):
    """
    Return unique elements

    :class:`pandas.unique` and :class:`numpy.unique` cast
    mixed type lists to the same type. They are faster,
    but sometimes we want to maintain the type.

    Parameters
    ----------
    lst : list-like
        List of items

    Returns
    -------
    out : list
        Unique items in the order that they appear in the input.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> lst = ['one', 'two', 123, 'three']
    >>> pd.unique(lst)
    array(['one', 'two', '123', 'three'], dtype=object)
    >>> np.unique(lst)
    array(['123', 'one', 'three', 'two'], dtype='<U5')
    >>> unique(lst)
    ['one', 'two', 123, 'three']

    pandas and numpy cast 123 to a string, and numpy does not
    even maintain the order.
    """
    seen = set()

    def make_seen(x):
        seen.add(x)
        return x

    return [make_seen(x) for x in lst if x not in seen]
def test_fetch_translate_clean_integration(self):
    self.subject.fetch()
    files = ["Ano-{}.csv".format(n) for n in [2017]]
    files.append('datasets-format.html')
    for name in files:
        file_path = os.path.join(self.path, name)
        assert(os.path.exists(file_path))

    self.subject.translate()
    for name in ["reimbursements-{}.xz".format(n) for n in self.years]:
        file_path = os.path.join(self.path, name)
        assert(os.path.exists(file_path))

    self.subject.clean()
    file_path = os.path.join(self.path, 'reimbursements.xz')
    assert(os.path.exists(file_path))

    # test for subquota translation
    dataset = pd.read_csv(file_path, compression='xz')
    all_subquotas = ['Maintenance of office supporting parliamentary activity',
                     'Locomotion, meal and lodging',
                     'Fuels and lubricants',
                     'Consultancy, research and technical work',
                     'Publicity of parliamentary activity',
                     'Purchase of office supplies',
                     'Software purchase or renting; Postal services; Subscriptions',
                     'Security service provided by specialized company',
                     'Flight tickets',
                     'Telecommunication',
                     'Postal services',
                     'Publication subscriptions',
                     'Congressperson meal',
                     'Lodging, except for congressperson from Distrito Federal',
                     'Automotive vehicle renting or watercraft charter',
                     'Aircraft renting or charter of aircraft',
                     'Automotive vehicle renting or charter',
                     'Watercraft renting or charter',
                     'Taxi, toll and parking',
                     'Terrestrial, maritime and fluvial tickets',
                     'Participation in course, talk or similar event',
                     'Flight ticket issue']
    present_subquotas = pd.unique(dataset['subquota_description'])
    for subquota in present_subquotas:
        assert(subquota in all_subquotas)
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds,
             esc10=False, downsample=True):
    super().__init__(data_dir, work_dir)

    self.meta = pd.read_csv(data_dir + 'esc50.csv')

    self.train_folds = train_folds
    self.validation_folds = validation_folds
    self.test_folds = test_folds

    self.class_count = 50

    self.DOWNSAMPLE = downsample
    self.SEGMENT_LENGTH = 300
    self.BANDS = 180
    self.WITH_DELTA = False
    self.FMAX = 16000
    self.FFT = 2205
    self.HOP = 441

    self.esc10 = esc10
    if self.esc10:
        self.class_count = 10
        self.meta = self.meta[self.meta['esc10']]
        self.categories = pd.unique(self.meta.sort_values('target')['category'])
        self.meta['target'] = self.to_targets(self.meta['category'])
    else:
        self.categories = pd.unique(self.meta.sort_values('target')['category'])

    self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
    self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
    self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

    self._validation_size = len(self.validation_data.meta)
    self._test_size = len(self.test_data.meta)

    self._generate_spectrograms()

    if self.DOWNSAMPLE:
        self.SEGMENT_LENGTH //= 2
        self.BANDS //= 3

    self._populate(self.validation_data)
    self._populate(self.test_data)
def set_dataconf_for_checktype(self, df, node_id, data_dfconf_list):
    """
    Read the column types of the CSV-derived dataframe into data_conf and,
    for categorical columns, collect the unique cell values into
    cell_feature_unique (used on the Keras side).

    :param wf_data_config, df, nnid, ver, node:
    :param conf_data:
    """
    try:
        # TODO : set_default_dataconf_from_csv
        data_conf = dict()
        data_conf_unique_v = dict()
        data_conf_col_unique_v = dict()
        data_conf_col_type = dict()
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

        # If a data_dfconf already exists for this Wdnn node, fetch the previously
        # stored unique values so they can be merged with the current ones.
        if len(data_dfconf_list) > 0:
            _wf_data_conf = wf_data_conf(data_dfconf_list)
            _cell_feature_unique = _wf_data_conf.cell_feature_unique if hasattr(
                _wf_data_conf, 'cell_feature_unique') else list()

            # Inspect every column and decide whether it is continuous or categorical
            for i, v in df.dtypes.iteritems():  # label
                column_dtypes = dict()
                column_unique_value = dict()
                if (str(v) in numerics):  # maybe need float
                    col_type = 'CONTINUOUS'
                    columns_unique_value = list()
                else:
                    col_type = 'CATEGORICAL'
                    # fill nulls with '' before collecting the unique values
                    columns_unique_value = pd.unique(df[i].fillna('').values.ravel()).tolist()

                column_dtypes['column_type'] = col_type

                origin_feature_unique = _cell_feature_unique[i].get('column_u_values') if (
                    i in _cell_feature_unique) else list()
                # keep the previously stored unique values as well
                combined_col_u_list = utils.get_combine_label_list(origin_feature_unique,
                                                                   columns_unique_value)
                column_unique_value['column_u_values'] = combined_col_u_list

                data_conf_col_type[i] = column_dtypes
                data_conf_col_unique_v[i] = column_unique_value

        data_conf['cell_feature'] = data_conf_col_type
        data_conf_unique_v['cell_feature_unique'] = data_conf_col_unique_v

        # serialize to JSON
        data_conf_json_str = json.dumps(data_conf)
        data_conf_json = json.loads(data_conf_json_str)
        data_conf_unique_json_str = json.dumps(data_conf_unique_v)
        data_conf_unique_json = json.loads(data_conf_unique_json_str)

        return data_conf_json, data_conf_unique_json
    except Exception as e:
        logging.error("set_dataconf_for_checktype {0} {1}".format(e, e.__traceback__.tb_lineno))