The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.to_numeric().
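Before the project examples, here is a minimal sketch of the basic call pattern most of them rely on (the DataFrame and column names below are invented for illustration): pd.to_numeric converts a Series, array, or scalar to a numeric dtype, and the errors argument controls how unparseable values are handled: 'raise' (the default) raises an exception, 'coerce' replaces them with NaN, and 'ignore' (seen in several examples below) returns the input unchanged. Applying it column-wise with DataFrame.apply converts an entire frame at once.

# Minimal usage sketch (illustrative column names)
import pandas as pd

# Illustrative data: the "price" column contains one value that cannot be parsed.
df = pd.DataFrame({"price": ["1.50", "2.25", "n/a"], "qty": ["3", "4", "5"]})

# Convert a single column; unparseable entries become NaN with errors='coerce'.
df["price"] = pd.to_numeric(df["price"], errors="coerce")

# Convert every column of a DataFrame by applying to_numeric column-wise.
df = df.apply(pd.to_numeric, errors="coerce")

print(df.dtypes)  # price -> float64, qty -> int64
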
def checkFSXvalsAgainstADNIMERGE(tadpoleDF, mriADNI1FileFSX, otherSSvisCodeStr, ssNameTag, ignoreMissingCols=False):
    nrRows, nrCols = tadpoleDF.shape
    colListOtherSS = list(ssDF.columns.values)
    colListTadpoleDF = list(tadpoleDF.columns.values)

    tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]] = \
        tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]].apply(pd.to_numeric, errors='coerce')
    tadpoleDF['HIPPOSUM'] = tadpoleDF['ST29SV%s' % ssNameTag] + tadpoleDF['ST88SV%s' % ssNameTag]

    for r in range(nrRows):
        valsNan = np.isnan(tadpoleDF['Hippocampus'][r]) or (np.isnan(tadpoleDF['ST29SV%s' % ssNameTag][r]) and
                                                            np.isnan(tadpoleDF['ST88SV%s' % ssNameTag][r]))
        if valsNan:
            continue
        valsNotEq = tadpoleDF['Hippocampus'][r] != (tadpoleDF['ST29SV%s' % ssNameTag][r] + tadpoleDF['ST88SV%s' % ssNameTag][r])
        if valsNotEq:
            print('entries dont match\n ', tadpoleDF[['RID', 'VISCODE', 'Hippocampus', 'ST29SV%s' % ssNameTag,
                                                      'ST88SV%s' % ssNameTag, 'HIPPOSUM']].iloc[r])

    # Conclusion: the reason why entries above don't match is because UCSFFSX has
    # duplicate entries for the same subject and viscode.

def load_submission(self, submission_file):
    loc_submission = pd.read_csv(submission_file, header=None)
    build_proc_sub = loc_submission[0].str.split(' ').values.tolist()
    assert len(build_proc_sub[0]) == self.n_classes + len(self.submission_columns)
    proc_sub = pd.DataFrame.from_records(build_proc_sub,
                                         columns=[self.submission_columns + list(range(self.n_classes))])
    if self.subset is not None:
        if type(proc_sub['frame_id'].values[0]) is np.ndarray:
            mask = [True if x[0] in self.subset else False for x in proc_sub['frame_id'].values]
        else:
            # old pandas version
            mask = [True if x in self.subset else False for x in proc_sub['frame_id'].values]
        proc_sub = proc_sub[mask]
        assert np.any(np.array(mask))
    num_proc_sub = proc_sub.apply(pd.to_numeric, errors='ignore')
    grouped_by_vid = num_proc_sub
    self.submission = grouped_by_vid

def build_dataframe(self):
    if not self.values.exists():
        return pd.DataFrame()

    # Am I really a programmer or just a lego assembler?
    # Pandas makes my life at least 20 times easier.
    df = pd.DataFrame.from_records(self.values, index=self.index_column)

    # make the columns and labels prettier
    if self.rename_columns:
        df = df.rename(columns=self.column_mapping)

    df.index.name = TIME_COLUMN_NAME

    try:
        df.index = df.index.tz_convert(self.user.pytz_timezone)
    except AttributeError:
        # if attribute-error means the index is just a regular Index and
        # that only dates (and not time) was passed
        df.index = pd.DatetimeIndex(df.index, tz=self.user.pytz_timezone)

    # cast it as numerics if possible, otherwise if we're dealing with strings, ignore
    df = df.apply(pd.to_numeric, errors='ignore')

    return df

def _cond_ind_effects_wrapper(self):
    """
    A wrapper for the conditional indirect effects.
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the conditional indirect effects.
    """
    symb_to_var = self._symb_to_var
    results = self.estimation_results
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
    cols_stats = ["Effect", "Boot SE", "BootLLCI", "BootULCI"]

    mod_values = self._moderators_values
    med_values = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
    values = med_values + mod_values

    rows_levels = np.array([i for i in product(*values)])
    cols_levels = ["Mediator"] + [symb_to_var.get(x, x) for x in self._moderators_symb]

    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = cols_levels + cols_stats
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])

def _simple_ind_effects_wrapper(self):
    """
    A wrapper for the indirect effects (and for total/contrast effects if specified)
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the simple/total/contrasts of indirect effects.
    """
    symb_to_var = self._symb_to_var
    results = self.estimation_results
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T

    med_names = [symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]
    rows_levels = []
    if self._options["total"]:
        rows_levels += ["TOTAL"]
    rows_levels += med_names
    if self._options["contrast"]:
        contrasts = ["Contrast: {} vs. {}".format(a, b) for a, b in combinations(med_names, 2)]
        rows_levels += contrasts
    rows_levels = np.array(rows_levels).reshape(-1, 1)

    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = ["", "Effect", "Boot SE", "BootLLCI", "BootULCI"]
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])

def _PMM_index_wrapper(self):
    """
    A wrapper for the Partial Moderated Mediation index.
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the PMM index.
    """
    symb_to_var = self._symb_to_var
    results = self._PMM_index()
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
    cols_stats = ["Index", "Boot SE", "LLCI", "ULCI"]

    mod_names = [[symb_to_var.get(i, i) for i in self._moderators_symb]]
    med_names = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
    values = mod_names + med_names

    rows_levels = np.array([i for i in product(*values)])
    cols_levels = ["Moderator", "Mediator"]

    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = cols_levels + cols_stats
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])

def _MMM_index_wrapper(self):
    """
    A wrapper for the Moderated Moderated Mediation index.
    :return: pd.DataFrame
        A DataFrame of effects, se, llci, and ulci, for the CMM index.
    """
    symb_to_var = self._symb_to_var
    results = self._MMM_index()
    rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
    cols_stats = ["Index", "Boot SE", "BootLLCI", "BootULCI"]

    med_names = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]

    rows_levels = np.array([i for i in product(*med_names)])
    cols_levels = ["Mediator"]

    rows = np.concatenate([rows_levels, rows_stats], axis=1)
    cols = cols_levels + cols_stats
    df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
    return df.apply(pd.to_numeric, args=["ignore"])

def clean_data(DT_df, attributes):
    """data preprocessing"""
    # DT_df = DT_df.drop(drop_cols, axis=1)
    DT_df["fs_scan_amt_pre"] = DT_df["fs_scan_amt_pre"].astype(float)
    DT_df["fs_scan_amt_pos"] = DT_df["fs_scan_amt_pos"].astype(float)
    DT_df["fs_scan_amt_pos_PF"] = DT_df["fs_scan_amt_pos_PF"].astype(float)
    DT_df["dyn_margin_amt_pre"] = DT_df["dyn_margin_amt_pre"].astype(float)
    DT_df["dyn_margin_amt_pos"] = DT_df["dyn_margin_amt_pos"].astype(float)
    DT_df["dyn_margin_amt_pos_PF"] = DT_df["dyn_margin_amt_pos_PF"].astype(float)
    DT_df["ctl_grp_ind"] = DT_df["ctl_grp_ind"].astype(int)
    DT_df["mailer_version_id"] = DT_df["mailer_version_id"].astype(int)
    DT_df["tcm_redeem_md"] = pd.to_numeric(DT_df["tcm_redeem_md"])
    for attr in attributes:
        DT_df[attr] = DT_df[attr].astype(int)
    fields = attributes + ["fs_scan_amt_pre", "fs_scan_amt_pos",
                           "fs_scan_amt_pos_PF", "dyn_margin_amt_pre",
                           "dyn_margin_amt_pos", "dyn_margin_amt_pos_PF",
                           "ctl_grp_ind", "mailer_version_id",
                           "tcm_redeem_md", "xtra_card_nbr"]
    DT_df = DT_df[fields]
    return DT_df

def _get_table(self, column, is_size=True):
    cols = list(range(5))
    cols.append(self.header.index(column))
    header = [self.header[c] for c in cols]
    rows = [
        [row[c] for c in cols]
        for row in self.rows
    ]
    if is_size:
        for row in rows:
            row[5] = parse_size(row[5])
    table = pd.DataFrame.from_records(rows, columns=header)
    table = table.rename(columns={
        'prog': 'Program',
        'prog2': 'Program2',
        'threads': 'Threads',
        'dataset': 'Dataset',
        'qcut': 'Quality',
    })
    table['Threads'] = pd.to_numeric(table['Threads'])
    table['Dataset'] = pd.Categorical(table['Dataset'])
    table['Program'] = pd.Categorical(table['Program'])
    table['Program2'] = pd.Categorical(table['Program2'])
    return table

def __init__(self, filename=TABLE_FILENAME):
    MS = SpectralTypeRelations.MainSequence()

    # Read in the table.
    colspecs = [[0, 7], [7, 14], [14, 21], [21, 28], [28, 34], [34, 40],
                [40, 47], [47, 55], [55, 63], [63, 70], [70, 78], [78, 86],
                [86, 94], [94, 103], [103, 110], [110, 116], [116, 122],
                [122, 130], [130, 137], [137, 144], [144, 151], [151, 158]]
    mam_df = pd.read_fwf(filename, header=20, colspecs=colspecs, na_values=['...'])[:92]

    # Strip the * from the logAge column. Probably shouldn't but...
    mam_df['logAge'] = mam_df['logAge'].map(lambda s: s.strip('*') if isinstance(s, basestring) else s)

    # Convert everything to floats
    for col in mam_df.columns:
        mam_df[col] = pd.to_numeric(mam_df[col], errors='ignore')

    # Add the spectral type number for interpolation
    mam_df['SpTNum'] = mam_df['SpT'].map(MS.SpT_To_Number)

    self.mam_df = mam_df

def load_metrics_from_db(db_path, tx_mode, aln_mode):
    """
    Loads the alignment metrics for the mRNA/CDS alignments of transMap/AugustusTM/TMR
    """
    session = tools.sqlInterface.start_session(db_path)
    metrics_table = tools.sqlInterface.tables[aln_mode][tx_mode]['metrics']
    metrics_df = tools.sqlInterface.load_metrics(metrics_table, session)
    # unstack flattens the long-form data structure
    metrics_df = metrics_df.set_index(['AlignmentId', 'classifier']).unstack('classifier')
    metrics_df.columns = [col[1] for col in metrics_df.columns]
    metrics_df = metrics_df.reset_index()
    cols = ['AlnCoverage', 'AlnGoodness', 'AlnIdentity', 'PercentUnknownBases']
    metrics_df[cols] = metrics_df[cols].apply(pd.to_numeric)
    metrics_df['OriginalIntrons'] = metrics_df['OriginalIntrons'].fillna('')
    metrics_df['OriginalIntrons'] = [list(map(int, x)) if len(x[0]) > 0 else [] for x in
                                     metrics_df['OriginalIntrons'].str.split(',').tolist()]
    metrics_df['OriginalIntronsPercent'] = metrics_df['OriginalIntrons'].apply(calculate_vector_support, resolve_nan=1)
    session.close()
    return metrics_df

def create_routing_table(bgp=None, ixp_prefixes=None, ixp_asns=None, bgp_compression='infer'):
    log.info('Creating IP2AS tool.')
    if bgp_compression == 'infer' and bgp.startswith('http'):
        bgp_compression = infer_compression(bgp, 'infer')
    if not isinstance(ixp_prefixes, pd.DataFrame):
        ixp_prefixes = set(pd.read_csv(ixp_prefixes, comment='#', index_col=0).index.unique()) if ixp_prefixes is not None else set()
    if not isinstance(ixp_asns, pd.DataFrame):
        ixp_asns = set(pd.read_csv(ixp_asns, comment='#', index_col=0).index.unique()) if ixp_asns is not None else set()
    if not isinstance(bgp, pd.DataFrame):
        bgp_original = pd.read_table(bgp, comment='#', names=['Address', 'Prefixlen', 'ASN'], compression=bgp_compression)
        bgp = bgp_original[~bgp_original.ASN.str.contains(',|_')].copy()
        bgp['ASN'] = pd.to_numeric(bgp.ASN)
    rt = RoutingTable()
    for address, prefixlen, asn in bgp[~bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_prefix(asn.item(), address, prefixlen)
    for address, prefixlen, asn in bgp[bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_ixp(address, prefixlen)
    for prefix in ixp_prefixes:
        rt.add_ixp(prefix)
    rt.add_private()
    rt.add_multicast()
    rt.add_default()
    return rt

def assemble_row_metadata(full_df, num_col_metadata, num_data_rows, num_row_metadata):
    # Extract values
    row_metadata_row_inds = range(num_col_metadata + 1, num_col_metadata + num_data_rows + 1)
    row_metadata_col_inds = range(1, num_row_metadata + 1)
    row_metadata = full_df.iloc[row_metadata_row_inds, row_metadata_col_inds]

    # Create index from the first column of full_df (after the filler block)
    row_metadata.index = full_df.iloc[row_metadata_row_inds, 0]

    # Create columns from the top row of full_df (before cids start)
    row_metadata.columns = full_df.iloc[0, row_metadata_col_inds]

    # Rename the index name and columns name
    row_metadata.index.name = row_index_name
    row_metadata.columns.name = row_header_name

    # Convert metadata to numeric if possible
    row_metadata = row_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))

    return row_metadata

def assemble_col_metadata(full_df, num_col_metadata, num_row_metadata, num_data_cols):
    # Extract values
    col_metadata_row_inds = range(1, num_col_metadata + 1)
    col_metadata_col_inds = range(num_row_metadata + 1, num_row_metadata + num_data_cols + 1)
    col_metadata = full_df.iloc[col_metadata_row_inds, col_metadata_col_inds]

    # Transpose so that samples are the rows and headers are the columns
    col_metadata = col_metadata.T

    # Create index from the top row of full_df (after the filler block)
    col_metadata.index = full_df.iloc[0, col_metadata_col_inds]

    # Create columns from the first column of full_df (before rids start)
    col_metadata.columns = full_df.iloc[col_metadata_row_inds, 0]

    # Rename the index name and columns name
    col_metadata.index.name = column_index_name
    col_metadata.columns.name = column_header_name

    # Convert metadata to numeric if possible
    col_metadata = col_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))

    return col_metadata

def _orderbook_tag_frame(text):
    # This function can be removed if this pandas feature request is implemented
    # https://github.com/pandas-dev/pandas/issues/14608
    table_str = _table_text(text)
    root = etree.fromstring(table_str)
    table_body = root.find('tbody')
    index = []
    data = defaultdict(list)
    # Iterator of tr objects
    qty_path = "td[@class='change-cell quantity']"
    tr_iter = table_body.iter(tag='tr')
    for tr in tr_iter:
        index.append(tr.find(path='td').text.strip())
        # Quantity Held
        pos = pd.to_numeric(tr.find(path=qty_path).attrib['value'])
        data[iem.QUANTITY_HELD].append(pos)
        # Your Bids
        data[iem.YOUR_BIDS].append(_num_open_orders(tr, 'yourBidsCell'))
        # Your Asks
        data[iem.YOUR_ASKS].append(_num_open_orders(tr, 'yourAsksCell'))
    return pd.DataFrame(data=data, index=index)

def apply_ht_scores(dataframe):
    # Load the ht score dataframe
    ht_scores = pandas.read_csv('{0}ht_scores.csv'.format(config['result_data']), index_col=0)
    dataframe['phone'] = dataframe['phone'].map(lambda x: re.sub('[^0-9]', '', str(x)))
    # Make the column a numeric column for merging
    dataframe['phone'] = pandas.to_numeric(dataframe['phone'])
    final = dataframe.merge(ht_scores, how='left', left_on='phone', right_index=True)
    # Drop the content column and drop the index column
    final.drop('content', axis=1, inplace=True)
    if os.path.isfile('{0}ad_chars_final.csv'.format(config['result_data'])):
        lock.acquire()
        print 'lock has been set for file {0}'.format(file)
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), mode='a', header=False, encoding='utf-8')
        lock.release()
        print 'lock has been released for file {0}'.format(file)
    else:
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), header=True, encoding='utf-8')

def apply_ht_scores(dataframe):
    # Load the ht score dataframe
    ht_scores = pandas.read_csv('{0}ht_scores.csv'.format(config['result_data']), index_col=0)
    dataframe['phone'] = dataframe['phone'].map(lambda x: re.sub('[^0-9]', '', str(x)))
    # Make the column a numeric column for merging
    #dataframe['phone'] = pandas.to_numeric(dataframe['phone'])
    final = dataframe.merge(ht_scores, how='left', left_on='phone', right_index=True)
    # Drop the content column and drop the index column
    final.drop('content', axis=1, inplace=True)
    if os.path.isfile('{0}ad_chars_final.csv'.format(config['result_data'])):
        lock.acquire()
        print 'lock has been set for file {0}'.format(file)
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), mode='a', header=False, encoding='utf-8', index=False)
        lock.release()
    else:
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), header=True, encoding='utf-8', index=False)

def makeDataFrame(phases):
    """ Return Pandas DataFrame object, with CIF files as index and ellipsoid parameters
    as columns (hierarchical by centre atom)"""
    import pandas as pd
    from pieface.readcoords import Crystal

    if isinstance(phases, dict):
        if isinstance(phases[phases.keys()[0]], Crystal):
            # We are reading a dict of Crystals: convert to nested dict first
            alldata = makenesteddict(phases)
        elif isinstance(phases[phases.keys()[0]], dict):
            # Looking at a dict of dicts: assume correct for pandas...
            alldata = phases
        d = dict([(i, pd.DataFrame(alldata[i]).set_index('files')) for i in alldata.keys()])  # Make dict of DataFrames
        frame = pd.concat(d, axis=1)
        if len(frame.index) == 1:
            # We're looking at a single cif file - unstack DataFrame with atoms as index
            # Need to convert back to float/int when unstacking
            return frame.ix[frame.index[0]].unstack().apply(pd.to_numeric, errors='ignore')
        else:
            return frame
    else:
        raise TypeError("Unknown data format for conversion to DataFrame (expected dict)")

def _return_appropiate_type(self, selected):
    if isinstance(selected, pd.Series):
        frame = pd.DataFrame(selected).T
        if self._required_cols <= set(frame.columns):
            selected = frame.apply(pd.to_numeric, errors='ignore')
        else:
            return selected

    if (isinstance(selected, pd.DataFrame)
            and self._required_cols <= set(selected.columns)):
        molecule = self.__class__(selected)
        molecule.metadata = self.metadata.copy()
        molecule._metadata = copy.deepcopy(self._metadata)
        return molecule
    else:
        return selected

def _augment_lmfit_modelresult(result):
    """Tidy data values and fitted model from `lmfit.model.ModelResult`.
    """
    columns = ['x', 'data', 'best_fit', 'residual']
    d = pd.DataFrame(index=range(result.ndata), columns=columns)
    for col in columns[1:]:
        d.loc[:, col] = getattr(result, col)

    independent_vars = result.model.independent_vars
    if len(independent_vars) == 1:
        independent_var = independent_vars[0]
    else:
        msg = ('Only 1 independent variable is currently supported.\n'
               'Found independent variables: %s' % str(independent_vars))
        raise NotImplementedError(msg)

    x_array = result.userkws[independent_var]
    d.loc[:, 'x'] = x_array

    if len(result.components) > 1:
        comp_names = [c.name for c in result.components]
        for cname, comp in zip(comp_names, result.components):
            d.loc[:, cname] = comp.eval(x=d.x, **result.values)
    return d.apply(pd.to_numeric, errors='ignore')

def __init__(self, symbol, *args):
    super().__init__()
    self.data = pd.read_csv(open(r"Stock_Data/{}.csv".format(symbol)))
    self.data = self.data.apply(pd.to_numeric, errors="ignore")
    self.data.index = self.data["Quarter end"]
    self.name = symbol

    if self.data["Price"].dtype in (int, float) and self.data["Cumulative dividends per share"].dtype in (int, float):
        self.data["Value"] = self.data["Price"] + self.data["Cumulative dividends per share"]
        # Calculation of the estimated return
        self.data["Estimated Return"] = self.data["Value"].pct_change()
        # Calculation of the standard deviation
        self.data["Standard Deviation"] = self.data["Value"].std()
    else:
        self.complete_pricelist = False

def __call__(self, fields, geo_for, geo_in=None, cache=NopCache()):
    """Special method to make API object invocable.

    Arguments:
      * fields: list of variables to return.
      * geo_* fields must be given as dictionaries, eg:
        `{'county': '*'}`
      * cache: cache in which to store results. Not cached by default.
    """
    params = {
        'get': ','.join(fields),
        'key': self.key,
        'for': self._geo2str(geo_for),
    }
    if geo_in:
        params['in'] = self._geo2str(geo_in)

    j = fetchjson(self.endpoint, cache, self.session, params=params)
    ret = pd.DataFrame(data=j[1:], columns=j[0])
    for field in fields:
        if self.variables[field].get('predicateType') == 'int':
            ret[field] = pd.to_numeric(ret[field])
    return ret

def read_data(file_name):
    stock = pd.read_csv(file_name, parse_dates=True, index_col=0)
    n_samples = len(stock)

    # ditch samples with NAN values
    stock = stock.dropna(axis=0)

    # flip order from newest to oldest to oldest to newest
    #stock = stock.iloc[::-1]

    # trim data
    stock = stock[['Open']]

    # convert object to floats
    stock['Open'] = pd.to_numeric(stock['Open'], errors='coerce')

    # all stock is needed to walk back dates for testing hold out data
    return stock

#############################################################################################
# load and combine stock indexes, matching the dates

def get_qstat_as_df():
    """Get the current users output of qstat as a DataFrame.
    """
    user = os.environ.get("USER")
    try:
        ret = subprocess.Popen(
            ["qstat", "-u", str(user)],
            stdout=subprocess.PIPE,
        )
        df = pd.read_csv(ret.stdout, delimiter="\s+")
        # drop the first line since it is just one long line
        df = df.drop(df.index[0]).copy()
        # convert objects to numeric otherwise numbers are strings
        df["JOBID"] = pd.to_numeric(df["job-ID"], errors='coerce')
        # df.set_index("JOBID")
        df = df.drop('job-ID', 1)
    except ValueError:
        logger.exception("No jobs in queues for user {}".format(user))
        df = pd.DataFrame()
    return df

def get_data_from_google(ticker_sym, start, end):
    """ Returns a data frame of data for a given stock between two dates """
    url = "https://www.google.com/finance/historical?q=%s&startdate=%s&enddate=%s&output=csv" % (ticker_sym, start, end)
    s = requests.get(url).content
    df = pd.read_csv(io.StringIO(s.decode('utf-8')))
    df['Date'] = pd.to_datetime(df['Date'])
    df['epoch'] = (df['Date'] - datetime(1970, 1, 1)).dt.total_seconds() * 1000
    df.set_index('Date')
    df['Adj_Close'] = df['Close']  # google's api doesn't provide so just assume it's the same
    cols = ['High', 'Low', 'Volume', 'Open', 'Close', 'Adj_Close']
    for c in cols:  # cast columns to numeric
        df[c] = pd.to_numeric(df[c])
    return df.iloc[::-1]  # reverse the dataframe so index 0 is the earliest date

#@memoize
#def get_data_for_sym(ticker_sym, start, end):
#    return list(reversed(get_data_for_sym_from_yahoo(ticker_sym, start, end)))
#
#res = StockFeature.select().where(Relationship.from_user == self))

def calc_AB(vcf):
    ''' Calculate allele balance for all samples in a given pdVCF.
        Also converts DP & GQ to numeric type.

    Args:
        vcf: pdVCF with genotype information extracted

    Notes:
        ONLY WORKS FOR BIALLELIC VARIANTS
    '''
    sam = vcf.columns.levels[0][0]
    vcf[sam, 'DP'] = pd.to_numeric(vcf[sam, 'DP'].str.replace('.', '0'))  # bcftools places '.' in empty fields
    vcf[sam, 'GQ'] = pd.to_numeric(vcf[sam, 'GQ'].str.replace('.', '0'))
    AD = vcf.xs('AD', level=1, axis=1).unstack().str.split(",", n=2)
    DP = vcf.xs('DP', level=1, axis=1).unstack()
    AB = round(pd.to_numeric(AD.str[1]) / pd.to_numeric(DP), 2)
    vcf[sam, 'AB'] = AB.tolist()
    return vcf

def update_distances(self):
    """
    Calculate the distances between the observed series and the stresses.

    Returns
    -------
    distances: pandas.DataFrame
        pandas dataframe with the distances between the oseries (index)
        and the stresses (columns).

    """
    # Make sure these are values, even when actually objects.
    xo = pd.to_numeric(self.oseries.x)
    xt = pd.to_numeric(self.stresses.x)
    yo = pd.to_numeric(self.oseries.y)
    yt = pd.to_numeric(self.stresses.y)
    xh, xi = np.meshgrid(xt, xo)
    yh, yi = np.meshgrid(yt, yo)
    self.distances = pd.DataFrame(np.sqrt((xh - xi) ** 2 + (yh - yi) ** 2),
                                  index=self.oseries.index,
                                  columns=self.stresses.index)

def parse(self, entry):
    data = pd.read_csv(str(entry), engine="c", sep="\t",
                       parse_dates=False, index_col=[0, 1])
    data.index.names = ["date", "srcid"]

    # Check for AMT bug that adds row of ('nvsplDate', 'Total_All') with all 0s, drop if exists
    if data.index[-1][0] == 'nvsplDate':
        data = data.iloc[:-1, :]

    ## Pandas cannot seem to handle a MultiIndex with dates;
    ## slicing syntax becomes even crazier, and often doesn't even work.
    ## So date conversion is disabled for now.
    #
    # # Convert dates
    # datetimes = data.index.get_level_values('date').to_datetime()
    # data.index.set_levels(datetimes, level='date', inplace=True)

    # Ensure MultiIndex sortedness
    data.sortlevel(inplace=True)

    return data.apply(pd.to_numeric, raw=True, errors="coerce")

def to_numeric(self, columns):
    '''
    Args
        columns (string or list): column names needed to be converted

    Returns
        -
    '''
    if isinstance(columns, str):
        self.data_df[columns] = pd.to_numeric(self.data_df[columns], errors='coerce')
    elif isinstance(columns, list):
        for column in columns:
            self.data_df[column] = pd.to_numeric(self.data_df[column], errors='coerce')

# rename certain columns

def df_coach_bsites(self):
    df_cols = ['site_num', 'c_score', 'cluster_size', 'algorithm',
               'pdb_template_id', 'pdb_template_chain', 'pdb_ligand',
               'binding_location_coords', 'c_score_method', 'binding_residues',
               'ligand_cluster_counts']

    bsites_inf_df = pd.DataFrame.from_records(self.coach_bsites, columns=df_cols).drop_duplicates().reset_index(drop=True)

    if bsites_inf_df.empty:
        log.warning('Empty dataframe')
        return bsites_inf_df
    else:
        bsites_inf_df['c_score'] = pd.to_numeric(bsites_inf_df.c_score, errors='coerce')
        bsites_inf_df['cluster_size'] = pd.to_numeric(bsites_inf_df.cluster_size, errors='coerce')
        return ssbio.utils.clean_df(bsites_inf_df)

def df_coach_go(self):
    cols = ['go_id', 'go_term', 'c_score']

    go_all_df = pd.DataFrame()
    for go_list in [self.coach_go_mf, self.coach_go_cc, self.coach_go_bp]:
        go_df = pd.DataFrame.from_records(go_list, columns=cols).drop_duplicates().reset_index(drop=True)
        go_df['c_score'] = pd.to_numeric(go_df.c_score, errors='coerce')

        if go_all_df.empty:
            go_all_df = go_df
        else:
            go_all_df.append(go_df)

    return go_all_df

def parse_coach_ec_df(infile):
    """Parse the EC.dat output file of COACH and return a dataframe of results

    EC.dat contains the predicted EC number and active residues.
    The columns are: PDB_ID, TM-score, RMSD, Sequence identity,
    Coverage, Confidence score, EC number, and Active site residues

    Args:
        infile (str): Path to EC.dat

    Returns:
        DataFrame: Pandas DataFrame summarizing EC number predictions

    """
    ec_df = pd.read_table(infile, delim_whitespace=True,
                          names=['pdb_template', 'tm_score', 'rmsd',
                                 'seq_ident', 'seq_coverage', 'c_score',
                                 'ec_number', 'binding_residues'])
    ec_df['pdb_template_id'] = ec_df['pdb_template'].apply(lambda x: x[:4])
    ec_df['pdb_template_chain'] = ec_df['pdb_template'].apply(lambda x: x[4])
    ec_df = ec_df[['pdb_template_id', 'pdb_template_chain', 'tm_score', 'rmsd',
                   'seq_ident', 'seq_coverage', 'c_score', 'ec_number', 'binding_residues']]
    ec_df['c_score'] = pd.to_numeric(ec_df.c_score, errors='coerce')
    return ec_df

def _get_peilmetingen_df(self):
    """"""
    doc_df = pd.DataFrame(list(self.get_peilmetingen()),
                          columns=["grondwaterlocatie", "filternummer",
                                   "datum", "diepte", "methode",
                                   "betrouwbaarheid"])
    doc_df["datum"] = pd.to_datetime(doc_df["datum"])
    doc_df["diepte"] = pd.to_numeric(doc_df["diepte"])
    doc_df = doc_df.set_index("datum")
    return doc_df

def _get_observaties_df(self):
    """"""
    doc_df = pd.DataFrame(list(self.get_observaties()),
                          columns=["grondwaterlocatie", "filternummer",
                                   "monsternummer", "datum", "parameter",
                                   "waarde", "eenheid", "betrouwbaarheid"])
    doc_df["datum"] = pd.to_datetime(doc_df["datum"])
    doc_df["waarde"] = pd.to_numeric(doc_df["waarde"])
    return doc_df

def filter_fastq_length_meanqual(df, min_len, max_len, min_mqual, max_mqual):
    querystring = "length >= {0} and meanQual >= {1}".format(min_len, min_mqual)
    if max_len != None:
        querystring += " and length <= {}".format(max_len)
    if max_mqual != None:
        querystring += " and meanQual <= {}".format(max_mqual)
    print("Keeping reads that satisfy: {}".format(querystring), file=stderr)
    filtdf = df.query(querystring)
    #filtdf["length"] = pd.to_numeric(filtdf["length"], errors='coerce')
    #filtdf["meanQual"] = pd.to_numeric(filtdf["meanQual"], errors='coerce')
    return filtdf

def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})

def from_csv(cls, filepath_or_buffer):

    # Import pandas lazily since it can take a moment to import
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas must be installed to use ZiplineBacktestResult")

    zipline_result = cls()

    results = pd.read_csv(
        filepath_or_buffer,
        parse_dates=["date"],
        index_col=["dataframe", "index", "date", "column"])["value"]

    # Extract returns
    returns = results.loc["returns"].unstack()
    returns.index = returns.index.droplevel(0).tz_localize("UTC")
    zipline_result.returns = returns["returns"].astype(float)

    # Extract positions
    positions = results.loc["positions"].unstack()
    positions.index = positions.index.droplevel(0).tz_localize("UTC")
    zipline_result.positions = positions.astype(float)

    # Extract transactions
    transactions = results.loc["transactions"].unstack()
    transactions.index = transactions.index.droplevel(0).tz_localize("UTC")
    zipline_result.transactions = transactions.apply(pd.to_numeric, errors='ignore')

    # Extract benchmark returns
    benchmark_returns = results.loc["benchmark"].unstack()
    benchmark_returns.index = benchmark_returns.index.droplevel(0).tz_localize("UTC")
    zipline_result.benchmark_returns = benchmark_returns["benchmark"].astype(float)

    # Extract performance dataframe
    perf = results.loc["perf"].unstack()
    perf.index = perf.index.droplevel(0).tz_localize("UTC")
    zipline_result.perf = perf.apply(pd.to_numeric, errors='ignore')

    return zipline_result

def check_null_or_valid(row_data):
    """Function that takes a row of data,
    drops all missing values,
    and checks if all remaining values are greater than or equal to 0
    """
    no_na = row_data.dropna()[1:-1]
    numeric = pd.to_numeric(no_na)
    ge0 = numeric >= 0
    return ge0

# Check whether the first column is 'Life expectancy'

def eliminate_invalids(df, cols):
    """Eliminate invalid data in ``cols`` of ``df``."""
    numdf = df.drop(cols, axis=1).join(df[cols].apply(pd.to_numeric, errors='coerce'))
    numdf = numdf[~numdf[cols].isnull().apply(np.any, axis=1)]
    return numdf

def partial_convert_only_numerics(df):
    """Convert ``df`` numeric cols and try to coerce any errors encountered."""
    col_dict = df_cols_by_type(df)
    partial_convert = partial(pd.to_numeric, errors='coerce')
    df[col_dict['numeric']].apply(partial_convert)
    return df

# Useful one-liners.
# df.select_dtypes(include=['bool'])
# list(df.select_dtypes(include=['bool']).columns)

def condition_df(self):
    """
    Do any initial data conditioning that may be required.
    """

    logging.info('Ensure that columns that are supposed to be numeric are numeric')
    self.df[SET_GHI] = pd.to_numeric(self.df[SET_GHI], errors='coerce')
    self.df[SET_WINDVEL] = pd.to_numeric(self.df[SET_WINDVEL], errors='coerce')
    self.df[SET_NIGHT_LIGHTS] = pd.to_numeric(self.df[SET_NIGHT_LIGHTS], errors='coerce')
    self.df[SET_ELEVATION] = pd.to_numeric(self.df[SET_ELEVATION], errors='coerce')
    self.df[SET_SLOPE] = pd.to_numeric(self.df[SET_SLOPE], errors='coerce')
    self.df[SET_LAND_COVER] = pd.to_numeric(self.df[SET_LAND_COVER], errors='coerce')
    self.df[SET_GRID_DIST_CURRENT] = pd.to_numeric(self.df[SET_GRID_DIST_CURRENT], errors='coerce')
    self.df[SET_GRID_DIST_PLANNED] = pd.to_numeric(self.df[SET_GRID_DIST_PLANNED], errors='coerce')
    self.df[SET_SUBSTATION_DIST] = pd.to_numeric(self.df[SET_SUBSTATION_DIST], errors='coerce')
    self.df[SET_ROAD_DIST] = pd.to_numeric(self.df[SET_ROAD_DIST], errors='coerce')
    self.df[SET_HYDRO_DIST] = pd.to_numeric(self.df[SET_HYDRO_DIST], errors='coerce')
    self.df[SET_HYDRO] = pd.to_numeric(self.df[SET_HYDRO], errors='coerce')
    self.df[SET_SOLAR_RESTRICTION] = pd.to_numeric(self.df[SET_SOLAR_RESTRICTION], errors='coerce')

    logging.info('Replace null values with zero')
    self.df.fillna(0, inplace=True)

    logging.info('Sort by country, Y and X')
    self.df.sort_values(by=[SET_COUNTRY, SET_Y, SET_X], inplace=True)

    logging.info('Add columns with location in degrees')
    project = Proj('+proj=merc +lon_0=0 +k=1 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')

    def get_x(row):
        x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
        return x

    def get_y(row):
        x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
        return y

    self.df[SET_X_DEG] = self.df.apply(get_x, axis=1)
    self.df[SET_Y_DEG] = self.df.apply(get_y, axis=1)

def saveSlice_CSV(self, outputFilename=outputFilename, xSlice=[], ySlice=[], zSlice=[]):
    """ Take a slice and save it to csv """
    outputFilename += '_slice.csv'

    # # This defines how 'narrow' slice we want. Why am I writing this if ParaView will do it fark
    # tol = 1e-2
    #
    # # Pre allocate empty DF here?
    # slicedData = pd.DataFrame()
    #
    # if not xSlice:
    #     # We have some slices along x to make
    #     for point in xSlice:
    #         # we want to slice at all of these points
    #         > xSlice[point] - tol
    #         self.flowData.transpose().loc[(self.flowData.transpose()["x"] > 0.599 & self.flowData.transpose()["x"] < 0.601 & self.flowData.transpose()["z"] == 0), "cf"]
    # elif not ySlice:
    #     # Slices along y to take
    # elif not zSlice:
    #     # And slices along z

    flowData = self.flowData.apply(pd.to_numeric, errors='ignore')

    slicedData_indices = (flowData["z"] > -0.01) & (flowData["z"] < 0.01)
    slicedData = flowData.loc[slicedData_indices]

    slicedData.to_csv(outputFilename, sep=',', index=0, index_label=0)

    print "Slices saved in", outputFilename

def saveSlice_CSV(self, outputFilename=outputFilename, xSlice=[], ySlice=[], zSlice=[]):
    """ Take a slice and save it to csv """
    outputFilename += '_slice.csv'

    # # This defines how 'narrow' slice we want. Why am I writing this if ParaView will do it fark
    # tol = 1e-2
    #
    # # Pre allocate empty DF here?
    # slicedData = pd.DataFrame()
    #
    # if not xSlice:
    #     # We have some slices along x to make
    #     for point in xSlice:
    #         # we want to slice at all of these points
    #         > xSlice[point] - tol
    #         self.flowData.transpose().loc[(self.flowData.transpose()["x"] > 0.599 & self.flowData.transpose()["x"] < 0.601 & self.flowData.transpose()["z"] == 0), "cf"]
    # elif not ySlice:
    #     # Slices along y to take
    # elif not zSlice:
    #     # And slices along z

    flowData = self.flowData.apply(pd.to_numeric, errors='ignore')

    slicedData_indices = (flowData["y"] > 0.598) & (flowData["y"] < 0.602) & (flowData["z"] == 0)
    slicedData = flowData.loc[slicedData_indices]

    slicedData.to_csv(outputFilename, sep=',', index=0, index_label=0)

    print "Slices saved in", outputFilename

def maybe_to_numeric(series):
    try:
        return pd.to_numeric(series)
    except ValueError:
        return series

def createPriceHistoryReport(self, stock):
    """
    Calls get10YrPriceHistory() to package a price history report into a PANDAS dataframe,
    then cleans and returns the data.

    This function will acquire a price history for the provided symbol, which must be a string
    and a valid stock symbol along with the symbol's exchange, e.g., ('MMM', 'NYSE').
    The get10YrPriceHistory() function requires the exchange.

    After the data is loaded, the function adds a Symbol field to the price history for tracking
    in the database, reindexes and renames some fields, properly formats the dates into datetime
    fields, and converts prices from strings to floats.

    Returns the report as a PANDAS dataframe if successful, otherwise a tuple (False, error message).

    Example Usage: createPriceHistoryReport(('MMM', 'NYSE'))
    """
    try:
        # get the raw data from morningstar
        price_history = self.get10YrPriceHistory(stock)

        if isinstance(price_history, pd.DataFrame):
            # the price_history has to exist, or else return the err msg of the function called
            price_history['Symbol'] = stock[0]
            # reorganize header order
            price_history = price_history.reindex(columns=['Symbol', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume'])
            # rename the Date column for easier processing through SQLite's Date functionality
            price_history.rename(columns={'Date': 'Reference'}, inplace=True)
            # convert all dates to ISO formatted yyyy-mm-dd strings
            price_history['Reference'] = price_history['Reference'].apply(
                lambda x: time.strftime("%Y-%m-%d", time.strptime(x, "%m/%d/%Y")))
            # convert volumes to integers
            # unicode err on ??? value for some volumes goes to NaN
            price_history['Volume'] = pd.to_numeric(price_history['Volume'].str.replace(',', ''), errors='coerce')
            # set index b/f db commit so no duplicate numeric index columns
            price_history.set_index(['Symbol'], inplace=True)

        return price_history

    except Exception as e:
        return (False, e)

# get10YrPriceHistory
# *******************
#

def load_groundtruth(self):
    gt_labels = pd.read_csv(self.data_path)
    if self.subset is not None:
        mask = [True if x in self.subset else False for x in gt_labels['id'].values]
        gt_labels = gt_labels[mask]
        assert np.any(np.array(mask))
    gt_labels['length'] = pd.to_numeric(gt_labels['length'])
    gt_labels['actions'].fillna('', inplace=True)
    self.gt_labels = gt_labels

def read_madx_tracking(file):
    """Read a MAD-X Tracking onetable=true file to a dataframe."""
    column_names = ['ID', 'TURN', 'X', 'PX', 'Y', 'PY', 'T', 'PT', 'S', 'E']
    data = pd.read_csv(file, skiprows=MADX_TRACKING_SKIP_ROWS, delim_whitespace=True, names=column_names)
    return data.apply(pd.to_numeric, errors="ignore").dropna()

def to_dataframe(self, cast_numeric=False):
    df = self._dataframe.copy()
    if cast_numeric:
        df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    return df