The following code examples, extracted from open-source Python projects, illustrate how to use pandas.read_table().
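Before the project examples, here is a minimal sketch of the basic call. pandas.read_table() reads a delimited text file into a DataFrame and defaults to a tab separator, so it behaves like read_csv() with sep='\t'. The sample data and column names below are made up purely for illustration.

import io
import pandas as pd

# A small tab-separated sample (hypothetical data, for illustration only).
tsv = io.StringIO("name\tscore\nalice\t1\nbob\t2\n")

# read_table() defaults to sep='\t', so this is equivalent to
# pd.read_csv(tsv, sep='\t').
df = pd.read_table(tsv)
print(df)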
def get_citation_df(args, text):
    """
    Generate citation_df and save it to 'citations.tsv'.
    """
    citation_df = pandas.DataFrame(
        {'string': get_citation_strings(text)}
    )
    if args.citation_tags_path.is_file():
        tag_df = pandas.read_table(args.citation_tags_path)
        tag_df['string'] = '@tag:' + tag_df.tag
        for citation in tag_df.citation:
            is_valid_citation_string('@' + citation)
        citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
    else:
        citation_df['citation'] = None
        logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
    citation_df.citation.fillna(citation_df.string.astype(str).str.lstrip('@'), inplace=True)
    citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
    citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
    citation_df = citation_df.sort_values(['standard_citation', 'citation'])
    citation_df.to_csv(args.citations_path, sep='\t', index=False)
    check_collisions(citation_df)
    check_multiple_citation_strings(citation_df)
    return citation_df
def gerber_green_imai():
    """
    This is the dataset from Imai (2005) used to replicate and evaluate
    the field experiment done by Gerber and Green (2000).

    Notes
    -----
    .. Gerber, Alan S. and Donald P. Green. 2000. "The effects of canvassing,
       telephone calls, and direct mail on voter turnout: a field experiment."
       American Political Science Review 94: 653-663.
    .. Gerber, Alan S. and Donald P. Green. 2005. "Correction to Gerber and
       Green (2000), replication of disputed findings, and reply to Imai
       (2005)." American Political Science Review 99: 301-313.
    .. Imai, Kosuke. 2005. "Do get-out-the-vote calls reduce turnout? The
       importance of statistical methods for field experiments." American
       Political Science Review 99: 283-300.
    """
    fin = _os.path.join(data_dir, 'GerberGreenImai.txt')
    data = pd.read_table(fin, sep='\s+')
    data.index = range(data.shape[0])
    return data
def table_convert(fmt="csv"): """Convert the SC data into different formats. To make available for download. """ # others netcdf, fits? # https://pandas.pydata.org/pandas-docs/stable/io.html if fmt not in ['tsv', 'csv', 'hdf']: raise NotImplementedError("Conversion format to {} not available.".format(fmt)) name = "data/sweet-cat.{}".format(fmt) if fmt is "tsv": # This is the standard pass else: df = pd.read_table('data/sweet-cat.tsv') if fmt == "hdf": df.to_hdf(name, key="sweetcat", mode="w", format='table') elif fmt == "csv": df.to_csv(name, sep=",", index=False)
def get_info_map(info_link=INFO_LINK):
    """
    Return a :class:`DataFrame` containing the information provided at
    *info_link*, a link to a tab delineated text file containing
    information for each USArray MT site.
    """
    df = PD.read_table(info_link,
                       sep='\t',
                       skiprows=1,
                       names=['vnet', 'net', 'sta', 'location', 'lat', 'lon',
                              'elev', 'start', 'end', 'status', 'install', 'cert'],
                       parse_dates=[7, 8],
                       index_col=2)
    return df
def predict(self, ifile, efile, ofile):
    # Load
    columns = ['documents']
    data = pd.read_table(ifile, header=None, names=columns)
    documents = data['documents']

    # Deserialize
    estimator = pickle.load(open(efile, 'rb'))

    # Predict
    probability = estimator.predict_proba(documents)
    data['labels'] = estimator.predict(documents)
    data['C1_pr'] = probability[:, 0]
    data['C2_pr'] = probability[:, 1]

    # Save
    columns = ['labels', 'C1_pr', 'C2_pr', 'documents']
    data.to_csv(ofile, sep='\t', columns=columns, index=False)
def expected_result_find_overlaps():
    contents = u"""Chromosome  Start  End  Peak Region
0        chr1      3    5     0   gene
1        chr1      3    5     0    tss
2        chr1     12   14     1   gene
3        chr1    200  300     2   exon
4        chr1    200  300     2   exon
5        chr1    200  300     2   gene
6        chr1    200  300     2    tes
7        chr1    200  300     2    tss
8        chr1    240  297     3   exon
9        chr1    240  297     3   gene
10       chr1    240  297     3    tes"""

    return pd.read_table(StringIO(contents), header=0, sep="\s+")
def expected_result():
    contents = """Sample  OtherGroup
0   GENE1_KO_ChIP_1    GENE2_KO
1   GENE1_KO_ChIP_1          WT
2   GENE1_KO_ChIP_2    GENE2_KO
3   GENE1_KO_ChIP_2          WT
4   GENE1_KO_ChIP_3    GENE2_KO
5   GENE1_KO_ChIP_3          WT
6   GENE2_KO_ChIP_1    GENE1_KO
7   GENE2_KO_ChIP_1          WT
8   GENE2_KO_ChIP_2    GENE1_KO
9   GENE2_KO_ChIP_2          WT
10  GENE2_KO_ChIP_3    GENE1_KO
11  GENE2_KO_ChIP_3          WT
12        WT_ChIP_1    GENE1_KO
13        WT_ChIP_1    GENE2_KO
14        WT_ChIP_2    GENE1_KO
15        WT_ChIP_2    GENE2_KO
16        WT_ChIP_3    GENE1_KO
17        WT_ChIP_3    GENE2_KO"""

    return pd.read_table(StringIO(contents), sep="\s+", index_col=0)
def main():
    uri, outfile, dataset = get_arguments()
    fd = tempfile.NamedTemporaryFile()
    progress = ProgressBar(widgets=[Percentage(), ' ', Bar(), ' ', ETA(),
                                    ' ', FileTransferSpeed()])

    def update(count, blockSize, totalSize):
        if progress.maxval is None:
            progress.maxval = totalSize
            progress.start()
        progress.update(min(count * blockSize, totalSize))

    urllib.urlretrieve(uri, fd.name, reporthook=update)

    if dataset == 'zinc12':
        df = pandas.read_csv(fd.name, delimiter='\t')
        df = df.rename(columns={'SMILES': 'structure'})
        df.to_hdf(outfile, 'table', format='table', data_columns=True)
    elif dataset == 'chembl22':
        df = pandas.read_table(fd.name, compression='gzip')
        df = df.rename(columns={'canonical_smiles': 'structure'})
        df.to_hdf(outfile, 'table', format='table', data_columns=True)
        pass
    else:
        df = pandas.read_csv(fd.name, delimiter='\t')
        df.to_hdf(outfile, 'table', format='table', data_columns=True)
def add_node_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
    """
    inFile - pedigree as .txt file
    pedgraph - Pedigree as a networkX graph object
    animal - column for the animal ID
    atCol - column for the attribute
    atName - name for the attribute
    """
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    #print ped_df
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    #print dic_ped
    correct_dic_ped = {str(k): int(v) for k, v in dic_ped.items()}
    #print correct_dic_ped
    for node, value in dic_ped.items():
        pedgraph.node[str(node)]["EBV"] = value

    return correct_dic_ped
def add_ebv_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
    """
    inFile - pedigree as .txt file
    pedgraph - Pedigree as a networkX graph object
    animal - column for the animal ID
    atCol - column for the attribute
    atName - name for the attribute
    """
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    #print ped_df
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    #print dic_ped
    correct_dic_ped = {str(k): int(-v) for k, v in dic_ped.items()}
    #print correct_dic_ped
    for node, value in dic_ped.items():
        pedgraph.node[str(node)]["EBV"] = value

    return correct_dic_ped
def _load_knownGene(filename):
    """ Load UCSC knownGene table.

    Parameters
    ----------
    filename : str
        path to knownGene file

    Returns
    -------
    df : pandas.DataFrame
        knownGene table if loading was successful, else None
    """
    if filename is None:
        return None

    try:
        df = pd.read_table(
            filename,
            names=['name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart',
                   'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'proteinID',
                   'alignID'],
            index_col=0)
        df['chrom'] = df['chrom'].str[3:]
        return df
    except Exception as err:
        print(err)
        return None
def _load_kgXref(filename):
    """ Load UCSC kgXref table.

    Parameters
    ----------
    filename : str
        path to kgXref file

    Returns
    -------
    df : pandas.DataFrame
        kgXref table if loading was successful, else None
    """
    if filename is None:
        return None

    try:
        df = pd.read_table(
            filename,
            names=['kgID', 'mRNA', 'spID', 'spDisplayID', 'geneSymbol',
                   'refseq', 'protAcc', 'description', 'rfamAcc', 'tRnaName'],
            index_col=0,
            dtype=object)
        return df
    except Exception as err:
        print(err)
        return None
def on_clipboard(self, button):
    kwargs = dict()
    text = self.clipboard.wait_for_text()
    lines = text[:10000].split('\n')[:-1][:10]
    counts = set([x.lstrip().count('\t') for x in lines])
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        kwargs['sep'] = '\t'
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = '\s+'
    try:
        self.data = pd.read_table(StringIO(text), **kwargs)
    except:
        print("Unexpected Error: ", sys.exc_info())
    else:
        self.verticalbox.remove(self.scrollable_treelist)
        self.add_treeview()
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    logging.debug("reading data from %s", filename)
    data = pandas.read_table(filename,
                             usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(numpy.float32),
                        (data['artist'].cat.codes.copy(),
                         data['user'].cat.codes.copy())))

    logging.debug("read data file in %s", time.time() - start)
    return data, plays
def main():
    # read and preprocess the movie data
    movie = pd.read_table('movies.dat', sep='::',
                          names=['movie_id', 'movie_name', 'tag'], engine='python')
    movie = movie_preprocessing(movie)

    # read the ratings data and merge it with movie data
    rating = pd.read_table("ratings.dat", sep="::",
                           names=["user_id", "movie_id", "rating", "timestamp"],
                           engine='python')
    data = pd.merge(rating, movie, on="movie_id")

    # extract feature from our data set
    streaming_batch, user_feature, actions, reward_list = feature_extraction(data)
    streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
    user_feature.to_csv("user_feature.csv", sep='\t')
    pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False)
    reward_list.to_csv("reward_list.csv", sep='\t', index=False)

    action_context = movie[movie['movie_id'].isin(actions)]
    action_context.to_csv("action_context.csv", sep='\t', index=False)
def get_dataframe_list(args, data_fields=('gene', 'raw_counts')):
    # get a list of dataframes
    dfs, files = [], args['files'] or []

    # create an index using the filenames
    # this will prevent having an overlong command line for 100's or 1000's of files
    if args['file_index']:
        with open(args['file_index']) as fp:
            files.extend(fp.readlines())
    files = sorted(filter(None, set([f.strip() for f in files])))

    # now iterate over the files and get the looooong list of dataframes
    for f in files:
        # Get only specific columns with usecols
        df = pd.read_table(f, usecols=data_fields)
        dfs.append(df)

    return dfs, files  # a list of dataframes and the files index
def _load_table(self, filepath):
    """
    Load table from file system.

    :param str filepath: Path to table in CSV, TSV, XLSX or Pandas pickle format.
    :return: Pandas table
    :rtype: pandas.core.frame.DataFrame
    """
    _, ext = os.path.splitext(filepath.lower())
    if ext == '.tsv':
        return pd.read_table(filepath, **self.kwargs)
    if ext == '.csv':
        return pd.read_csv(filepath, **self.kwargs)
    if ext == '.xlsx':
        return pd.read_excel(filepath, **self.kwargs)
    return pd.read_pickle(filepath, **self.kwargs)
def years(self):
    df_list = []
    k = [str(i) for i in range(1, 13)]
    print(k)
    j = [i for i in range(1, 13)]
    result = []
    for i in range(1, 13):
        filename = '2016-%s.xls' % str(i).zfill(2)
        #print filename
        t = pd.read_table(filename, encoding='gbk', dtype={u'????': np.str})
        fee = t[u'???'].sum() + t[u'???'].sum() + t[u'????'].sum()
        print(i, " fee: ")
        print(fee)
        df_list.append(t)
        result.append(fee)

    df = pd.concat(df_list, keys=k)
    #print df
    #df.to_excel('2016_delivery_order.xls')
    self.caculation(df)
    plt.plot(j, result)
    plt.show()
def __loadPar(self, parname):
    """
    Frealign files normally have 16 columns, with any number of comment
    lines that start with 'C'
    """
    # Ergh, cannot have trailing comments with np.loadtxt?
    self.parCol = [b"N", b"PSI", b"THETA", b"PHI", b"SHX", b"SHY", b"MAG",
                   b"FILM", b"DF1", b"DF2", b"ANGAST", b"OCC", b"LogP",
                   b"SIGMA", b"SCORE", b"CHANGE"]
    self.par = pandas.read_table(parname, engine='c', sep=' ', header=None,
                                 names=self.parCol, quotechar='C')
    #self.par.append( np.loadtxt( parname, comments=b'C' ) )
    # TODO: split into a dictionary?
    # TODO: read comments as well
    # TODO: use pandas instead?
    #self.parCol = {b"N":0, b"PSI":1, b"THETA":2, b"PHI":3, b"SHX":4, b"SHY":5, b"MAG":6, b"FILM":7, b"DF1":8, b"DF2":9,
    #               b"ANGAST":10, b"OCC":11, b"LogP":12, b"SIGMA":13, b"SCORE":14, b"CHANGE":15 }
    #self.parComments = np.loadtxt( parname, comments=b' ' )
def read_cufflinks(sample_path, isoforms=False):
    ''' Function for reading a Cufflinks quantification result.

    Returns
    -------
    A pandas.Series with the expression values in the sample.
    '''
    if isoforms:
        quant_file = sample_path + '/isoforms.fpkm_tracking'
    else:
        quant_file = sample_path + '/genes.fpkm_tracking'
    df = pd.read_table(quant_file, engine='c',
                       usecols=['tracking_id', 'FPKM'],
                       index_col=0,
                       dtype={'tracking_id': np.str, 'FPKM': np.float64})

    df['tracking_id'] = df.index
    df = df.groupby('tracking_id').sum()
    df['TPM'] = df['FPKM'] / df['FPKM'].sum() * 1e6
    df = df.rename(columns={'tracking_id': 'target_id'})

    return df['TPM']
def getFeaturesForGenome(genomeId, CDS_ONLY):
    """
    This method gets the features for a particular genomeId from PATRIC

    Parameters
    genomeId: UniqueId for the genome
    CDS_ONLY: retrieve only CDS features
    """
    data_table = pd.read_table(PatricURL + genomeId + '/' + genomeId + '.PATRIC.features.tab')

    print(data_table.shape)

    if CDS_ONLY:
        return data_table[(data_table.feature_type == 'CDS')]
    else:
        return data_table
def input_data():
    contents = u"""Chromosome      Bin      End  examples/test.bed
chr1   887600   887799  0
chr1   994600   994799  0
chr1  1041000  1041199  0
chr1  1325200  1325399  1
chr1  1541600  1541799  1
chr1  1599000  1599199  1
chr1  1770200  1770399  0
chr1  1820200  1820399  1
chr1  1995000  1995199  0
chr1  2063800  2063999  0
chr1  2129400  2129599  0
chr1  2239000  2239199  0
chr1  2318800  2318999  0
chr1  2448200  2448399  1
chr1  3006000  3006199  0
chr1  3046000  3046199  1
chr1  3089200  3089399  0
chr1  3093800  3093999  0
chr1  3096400  3096599  0"""

    return pd.read_table(StringIO(contents), sep="\s+", index_col=[0, 1, 2])
def expected_result():
    c = u"""Bin  Chromosome  ooo
887600   chr1  1
994600   chr1  1
1041000  chr1  1
1770200  chr1  1
1770400  chr1  1
1995000  chr1  1
2063800  chr1  1
2064000  chr1  1
2129200  chr1  1
2239000  chr1  1
2318800  chr1  1
3006000  chr1  1"""

    return pd.read_table(StringIO(c), sep="\s+", index_col=[1, 0])
def expected_result(input_bed_file):
    df = pd.read_table(
        StringIO(u"""Count  Chromosome        Bin
2        chr1   39036800
1        chr1   73781000
1        chr1   90059800
1        chr3   55648200
1        chr7   20246600
1        chr7   91135000
1       chr13  100938400
1       chr19   43528800
1       chr19   47108800"""),
        sep="\s+",
        dtype={"Count": int32, "Bin": int32})

    df.columns = [input_bed_file, "Chromosome", "Bin"]

    return df
def read_dfs(files):
    full_path = False
    if not len(files) == len(set([basename(f) for f in files])):
        logging.info("Matrix-files do not have a unique basename. Using full path in header!")
        full_path = True

    dfs = OrderedDict()
    for f in files:
        df = pd.read_table(f, header=0, sep=" ", index_col=[0, 1])
        df = df[~df.index.duplicated(keep='first')]
        columns = list(df.columns)
        file_nick = "Enriched_" + basename(f) if not full_path else "Enriched_" + f
        columns[0] = file_nick
        df.columns = columns
        logging.info("Calling " + f + " " + file_nick + " in matrix file.")
        dfs[f] = df

    return dfs
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    lc = []
    for split, filename in zip(['train', 'val'], [opts.train_lc, opts.val_lc]):
        _lc = pd.read_table(filename)
        _lc['split'] = split
        _lc['epoch'] = range(1, len(_lc) + 1)
        lc.append(_lc)
    lc = pd.concat(lc)
    plot = plot_lc(lc, metrics=opts.metrics, outputs=opts.outputs)
    plot.savefig(opts.out_file)

    log.info('Done!')
    return 0
def create_routing_table(bgp=None, ixp_prefixes=None, ixp_asns=None, bgp_compression='infer'):
    log.info('Creating IP2AS tool.')
    if bgp_compression == 'infer' and bgp.startswith('http'):
        bgp_compression = infer_compression(bgp, 'infer')
    if not isinstance(ixp_prefixes, pd.DataFrame):
        ixp_prefixes = set(pd.read_csv(ixp_prefixes, comment='#', index_col=0).index.unique()) if ixp_prefixes is not None else set()
    if not isinstance(ixp_asns, pd.DataFrame):
        ixp_asns = set(pd.read_csv(ixp_asns, comment='#', index_col=0).index.unique()) if ixp_asns is not None else set()
    if not isinstance(bgp, pd.DataFrame):
        bgp_original = pd.read_table(bgp, comment='#', names=['Address', 'Prefixlen', 'ASN'],
                                     compression=bgp_compression)
        bgp = bgp_original[~bgp_original.ASN.str.contains(',|_')].copy()
        bgp['ASN'] = pd.to_numeric(bgp.ASN)
    rt = RoutingTable()
    for address, prefixlen, asn in bgp[~bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_prefix(asn.item(), address, prefixlen)
    for address, prefixlen, asn in bgp[bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_ixp(address, prefixlen)
    for prefix in ixp_prefixes:
        rt.add_ixp(prefix)
    rt.add_private()
    rt.add_multicast()
    rt.add_default()
    return rt
def _mag_hires_helper(year, doy, local_dir, url, coords):
    fname = str(year)[2:] + doy + '_FGM_' + coords

    hdf_fname = '{}_{}.hdf'.format(year, doy)
    hdfloc = os.path.join(local_dir, hdf_fname)
    if os.path.isfile(hdfloc):
        return pd.read_hdf(hdfloc)

    f = helper.load(fname + '.TAB', local_dir, url)
    if 'error_message' in f.readline():
        f.close()
        os.remove(os.path.join(local_dir, fname + '.TAB'))
        raise RuntimeError(
            'No file named {} exists on remote server'.format(fname))

    df = pd.read_table(f, names=['Time', 'Bx', 'By', 'Bz'],
                       delim_whitespace=True,
                       parse_dates=[0], index_col=0)

    if use_hdf:
        df.to_hdf(hdfloc, key='data', mode='w')
    return df
def to_dataframe(lines, **kwargs):
    names = lines.readline().decode('utf-8').strip().split('\t')
    types = lines.readline().decode('utf-8').strip().split('\t')
    dtypes, parse_dates, converters = {}, [], {}
    for name, chtype in zip(names, types):
        dtype = CH2PD[chtype]
        if dtype == 'object':
            converters[name] = decode_escapes
        elif dtype.startswith('datetime'):
            parse_dates.append(name)
        else:
            dtypes[name] = dtype
    return pd.read_table(lines, header=None, names=names, dtype=dtypes,
                         parse_dates=parse_dates, converters=converters,
                         na_values=set(), keep_default_na=False, **kwargs)
def file_processor(data_file):
    print('Reading bitcoin market data file here: {}.'.format(data_file))
    d = pd.read_table(data_file, sep=',', header=None, index_col=0,
                      names=['price', 'volume'])
    d.index = d.index.map(lambda ts: datetime.datetime.fromtimestamp(int(ts)))
    d.index.names = ['DateTime_UTC']
    p = pd.DataFrame(d['price'].resample('5Min').ohlc())
    p.columns = ['price_open', 'price_high', 'price_low', 'price_close']
    v = pd.DataFrame(d['volume'].resample('5Min').sum())
    v.columns = ['volume']
    p['volume'] = v['volume']

    # drop NaN values.
    # for example sometimes we don't have data for like one hour in a row.
    # So we have NaN buckets of 5Min in this particular hour.
    # Our convention is to avoid those NaN values and drop them!
    p = p.dropna()
    p.to_csv('/tmp/bitcoin_coinbase_M5.csv', sep='\t')
    return p
def _metadata(self) -> pd.DataFrame:
    """
    Read the meta.txt file in the data set base directory containing general
    data set metadata.

    The meta.txt file is read only once and cached.

    Returns
    -------
    pandas.DataFrame
        The metadata contained in the meta.txt file as a pandas DataFrame

    Raises
    ------
    IOError
        If the data set cannot be parsed
    """
    if not self.can_parse():
        raise IOError("unable to parse DCASE dataset at {}".format(self._basedir))
    if self._metadata_cache is None:
        self._metadata_cache = pd.read_table(str(self._basedir / "meta.txt"), header=None)
    # noinspection PyTypeChecker
    return self._metadata_cache
def test_1000_sep(self):
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334, 13],
        'C': [5, 10.]
    })

    df = self.read_csv(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)
def test_custom_na_values(self):
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = [[1., nan, 3],
                [nan, 5, nan],
                [7, 8, nan]]

    df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
    tm.assert_almost_equal(df.values, expected)

    df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
                          skiprows=[1])
    tm.assert_almost_equal(df2.values, expected)

    df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
                          skiprows=[1])
    tm.assert_almost_equal(df3.values, expected)
def test_duplicate_columns(self):
    for engine in ['python', 'c']:
        data = """A,A,B,B,B
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        # check default behaviour
        df = self.read_table(StringIO(data), sep=',', engine=engine)
        self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])

        df = self.read_table(StringIO(data), sep=',', engine=engine,
                             mangle_dupe_cols=False)
        self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])

        df = self.read_table(StringIO(data), sep=',', engine=engine,
                             mangle_dupe_cols=True)
        self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
def test_no_header(self):
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    df = self.read_table(StringIO(data), sep=',', header=None)
    df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                              header=None)

    names = ['foo', 'bar', 'baz', 'quux', 'panda']
    df2 = self.read_table(StringIO(data), sep=',', names=names)
    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    tm.assert_almost_equal(df.values, expected)
    tm.assert_almost_equal(df.values, df2.values)

    self.assert_numpy_array_equal(df_pref.columns,
                                  ['X0', 'X1', 'X2', 'X3', 'X4'])
    self.assert_numpy_array_equal(df.columns, lrange(5))
    self.assert_numpy_array_equal(df2.columns, names)
def test_file(self):
    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")

    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, 'salary.table')
    local_table = self.read_table(localtable)

    try:
        url_table = self.read_table('file://localhost/' + localtable)
    except URLError:
        # fails on some systems
        raise nose.SkipTest("failing on %s" %
                            ' '.join(platform.uname()).strip())

    tm.assert_frame_equal(url_table, local_table)
def test_1000_sep_with_decimal(self):
    data = """A|B|C
1|2,334.01|5
10|13|10.
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    df = self.read_csv(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)
def test_trailing_spaces(self):
    data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n"
    expected = pd.DataFrame([[1., 2., 4.],
                             [5.1, np.nan, 10.]])
    # this should ignore six lines including lines with trailing
    # whitespace and blank lines.  issues 8661, 8679
    df = self.read_csv(StringIO(data.replace(',', ' ')),
                       header=None, delim_whitespace=True,
                       skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
    tm.assert_frame_equal(df, expected)
    df = self.read_table(StringIO(data.replace(',', ' ')),
                         header=None, delim_whitespace=True,
                         skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
    tm.assert_frame_equal(df, expected)

    # test skipping set of rows after a row with trailing spaces, issue
    # #8983
    expected = pd.DataFrame({"A": [1., 5.1], "B": [2., np.nan],
                             "C": [4., 10]})
    df = self.read_table(StringIO(data.replace(',', ' ')),
                         delim_whitespace=True, skiprows=[1, 2, 3, 5, 6],
                         skip_blank_lines=True)
    tm.assert_frame_equal(df, expected)
def test_fallback_to_python(self):
    # GH 6607
    data = 'a b c\n1 2 3'

    # specify C-unsupported options with python-unsupported option
    # (options will be ignored on fallback, raise)
    with tm.assertRaisesRegexp(ValueError, 'Falling back'):
        pd.read_table(StringIO(data), sep=None,
                      delim_whitespace=False, dtype={'a': float})
    with tm.assertRaisesRegexp(ValueError, 'Falling back'):
        pd.read_table(StringIO(data), sep='\s', dtype={'a': float})
    with tm.assertRaisesRegexp(ValueError, 'Falling back'):
        pd.read_table(StringIO(data), skip_footer=1, dtype={'a': float})

    # specify C-unsupported options without python-unsupported options
    with tm.assert_produces_warning(parsers.ParserWarning):
        pd.read_table(StringIO(data), sep=None, delim_whitespace=False)
    with tm.assert_produces_warning(parsers.ParserWarning):
        pd.read_table(StringIO(data), sep='\s')
    with tm.assert_produces_warning(parsers.ParserWarning):
        pd.read_table(StringIO(data), skip_footer=1)
def download_bacterial_genomes(outfile='outfile.txt'):
    assembly_summary_file = r'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
    if os.path.exists('assembly_summary.txt'):
        os.remove('assembly_summary.txt')
    #Download the file using wget system call
    subprocess.call("wget " + assembly_summary_file, shell=True)
    #Reformat the file to pandas-friendly format
    subprocess.call("sed -i '1d' assembly_summary.txt", shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary.txt", shell=True)
    #Read the file as a dataframe - using read_table
    #Use read_table if the column separator is tab
    assembly_sum = pd.read_table('assembly_summary.txt')
    #filter the dataframe and save the URLs of the complete genomes in a new file
    my_df = assembly_sum[(assembly_sum['version_status'] == 'latest') &
                         (assembly_sum['assembly_level'] == 'Complete Genome')]
    my_df = my_df[['ftp_path', 'assembly_accession', 'asm_name']]
    #output_file.write
    my_df.to_csv(outfile, mode='w', index=False, header=None)
    process_url_file(outfile)
    return

#function to download reference genomes
#this function downloads latest version human reference genome by default
def download_refseq_genome(taxid=9606, outfile='refseq_genome.txt'):
    assembly_summary_file = "ftp://ftp.ncbi.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
    if os.path.exists('assembly_summary_refseq.txt'):
        os.remove('assembly_summary_refseq.txt')
    #Download the file using wget system call
    subprocess.call("wget " + assembly_summary_file, shell=True)
    #Reformat the file to pandas-friendly format
    subprocess.call("sed -i '1d' assembly_summary_refseq.txt", shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary_refseq.txt", shell=True)
    #Read the file as a dataframe - using read_table
    #Use read_table if the column separator is tab
    assembly_sum = pd.read_table('assembly_summary_refseq.txt')
    my_df = assembly_sum[(assembly_sum['taxid'] == taxid) &
                         ((assembly_sum['refseq_category'] == 'reference genome') |
                          (assembly_sum['refseq_category'] == 'representative genome'))]
    my_df = my_df[['ftp_path', 'assembly_accession', 'asm_name']]
    #Process the newly created file and download genomes from NCBI website
    my_df.to_csv(outfile, mode='w', index=False, header=None)
    process_url_file(outfile)
    return

#format genbank files to generate kraken-friendly formatted fasta files
def inputOriginalfile():
    """Asks user to input the name of the original websearch input file and
    reads it into a dataframe

    Returns:
        dataframe, str: original websearch file as dataframe, original
        websearch filename as string
    """
    originalFile = input("Name of original masses/retention times file:-\n\n ")
    originalDF = pd.read_table(originalFile, sep=',')

    # rename MZ and Time from the lipidFinder output
    originalDF.rename(columns={'MZ': 'ORIGINAL_MASS'}, inplace=True)
    originalDF.rename(columns={'Time': 'RETENTION_TIME'}, inplace=True)

    return originalDF, originalFile
def categoryRename(mergeDF):
    """Lipid categories are renamed to the standard lipid category names as per
    LIPIDMAPS. The categories_map.csv is used to 'map' old category names to new
    category names

    Args:
        mergeDF (dataframe): input dataframe

    Returns:
        dataframe: output dataframe
    """
    categoryFileDF = pd.read_table(
        "categories_map.csv", sep=',', keep_default_na=False)

    # new way: make the categories df into a dictionary - much faster!!
    catMap = dict(list(zip(categoryFileDF.old_category, categoryFileDF.new_category)))
    mergeDF['CATEGORY'] = mergeDF['CATEGORY'].map(catMap)

    return mergeDF
def parse_star(starfile, keep_index=True):
    headers = []
    foundheader = False
    ln = 0
    with open(starfile, 'rU') as f:
        for l in f:
            if l.startswith("_rln"):
                foundheader = True
                lastheader = True
                if keep_index:
                    head = l.rstrip()
                else:
                    head = l.split('#')[0].rstrip().lstrip('_')
                headers.append(head)
            else:
                lastheader = False
            if foundheader and not lastheader:
                break
            ln += 1
    star = pd.read_table(starfile, skiprows=ln, delimiter='\s+', header=None)
    star.columns = headers
    return star
def parseIonoFile(in_file, compression='infer'):
    iono_columns = (
        "day", "year", "rec_latitude", "rec_longitude", "los_tec",
        "los_tec_err", "vertical_tec", "azimuth", "elevation",
        "mapping_function", "pp_latitude", "pp_longitude", "satellite",
        "site", "recBias", "recBiasErr"
    )

    data = pd.read_table(in_file, header=None, sep='\s+', names=iono_columns,
                         compression=compression)

    data['time'] = pd.to_datetime(data.loc[:, 'year'].apply(str) + '-01-01') \
        + pd.to_timedelta(data.iloc[:, 0], unit='day')

    data.set_index('time', inplace=True)
    data.sort_index(inplace=True)

    return data
def convert_tables(self):
    """
    Based on the confidence score, convert xmap file and two corresponding
    cmap files into "pandas table".
    """
    pd.set_option('display.width', 200)
    with open('%s.table' % self.name, 'a') as xmap_table:
        with open(self.xmap) as xmap:
            for line in xmap:
                if line.startswith('#h'):
                    header = line[3:]
                    xmap_table.write(header)
                if line[0] != '#':
                    xmap_table.write(line)
    with open('%s.rtable' % self.name, 'a') as rcmap_table:
        with open(self.rcmap) as rcmap:
            for line in rcmap:
                if line.startswith('#h'):
                    header = line[3:]
                    rcmap_table.write(header)
                if line[0] != '#':
                    rcmap_table.write(line)
    with open('%s.qtable' % self.name, 'a') as qcmap_table:
        with open(self.qcmap) as qcmap:
            for line in qcmap:
                if line.startswith('#h'):
                    header = line[3:]
                    qcmap_table.write(header)
                if line[0] != '#':
                    qcmap_table.write(line)
    self.XmapTable = pd.read_table('%s.table' % self.name)
    headers_x = ['RefContigID', 'RefStartPos', 'RefEndPos', 'QryContigID',
                 'QryStartPos', 'QryEndPos', 'Orientation', 'Confidence',
                 'QryLen', 'RefLen', 'Alignment']
    self.filtered_XmapTable = self.XmapTable[
        self.XmapTable['Confidence'] >= self.confidence_score][headers_x].reset_index(drop=True)
    headers_r = ['CMapId', 'ContigLength', 'NumSites', 'SiteID', 'Position']
    self.RcmapTable = pd.read_table('%s.rtable' % self.name)[headers_r]
    headers_q = ['CMapId', 'ContigLength', 'NumSites', 'SiteID', 'Position', 'Coverage']
    self.QcmapTable = pd.read_table('%s.qtable' % self.name)[headers_q]
    os.remove('%s.table' % self.name)
    os.remove('%s.rtable' % self.name)
    os.remove('%s.qtable' % self.name)
def dehejia_wahba():
    """
    Data from Dehejia and Wahba (1999, 2002) used to replicate and evaluate
    the matching results of Lalonde (1986).

    .. Dehejia, Rajeev and Sadek Wahba. 1999. "Causal effects in
       non-experimental studies: Reevaluating the evaluation of training
       programs." Journal of the American Statistical Association 94 (448):
       1053-1062.
    .. Dehejia, Rajeev and Sadek Wahba. 2002. "Propensity score matching
       methods for non-experimental causal studies." Review of Economics and
       Statistics 84: 151-161.
    .. LaLonde, Robert. 1986. "Evaluating the econometric evaluations of
       training programs with experimental data." American Economic Review
       76 (4): 604-620.
    """
    names = ['Treated', 'Age', 'Education', 'Black', 'Hispanic', 'Married',
             'Nodegree', 'RE74', 'RE75', 'RE78']
    fin_tr = _os.path.join(data_dir, 'nswre74_treated.txt')
    fin_ct = _os.path.join(data_dir, 'nswre74_control.txt')
    treated = pd.read_table(fin_tr, sep='\s+', header=None, names=names)
    control = pd.read_table(fin_ct, sep='\s+', header=None, names=names)
    data = pd.concat([treated, control])
    data.index = range(data.shape[0])
    return data
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed
    """
    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]
def uniprot_sites(uniprot_id):
    """Retrieve a list of UniProt sites parsed from the feature file

    Sites are defined here: http://www.uniprot.org/help/site
    and here: http://www.uniprot.org/help/function_section

    Args:
        uniprot_id: Valid UniProt ID

    Returns:

    """
    r = requests.post('http://www.uniprot.org/uniprot/%s.gff' % uniprot_id)
    gff = StringIO(r.content.decode('utf-8'))

    feats = list(GFF.parse(gff))
    if len(feats) > 1:
        log.warning('Too many sequences in GFF')
    else:
        return feats[0].features

    # try:
    #     gff_df = pd.read_table(gff, sep='\t', skiprows=2, header=None)
    # except ValueError as e:
    #     log.error('Error retrieving feature table')
    #     print(e)
    #     return pd.DataFrame()
    #
    # gff_df.drop([0, 1, 5, 6, 7, 9], axis=1, inplace=True)
    # gff_df.columns = ['type', 'seq_start', 'seq_end', 'notes']
    #
    # return gff_df