Python pandas module: read_table() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.read_table().
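pandas.read_table() is the general delimited-file reader in pandas; its default separator is a tab, so it behaves like pandas.read_csv() with sep='\t'. Before the project examples, here is a minimal usage sketch (the file name example.tsv and its column names are hypothetical):

import pandas as pd

# read_table defaults to tab-delimited input; this call is equivalent to
# pd.read_csv('example.tsv', sep='\t').
df = pd.read_table('example.tsv')

# The usual read_csv keyword arguments also apply, e.g. selecting columns.
subset = pd.read_table('example.tsv', usecols=['gene', 'raw_counts'])
print(df.head())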

Project: manubot    Author: greenelab
def get_citation_df(args, text):
    """
    Generate citation_df and save it to 'citations.tsv'.
    """
    citation_df = pandas.DataFrame(
        {'string': get_citation_strings(text)}
    )
    if args.citation_tags_path.is_file():
        tag_df = pandas.read_table(args.citation_tags_path)
        tag_df['string'] = '@tag:' + tag_df.tag
        for citation in tag_df.citation:
            is_valid_citation_string('@' + citation)
        citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
    else:
        citation_df['citation'] = None
        logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
    citation_df.citation.fillna(citation_df.string.astype(str).str.lstrip('@'), inplace=True)
    citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
    citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
    citation_df = citation_df.sort_values(['standard_citation', 'citation'])
    citation_df.to_csv(args.citations_path, sep='\t', index=False)
    check_collisions(citation_df)
    check_multiple_citation_strings(citation_df)
    return citation_df
Project: pscore_match    Author: kellieotto
def gerber_green_imai():
    """
    This is the dataset from Imai (2005) used to replicate and evaluate
    the field experiment done by Gerber and Green (2000).

    Notes
    -----
    .. Gerber, Alan S. and Donald P. Green. 2000. "The effects of canvassing,
    telephone calls, and direct mail on voter turnout: a field experiment."
    American Political Science Review 94: 653-663.

    .. Gerber, Alan S. and Donald P. Green. 2005. "Correction to Gerber and Green (2000),
    replication of disputed findings, and reply to Imai (2005)." American Political 
    Science Review 99: 301-313.

    .. Imai, Kosuke. 2005. "Do get-out-the-vote calls reduce turnout? The importance of 
    statistical methods for field experiments." American Political Science Review 99: 
    283-300.
    """
    fin = _os.path.join(data_dir, 'GerberGreenImai.txt')
    data = pd.read_table(fin, sep = '\s+')
    data.index = range(data.shape[0])
    return data
Project: SWEETer-Cat    Author: DanielAndreasen
def table_convert(fmt="csv"):
    """Convert the SC data into different formats.

    To make it available for download.
    """
    # others netcdf, fits?
    # https://pandas.pydata.org/pandas-docs/stable/io.html
    if fmt not in ['tsv', 'csv', 'hdf']:
        raise NotImplementedError("Conversion format to {} not available.".format(fmt))
    name = "data/sweet-cat.{}".format(fmt)
    if fmt == "tsv":  # This is the standard
        pass
    else:
        df = pd.read_table('data/sweet-cat.tsv')
        if fmt == "hdf":
            df.to_hdf(name, key="sweetcat", mode="w", format='table')
        elif fmt == "csv":
            df.to_csv(name, sep=",", index=False)
Project: pyrsss    Author: butala
def get_info_map(info_link=INFO_LINK):
    """
    Return a :class:`DataFrame` containing the information provided at
    *info_link*, a link to a tab-delimited text file containing
    information for each USArray MT site.
    """
    df = PD.read_table(info_link,
                       sep='\t',
                       skiprows=1,
                       names=['vnet',
                              'net',
                              'sta',
                              'location',
                              'lat',
                              'lon',
                              'elev',
                              'start',
                              'end',
                              'status',
                              'install',
                              'cert'],
                       parse_dates=[7, 8],
                       index_col=2)
    return df
Project: TextCategorization    Author: Y-oHr-N
def predict(self, ifile, efile, ofile):
        # Load
        columns          = ['documents']
        data             = pd.read_table(ifile, header=None, names=columns)
        documents        = data['documents']

        # Deserialize
        estimator        = pickle.load(open(efile, 'rb'))

        # Predict
        probability      = estimator.predict_proba(documents)
        data['labels']   = estimator.predict(documents)
        data['C1_pr']    = probability[:, 0]
        data['C2_pr']    = probability[:, 1]

        # Save
        columns          = ['labels', 'C1_pr', 'C2_pr', 'documents']
        data.to_csv(
            ofile,
            sep          = '\t',
            columns      = columns,
            index        = False
        )
Project: chip_seq_pipeline    Author: biocore-ntnu
def expected_result_find_overlaps():

    contents = u"""Chromosome  Start  End  Peak Region
0        chr1      3    5     0   gene
1        chr1      3    5     0    tss
2        chr1     12   14     1   gene
3        chr1    200  300     2   exon
4        chr1    200  300     2   exon
5        chr1    200  300     2   gene
6        chr1    200  300     2    tes
7        chr1    200  300     2    tss
8        chr1    240  297     3   exon
9        chr1    240  297     3   gene
10       chr1    240  297     3    tes"""

    return pd.read_table(StringIO(contents), header=0, sep="\s+")
Project: chip_seq_pipeline    Author: biocore-ntnu
def expected_result():
    contents = """Sample OtherGroup
0    GENE1_KO_ChIP_1    GENE2_KO
1    GENE1_KO_ChIP_1         WT
2    GENE1_KO_ChIP_2    GENE2_KO
3    GENE1_KO_ChIP_2         WT
4    GENE1_KO_ChIP_3    GENE2_KO
5    GENE1_KO_ChIP_3         WT
6   GENE2_KO_ChIP_1     GENE1_KO
7   GENE2_KO_ChIP_1         WT
8   GENE2_KO_ChIP_2     GENE1_KO
9   GENE2_KO_ChIP_2         WT
10  GENE2_KO_ChIP_3     GENE1_KO
11  GENE2_KO_ChIP_3         WT
12       WT_ChIP_1     GENE1_KO
13       WT_ChIP_1    GENE2_KO
14       WT_ChIP_2     GENE1_KO
15       WT_ChIP_2    GENE2_KO
16       WT_ChIP_3     GENE1_KO
17       WT_ChIP_3    GENE2_KO"""

    return pd.read_table(StringIO(contents), sep="\s+", index_col=0)
Project: keras-molecules    Author: maxhodak
def main():
    uri, outfile, dataset = get_arguments()
    fd = tempfile.NamedTemporaryFile()
    progress = ProgressBar(widgets=[Percentage(), ' ', Bar(), ' ', ETA(), ' ', FileTransferSpeed()])

    def update(count, blockSize, totalSize):
        if progress.maxval is None:
            progress.maxval = totalSize
            progress.start()
        progress.update(min(count * blockSize, totalSize))

    urllib.urlretrieve(uri, fd.name, reporthook = update)
    if dataset == 'zinc12':
        df = pandas.read_csv(fd.name, delimiter = '\t')
        df = df.rename(columns={'SMILES':'structure'})
        df.to_hdf(outfile, 'table', format = 'table', data_columns = True)
    elif dataset == 'chembl22':
        df = pandas.read_table(fd.name,compression='gzip')
        df = df.rename(columns={'canonical_smiles':'structure'})
        df.to_hdf(outfile, 'table', format = 'table', data_columns = True)
        pass
    else:
        df = pandas.read_csv(fd.name, delimiter = '\t')
        df.to_hdf(outfile, 'table', format = 'table', data_columns = True)
Project: PedWorks    Author: BrnCPrz
def add_node_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
    """
    inFile - pedigree as .txt file
    pedgraph - Pedigree as a networkX graph object
    animal - column for the animal ID
    atCol - column for the attribute
    atName - name for the attribute
    """
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    #print ped_df
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    #print dic_ped
    correct_dic_ped = {str(k):int(v) for k,v in dic_ped.items()}
    #print correct_dic_ped
    for node, value in dic_ped.items():
        pedgraph.node[str(node)]["EBV"] = value

    return correct_dic_ped
Project: PedWorks    Author: BrnCPrz
def add_ebv_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
    """
    inFile - pedigree as .txt file
    pedgraph - Pedigree as a networkX graph object
    animal - column for the animal ID
    atCol - column for the attribute
    atName - name for the attribute
    """
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    #print ped_df
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    #print dic_ped
    correct_dic_ped = {str(k):int(-v) for k,v in dic_ped.items()}
    #print correct_dic_ped
    for node, value in dic_ped.items():
        pedgraph.node[str(node)]["EBV"] = value

    return correct_dic_ped
Project: lineage    Author: apriha
def _load_knownGene(filename):
        """ Load UCSC knownGene table.

        Parameters
        ----------
        filename : str
            path to knownGene file

        Returns
        -------
        df : pandas.DataFrame
            knownGene table if loading was successful, else None
        """
        if filename is None:
            return None

        try:
            df = pd.read_table(filename, names=['name', 'chrom', 'strand', 'txStart', 'txEnd',
                                                'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts',
                                                'exonEnds', 'proteinID', 'alignID'], index_col=0)
            df['chrom'] = df['chrom'].str[3:]
            return df
        except Exception as err:
            print(err)
            return None
Project: lineage    Author: apriha
def _load_kgXref(filename):
        """ Load UCSC kgXref table.

        Parameters
        ----------
        filename : str
            path to kgXref file

        Returns
        -------
        df : pandas.DataFrame
            kgXref table if loading was successful, else None
        """
        if filename is None:
            return None

        try:
            df = pd.read_table(filename, names=['kgID', 'mRNA', 'spID', 'spDisplayID',
                                                'geneSymbol', 'refseq', 'protAcc',
                                                'description', 'rfamAcc', 'tRnaName'], index_col=0,
                               dtype=object)
            return df
        except Exception as err:
            print(err)
            return None
Project: pyspc    Author: carlosqsilva
def on_clipboard(self, button):
        kwargs = dict()
        text = self.clipboard.wait_for_text()
        lines = text[:10000].split('\n')[:-1][:10]

        counts = set([x.lstrip().count('\t') for x in lines])
        if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
            kwargs['sep'] = '\t'

        if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
            kwargs['sep'] = '\s+'

        try:
            self.data = pd.read_table(StringIO(text), **kwargs)
        except:
            print("Unexpected Error: ", sys.exc_info())
        else:
            self.verticalbox.remove(self.scrollable_treelist)
            self.add_treeview()
Project: implicit    Author: benfred
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    logging.debug("reading data from %s", filename)
    data = pandas.read_table(filename,
                             usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(numpy.float32),
                       (data['artist'].cat.codes.copy(),
                        data['user'].cat.codes.copy())))

    logging.debug("read data file in %s", time.time() - start)
    return data, plays
Project: striatum    Author: ntucllab
def main():
    # read and preprocess the movie data
    movie = pd.read_table('movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python')
    movie = movie_preprocessing(movie)

    # read the ratings data and merge it with movie data
    rating = pd.read_table("ratings.dat", sep="::",
                           names=["user_id", "movie_id", "rating", "timestamp"], engine='python')
    data = pd.merge(rating, movie, on="movie_id")

    # extract feature from our data set
    streaming_batch, user_feature, actions, reward_list = feature_extraction(data)
    streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
    user_feature.to_csv("user_feature.csv", sep='\t')
    pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False)
    reward_list.to_csv("reward_list.csv", sep='\t', index=False)

    action_context = movie[movie['movie_id'].isin(actions)]
    action_context.to_csv("action_context.csv", sep='\t', index = False)
Project: cgchack    Author: gaurav-kaushik
def get_dataframe_list(args, data_fields=('gene', 'raw_counts')):

    # get a list of dataframes
    dfs, files = [], args['files'] or []
    # create an index using the filenames
    # this will prevent having an overlong command line for 100's or 1000's of files
    if args['file_index']:
        with open(args['file_index']) as fp:
            files.extend(fp.readlines())
    files = sorted(filter(None, set([f.strip() for f in files])))
    # now iterate over the files and get the looooong list of dataframes
    for f in files:
        # Get only specific columns with usecols
        df = pd.read_table(f, usecols=data_fields)
        dfs.append(df)
    return dfs, files # a list of dataframes and the files index
Project: nuts-ml    Author: maet3608
def _load_table(self, filepath):
        """
        Load table from file system.

        :param str filepath: Path to table in CSV, TSV, XLSX or
                   Pandas pickle format.
        :return: Pandas table
        :rtype: pandas.core.frame.DataFrame
        """
        _, ext = os.path.splitext(filepath.lower())
        if ext == '.tsv':
            return pd.read_table(filepath, **self.kwargs)
        if ext == '.csv':
            return pd.read_csv(filepath, **self.kwargs)
        if ext == '.xlsx':
            return pd.read_excel(filepath, **self.kwargs)
        return pd.read_pickle(filepath, **self.kwargs)
Project: stock    Author: Rockyzsu
def years(self):
        df_list=[]
        k=[str(i) for i in range(1,13)]
        print k
        j=[i for i in range(1,13)]
        result=[]
        for i in range(1,13):
            filename='2016-%s.xls' %str(i).zfill(2)
            #print filename
            t=pd.read_table(filename,encoding='gbk',dtype={u'????':np.str})
            fee=t[u'???'].sum()+t[u'???'].sum()+t[u'????'].sum()
            print i," fee: "
            print fee
            df_list.append(t)
            result.append(fee)
        df=pd.concat(df_list,keys=k)
        #print df
        #df.to_excel('2016_delivery_order.xls')
        self.caculation(df)
        plt.plot(j,result)
        plt.show()
Project: python-mrcz    Author: em-MRCZ
def __loadPar( self, parname ):
        """
        Frealign files normally have 16 columns, with any number of comment lines that start with 'C'
        """
        # Ergh, cannot have trailing comments with np.loadtxt?  
        self.parCol = [b"N", b"PSI", b"THETA", b"PHI", b"SHX", b"SHY", b"MAG", b"FILM", b"DF1", b"DF2", \
                     b"ANGAST", b"OCC", b"LogP", b"SIGMA", b"SCORE", b"CHANGE" ]

        self.par = pandas.read_table(parname, engine='c', sep=' ', header=None, names=self.parCol, quotechar='C')
        #self.par.append( np.loadtxt( parname, comments=b'C' ) )
        # TODO: split into a dictionary?  
        # TODO: read comments as well
        # TODO: use pandas instead?
        #self.parCol = {b"N":0, b"PSI":1, b"THETA":2, b"PHI":3, b"SHX":4, b"SHY":5, b"MAG":6, b"FILM":7, b"DF1":8, b"DF2":9, 
        #             b"ANGAST":10, b"OCC":11, b"LogP":12, b"SIGMA":13, b"SCORE":14, b"CHANGE":15 }
        #self.parComments = np.loadtxt( parname, comments=b' ' )
Project: readquant    Author: Teichlab
def read_cufflinks(sample_path, isoforms=False):
    ''' Function for reading a Cufflinks quantification result.

    Returns
    -------
    A pandas.Series with the expression values in the sample.
    '''
    if isoforms:
        quant_file = sample_path + '/isoforms.fpkm_tracking'
    else:
        quant_file = sample_path + '/genes.fpkm_tracking'
    df = pd.read_table(quant_file, engine='c',
                                   usecols=['tracking_id', 'FPKM'],
                                   index_col=0,
                                   dtype={'tracking_id': np.str, 'FPKM': np.float64})

    df['tracking_id'] = df.index
    df = df.groupby('tracking_id').sum()
    df['TPM'] = df['FPKM'] / df['FPKM'].sum() * 1e6

    df = df.rename(columns={'tracking_id': 'target_id'})
    return df['TPM']
Project: Gene-prediction    Author: sriram2093
def getFeaturesForGenome(genomeId, CDS_ONLY):
    """
    This method gets the features for a particular genomeId from PATRIC

    Parameters

    genomeId: UniqueId for the genome
    CDS_ONLY: retrieve only CDS features
    """
    data_table = pd.read_table(PatricURL
                               +genomeId+'/'+genomeId+'.PATRIC.features.tab')


    print data_table.shape

    if CDS_ONLY:
        return data_table[(data_table.feature_type == 'CDS')]

    else:
        return data_table
Project: epic    Author: biocore-ntnu
def input_data():
    contents = u"""Chromosome Bin End examples/test.bed
chr1 887600 887799 0
chr1 994600 994799 0
chr1 1041000 1041199 0
chr1 1325200 1325399 1
chr1 1541600 1541799 1
chr1 1599000 1599199 1
chr1 1770200 1770399 0
chr1 1820200 1820399 1
chr1 1995000 1995199 0
chr1 2063800 2063999 0
chr1 2129400 2129599 0
chr1 2239000 2239199 0
chr1 2318800 2318999 0
chr1 2448200 2448399 1
chr1 3006000 3006199 0
chr1 3046000 3046199 1
chr1 3089200 3089399 0
chr1 3093800 3093999 0
chr1 3096400 3096599 0"""

    return pd.read_table(StringIO(contents), sep="\s+", index_col=[0, 1, 2])
Project: epic    Author: biocore-ntnu
def expected_result():

    c = u"""Bin Chromosome ooo
887600       chr1 1
994600       chr1 1
1041000       chr1 1
1770200       chr1 1
1770400       chr1 1
1995000       chr1 1
2063800       chr1 1
2064000       chr1 1
2129200       chr1 1
2239000       chr1 1
2318800       chr1 1
3006000       chr1 1"""

    return pd.read_table(StringIO(c), sep="\s+", index_col=[1, 0])
Project: epic    Author: biocore-ntnu
def expected_result(input_bed_file):

    df = pd.read_table(
        StringIO(u"""Count Chromosome Bin
    2 chr1 39036800
    1 chr1 73781000
    1 chr1 90059800
    1 chr3 55648200
    1 chr7 20246600
    1 chr7 91135000
    1 chr13 100938400
    1 chr19 43528800
    1 chr19 47108800"""),
        sep=r"\s+",
        dtype={"Count": int32,
               "Bin": int32})
    df.columns = [input_bed_file, "Chromosome", "Bin"]
    return df
Project: epic    Author: biocore-ntnu
def read_dfs(files):

    full_path = False
    if not len(files) == len(set([basename(f) for f in files])):
        logging.info("Matrix-files do not have a unique basename. Using full path in header!")
        full_path = True

    dfs = OrderedDict()
    for f in files:
        df = pd.read_table(f, header=0, sep=" ", index_col=[0, 1])

        df = df[~df.index.duplicated(keep='first')]

        columns = list(df.columns)
        file_nick = "Enriched_" + basename(f) if not full_path else "Enriched_" + f
        columns[0] = file_nick
        df.columns = columns

        logging.info("Calling " + f + " " + file_nick + " in matrix file.")
        dfs[f] = df

    return dfs
Project: deepcpg    Author: cangermueller
def main(self, name, opts):
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)

        lc = []
        for split, filename in zip(['train', 'val'],
                                   [opts.train_lc, opts.val_lc]):
            _lc = pd.read_table(filename)
            _lc['split'] = split
            _lc['epoch'] = range(1, len(_lc) + 1)
            lc.append(_lc)
        lc = pd.concat(lc)

        plot = plot_lc(lc, metrics=opts.metrics, outputs=opts.outputs)
        plot.savefig(opts.out_file)

        log.info('Done!')

        return 0
Project: MAP-IT    Author: alexmarder
def create_routing_table(bgp=None, ixp_prefixes=None, ixp_asns=None, bgp_compression='infer'):
    log.info('Creating IP2AS tool.')
    if bgp_compression == 'infer' and bgp.startswith('http'):
        bgp_compression = infer_compression(bgp, 'infer')
    if not isinstance(ixp_prefixes, pd.DataFrame):
        ixp_prefixes = set(pd.read_csv(ixp_prefixes, comment='#', index_col=0).index.unique()) if ixp_prefixes is not None else set()
    if not isinstance(ixp_asns, pd.DataFrame):
        ixp_asns = set(pd.read_csv(ixp_asns, comment='#', index_col=0).index.unique()) if ixp_asns is not None else set()
    if not isinstance(bgp, pd.DataFrame):
        bgp_original = pd.read_table(bgp, comment='#', names=['Address', 'Prefixlen', 'ASN'], compression=bgp_compression)
        bgp = bgp_original[~bgp_original.ASN.str.contains(',|_')].copy()
        bgp['ASN'] = pd.to_numeric(bgp.ASN)
    rt = RoutingTable()
    for address, prefixlen, asn in bgp[~bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_prefix(asn.item(), address, prefixlen)
    for address, prefixlen, asn in bgp[bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_ixp(address, prefixlen)
    for prefix in ixp_prefixes:
        rt.add_ixp(prefix)
    rt.add_private()
    rt.add_multicast()
    rt.add_default()
    return rt
Project: heliopy    Author: heliopython
def _mag_hires_helper(year, doy, local_dir, url, coords):
    fname = str(year)[2:] + doy + '_FGM_' + coords

    hdf_fname = '{}_{}.hdf'.format(year, doy)
    hdfloc = os.path.join(local_dir, hdf_fname)
    if os.path.isfile(hdfloc):
        return pd.read_hdf(hdfloc)

    f = helper.load(fname + '.TAB', local_dir, url)
    if 'error_message' in f.readline():
        f.close()
        os.remove(os.path.join(local_dir, fname + '.TAB'))
        raise RuntimeError(
            'No file named {} exists on remote server'.format(fname))

    df = pd.read_table(f, names=['Time', 'Bx', 'By', 'Bz'],
                       delim_whitespace=True,
                       parse_dates=[0], index_col=0)

    if use_hdf:
        df.to_hdf(hdfloc, key='data', mode='w')

    return df
Project: nci-workshop    Author: sbg
def get_dataframe_list(args, data_fields=('gene', 'raw_counts')):

    # get a list of dataframes
    dfs, files = [], args['files'] or []
    # create an index using the filenames
    # this will prevent having an overlong command line for 100's or 1000's of files
    if args['file_index']:
        with open(args['file_index']) as fp:
            files.extend(fp.readlines())
    files = sorted(filter(None, set([f.strip() for f in files])))
    # now iterate over the files and get the looooong list of dataframes
    for f in files:
        # Get only specific columns with usecols
        df = pd.read_table(f, usecols=data_fields)
        dfs.append(df)
    return dfs, files # a list of dataframes and the files index
Project: pandahouse    Author: kszucs
def to_dataframe(lines, **kwargs):
    names = lines.readline().decode('utf-8').strip().split('\t')
    types = lines.readline().decode('utf-8').strip().split('\t')

    dtypes, parse_dates, converters = {}, [], {}
    for name, chtype in zip(names, types):
        dtype = CH2PD[chtype]
        if dtype == 'object':
            converters[name] = decode_escapes
        elif dtype.startswith('datetime'):
            parse_dates.append(name)
        else:
            dtypes[name] = dtype

    return pd.read_table(lines, header=None, names=names, dtype=dtypes,
                         parse_dates=parse_dates, converters=converters,
                         na_values=set(), keep_default_na=False, **kwargs)
Project: deep-learning-bitcoin    Author: philipperemy
def file_processor(data_file):
    print('Reading bitcoin market data file here: {}.'.format(data_file))
    d = pd.read_table(data_file, sep=',', header=None, index_col=0, names=['price', 'volume'])
    d.index = d.index.map(lambda ts: datetime.datetime.fromtimestamp(int(ts)))
    d.index.names = ['DateTime_UTC']
    p = pd.DataFrame(d['price'].resample('5Min').ohlc())
    p.columns = ['price_open', 'price_high', 'price_low', 'price_close']
    v = pd.DataFrame(d['volume'].resample('5Min').sum())
    v.columns = ['volume']
    p['volume'] = v['volume']

    # drop NaN values.
    # for example sometimes we don't have data for like one hour in a row.
    # So we have NaN buckets of 5Min in this particular hour.
    # Our convention is to avoid those NaN values and drop them!
    p = p.dropna()
    p.to_csv('/tmp/bitcoin_coinbase_M5.csv', sep='\t')
    return p
Project: auDeep    Author: auDeep
def _metadata(self) -> pd.DataFrame:
        """
        Read the meta.txt file in the data set base directory containing general data set metadata.

        The meta.txt file is read only once and cached.

        Returns
        -------
        pandas.DataFrame
            The metadata contained in the meta.txt file as a pandas DataFrame

        Raises
        ------
        IOError
            If the data set cannot be parsed
        """
        if not self.can_parse():
            raise IOError("unable to parse DCASE dataset at {}".format(self._basedir))
        if self._metadata_cache is None:
            self._metadata_cache = pd.read_table(str(self._basedir / "meta.txt"), header=None)

        # noinspection PyTypeChecker
        return self._metadata_cache
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_1000_sep(self):
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        expected = DataFrame({
            'A': [1, 10],
            'B': [2334, 13],
            'C': [5, 10.]
        })

        df = self.read_csv(StringIO(data), sep='|', thousands=',')
        tm.assert_frame_equal(df, expected)

        df = self.read_table(StringIO(data), sep='|', thousands=',')
        tm.assert_frame_equal(df, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_custom_na_values(self):
        data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
        expected = [[1., nan, 3],
                    [nan, 5, nan],
                    [7, 8, nan]]

        df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
        tm.assert_almost_equal(df.values, expected)

        df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
                              skiprows=[1])
        tm.assert_almost_equal(df2.values, expected)

        df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
                              skiprows=[1])
        tm.assert_almost_equal(df3.values, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_duplicate_columns(self):
        for engine in ['python', 'c']:
            data = """A,A,B,B,B
    1,2,3,4,5
    6,7,8,9,10
    11,12,13,14,15
    """
            # check default behaviour
            df = self.read_table(StringIO(data), sep=',', engine=engine)
            self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])

            df = self.read_table(StringIO(data), sep=',',
                                 engine=engine, mangle_dupe_cols=False)
            self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])

            df = self.read_table(StringIO(data), sep=',',
                                 engine=engine, mangle_dupe_cols=True)
            self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_no_header(self):
        data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        df = self.read_table(StringIO(data), sep=',', header=None)
        df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                                  header=None)

        names = ['foo', 'bar', 'baz', 'quux', 'panda']
        df2 = self.read_table(StringIO(data), sep=',', names=names)
        expected = [[1, 2, 3, 4, 5.],
                    [6, 7, 8, 9, 10],
                    [11, 12, 13, 14, 15]]
        tm.assert_almost_equal(df.values, expected)
        tm.assert_almost_equal(df.values, df2.values)

        self.assert_numpy_array_equal(df_pref.columns,
                                      ['X0', 'X1', 'X2', 'X3', 'X4'])
        self.assert_numpy_array_equal(df.columns, lrange(5))

        self.assert_numpy_array_equal(df2.columns, names)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_file(self):

        # FILE
        if sys.version_info[:2] < (2, 6):
            raise nose.SkipTest("file:// not supported with Python < 2.6")
        dirpath = tm.get_data_path()
        localtable = os.path.join(dirpath, 'salary.table')
        local_table = self.read_table(localtable)

        try:
            url_table = self.read_table('file://localhost/' + localtable)
        except URLError:
            # fails on some systems
            raise nose.SkipTest("failing on %s" %
                                ' '.join(platform.uname()).strip())

        tm.assert_frame_equal(url_table, local_table)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_1000_sep_with_decimal(self):
        data = """A|B|C
1|2,334.01|5
10|13|10.
"""

        expected = DataFrame({
            'A': [1, 10],
            'B': [2334.01, 13],
            'C': [5, 10.]
        })

        df = self.read_csv(StringIO(data), sep='|', thousands=',')
        tm.assert_frame_equal(df, expected)

        df = self.read_table(StringIO(data), sep='|', thousands=',')
        tm.assert_frame_equal(df, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_trailing_spaces(self):
        data = "A B C  \nrandom line with trailing spaces    \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n   \n5.1,NaN,10.0\n"
        expected = pd.DataFrame([[1., 2., 4.],
                                 [5.1, np.nan, 10.]])
        # this should ignore six lines including lines with trailing
        # whitespace and blank lines.  issues 8661, 8679
        df = self.read_csv(StringIO(data.replace(',', '  ')),
                           header=None, delim_whitespace=True,
                           skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
        tm.assert_frame_equal(df, expected)
        df = self.read_table(StringIO(data.replace(',', '  ')),
                             header=None, delim_whitespace=True,
                             skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
        tm.assert_frame_equal(df, expected)
        # test skipping set of rows after a row with trailing spaces, issue
        # #8983
        expected = pd.DataFrame({"A": [1., 5.1], "B": [2., np.nan],
                                 "C": [4., 10]})
        df = self.read_table(StringIO(data.replace(',', '  ')),
                             delim_whitespace=True,
                             skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True)
        tm.assert_frame_equal(df, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_fallback_to_python(self):
        # GH 6607
        data = 'a b c\n1 2 3'

        # specify C-unsupported options with python-unsupported option
        # (options will be ignored on fallback, raise)
        with tm.assertRaisesRegexp(ValueError, 'Falling back'):
            pd.read_table(StringIO(data), sep=None,
                          delim_whitespace=False, dtype={'a': float})
        with tm.assertRaisesRegexp(ValueError, 'Falling back'):
            pd.read_table(StringIO(data), sep='\s', dtype={'a': float})
        with tm.assertRaisesRegexp(ValueError, 'Falling back'):
            pd.read_table(StringIO(data), skip_footer=1, dtype={'a': float})

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning(parsers.ParserWarning):
            pd.read_table(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            pd.read_table(StringIO(data), sep='\s')
        with tm.assert_produces_warning(parsers.ParserWarning):
            pd.read_table(StringIO(data), skip_footer=1)
Project: MiscScripts    Author: sejmodha
def download_bacterial_genomes(outfile='outfile.txt'):
    assembly_summary_file=r'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
    if os.path.exists('assembly_summary.txt'):
       os.remove('assembly_summary.txt')
    #Download the file using a wget system call
    subprocess.call("wget "+assembly_summary_file, shell=True)
    #Reformat the file to pandas-friendly format
    subprocess.call("sed -i '1d' assembly_summary.txt",shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary.txt", shell=True)
    #Read the file as a dataframe - using read_table
    #Use read_table if the column separator is tab
    assembly_sum = pd.read_table('assembly_summary.txt')
    #filter the dataframe and save the URLs of the complete genomes in a new file
    my_df=assembly_sum[(assembly_sum['version_status'] == 'latest') &
                   (assembly_sum['assembly_level']=='Complete Genome') 
                  ]
    my_df=my_df[['ftp_path','assembly_accession','asm_name']]
    #output_file.write
    my_df.to_csv(outfile,mode='w',index=False,header=None)
    process_url_file(outfile)
    return

#function to download reference genomes
#this function downloads the latest version of the human reference genome by default
Project: MiscScripts    Author: sejmodha
def download_refseq_genome(taxid=9606,outfile='refseq_genome.txt'):
    assembly_summary_file="ftp://ftp.ncbi.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
    if os.path.exists('assembly_summary_refseq.txt'):
        os.remove('assembly_summary_refseq.txt')
    #Download the file using a wget system call
    subprocess.call("wget "+assembly_summary_file, shell=True)
    #Reformat the file to pandas-friendly format
    subprocess.call("sed -i '1d' assembly_summary_refseq.txt",shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary_refseq.txt", shell=True)
    #Read the file as a dataframe - using read_table
    #Use read_table if the column separator is tab
    assembly_sum = pd.read_table('assembly_summary_refseq.txt')
    my_df=assembly_sum[(assembly_sum['taxid'] == taxid) &
                       ((assembly_sum['refseq_category'] == 'reference genome') |
                        (assembly_sum['refseq_category'] == 'representative genome')
                       )]
    my_df=my_df[['ftp_path','assembly_accession','asm_name']]
    #Process the newly created file and download genomes from NCBI website
    my_df.to_csv(outfile,mode='w',index=False,header=None)
    process_url_file(outfile)
    return

#format genbank files to generate kraken-friendly formatted fasta files
Project: LipidFinder    Author: cjbrasher
def inputOriginalfile():
    """Asks user to input the name of the original websearch input file 
    and reads it into a dataframe

    Returns:
        dataframe, str: original websearch file as dataframe, original websearch 
        filename as string
    """
    originalFile = input(
        "Name of original masses/retention times file:-\n\n    ")
    originalDF = pd.read_table(originalFile, sep=',')
    # rename MZ and Time from the lipidFinder output
    originalDF.rename(columns={'MZ': 'ORIGINAL_MASS'}, inplace=True)
    originalDF.rename(columns={'Time': 'RETENTION_TIME'}, inplace=True)

    return originalDF, originalFile
Project: LipidFinder    Author: cjbrasher
def categoryRename(mergeDF):
    """Lipid categories are renamed to the standard lipid category names
    as per LIPIDMAPS. The categories_map.csv is used to 'map' old category
    names to new category names

    Args:
        mergeDF (dataframe): input dataframe

    Returns:
        dataframe: output dataframe
    """
    categoryFileDF = pd.read_table(
        "categories_map.csv", sep=',', keep_default_na=False)

    # new way: make the categories df into a dictionary - much faster!!
    catMap = dict(list(zip(categoryFileDF.old_category,
                           categoryFileDF.new_category)))
    mergeDF['CATEGORY'] = mergeDF['CATEGORY'].map(catMap)

    return mergeDF
Project: pyem    Author: asarnow
def parse_star(starfile, keep_index=True):
    headers = []
    foundheader = False
    ln = 0
    with open(starfile, 'rU') as f:
        for l in f:
            if l.startswith("_rln"):
                foundheader = True
                lastheader = True
                if keep_index:
                    head = l.rstrip()
                else:
                    head = l.split('#')[0].rstrip().lstrip('_')
                headers.append(head)
            else:
                lastheader = False
            if foundheader and not lastheader:
                break
            ln += 1
    star = pd.read_table(starfile, skiprows=ln, delimiter='\s+', header=None)
    star.columns = headers
    return star
Project: scikit-dataaccess    Author: MITHaystack
def parseIonoFile(in_file, compression='infer'):
    iono_columns = ( "day",
                     "year",
                     "rec_latitude",
                     "rec_longitude",
                     "los_tec",
                     "los_tec_err",
                     "vertical_tec",
                     "azimuth",
                     "elevation",
                     "mapping_function",
                     "pp_latitude",
                     "pp_longitude",
                     "satellite",
                     "site",
                     "recBias",
                     "recBiasErr" )

    data =  pd.read_table(in_file,header=None, sep='\s+',
                          names=iono_columns,
                          compression=compression)

    data['time'] = pd.to_datetime(data.loc[:,'year'].apply(str) + '-01-01') \
                   + pd.to_timedelta(data.iloc[:,0],unit='day')



    data.set_index('time', inplace=True)
    data.sort_index(inplace=True)

    return data
Project: BioNanoAnalyst    Author: AppliedBioinformatics
def convert_tables(self):
        """
        Based on the confidence score, convert the xmap file and the two corresponding cmap files
        into pandas tables.
        """
        pd.set_option('display.width',200)
        with open ('%s.table' % self.name, 'a') as xmap_table:
            with open (self.xmap) as xmap:
                for line in xmap:
                    if line.startswith('#h'):
                        header = line[3:]
                        xmap_table.write(header)
                    if line[0]!='#':
                        xmap_table.write(line)
        with open ('%s.rtable' % self.name, 'a') as rcmap_table:
            with open (self.rcmap) as rcmap:
                for line in rcmap:
                    if line.startswith('#h'):
                        header = line[3:]
                        rcmap_table.write(header)
                    if line[0]!='#':
                        rcmap_table.write(line)
        with open ('%s.qtable' % self.name, 'a') as qcmap_table:
            with open (self.qcmap) as qcmap:
                for line in qcmap:
                    if line.startswith('#h'):
                        header = line[3:]
                        qcmap_table.write(header)
                    if line[0]!='#':
                        qcmap_table.write(line)
        self.XmapTable = pd.read_table('%s.table' % self.name)
        headers_x = ['RefContigID','RefStartPos','RefEndPos','QryContigID','QryStartPos',
        'QryEndPos','Orientation', 'Confidence','QryLen','RefLen', 'Alignment']
        self.filtered_XmapTable = self.XmapTable[self.XmapTable['Confidence']>=self.confidence_score][headers_x].reset_index(drop=True)
        headers_r = ['CMapId','ContigLength','NumSites','SiteID','Position']
        self.RcmapTable = pd.read_table('%s.rtable' % self.name)[headers_r]
        headers_q = ['CMapId','ContigLength','NumSites','SiteID','Position','Coverage']
        self.QcmapTable = pd.read_table('%s.qtable' % self.name)[headers_q]
        os.remove('%s.table' % self.name)
        os.remove('%s.rtable' % self.name)
        os.remove('%s.qtable' % self.name)
Project: pscore_match    Author: kellieotto
def dehejia_wahba():
    """
    Data from Dehejia and Wahba (1999, 2002) used to replicate and evaluate the matching
    results of LaLonde (1986).

    .. Dehejia, Rajeev and Sadek Wahba. 1999. "Causal effects in non-experimental studies: 
    Reevaluating the evaluation of training programs." Journal of the American Statistical
    Association 94 (448): 1053-1062.

    .. Dehejia, Rajeev and Sadek Wahba. 2002. "Propensity score matching methods for non-
    experimental causal studies." Review of Economics and Statistics 84: 151-161.

    .. LaLonde, Robert. 1986. "Evaluating the econometric evaluations of training programs 
    with experimental data." American Economic Review 76 (4): 604-620.
    """
    names = ['Treated', 'Age', 'Education', 'Black', 'Hispanic', 'Married',
             'Nodegree', 'RE74', 'RE75', 'RE78']
    fin_tr = _os.path.join(data_dir, 'nswre74_treated.txt')
    fin_ct = _os.path.join(data_dir, 'nswre74_control.txt')
    treated = pd.read_table(fin_tr, sep = '\s+',
                            header = None, names = names)
    control = pd.read_table(fin_ct, sep='\s+', 
                            header = None, names = names)
    data = pd.concat([treated, control])
    data.index = range(data.shape[0])
    return data
Project: ssbio    Author: SBRG
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed

    """

    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]
Project: ssbio    Author: SBRG
def uniprot_sites(uniprot_id):
    """Retrieve a list of UniProt sites parsed from the feature file

    Sites are defined here: http://www.uniprot.org/help/site and here: http://www.uniprot.org/help/function_section

    Args:
        uniprot_id: Valid UniProt ID

    Returns:

    """

    r = requests.post('http://www.uniprot.org/uniprot/%s.gff' % uniprot_id)
    gff = StringIO(r.content.decode('utf-8'))

    feats = list(GFF.parse(gff))
    if len(feats) > 1:
        log.warning('Too many sequences in GFF')
    else:
        return feats[0].features

    # try:
    #     gff_df = pd.read_table(gff, sep='\t', skiprows=2, header=None)
    # except ValueError as e:
    #     log.error('Error retrieving feature table')
    #     print(e)
    #     return pd.DataFrame()
    #
    # gff_df.drop([0, 1, 5, 6, 7, 9], axis=1, inplace=True)
    # gff_df.columns = ['type', 'seq_start', 'seq_end', 'notes']
    #
    # return gff_df