The following code examples, extracted from open-source Python projects, illustrate how to use pandas.read_table().
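Before the project examples, here is a minimal sketch of the basic call. pandas.read_table() reads a delimited text file into a DataFrame and defaults to a tab separator, so it behaves like read_csv() with sep='\t'. The sample data and column names below are made up purely for illustration.

import io
import pandas as pd

# A small tab-separated sample (hypothetical data, for illustration only).
tsv = io.StringIO("name\tscore\nalice\t1\nbob\t2\n")

# read_table() defaults to sep='\t', so this is equivalent to
# pd.read_csv(tsv, sep='\t').
df = pd.read_table(tsv)
print(df)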
def get_citation_df(args, text):
    """
    Generate citation_df and save it to 'citations.tsv'.
    """
    citation_df = pandas.DataFrame(
        {'string': get_citation_strings(text)}
    )
    if args.citation_tags_path.is_file():
        tag_df = pandas.read_table(args.citation_tags_path)
        tag_df['string'] = '@tag:' + tag_df.tag
        for citation in tag_df.citation:
            is_valid_citation_string('@' + citation)
        citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
    else:
        citation_df['citation'] = None
        logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
    citation_df.citation.fillna(citation_df.string.astype(str).str.lstrip('@'), inplace=True)
    citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
    citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
    citation_df = citation_df.sort_values(['standard_citation', 'citation'])
    citation_df.to_csv(args.citations_path, sep='\t', index=False)
    check_collisions(citation_df)
    check_multiple_citation_strings(citation_df)
    return citation_df
def gerber_green_imai():
    """
    This is the dataset from Imai (2005) used to replicate and evaluate
    the field experiment done by Gerber and Green (2000).

    Notes
    -----
    .. Gerber, Alan S. and Donald P. Green. 2000. "The effects of canvassing,
       telephone calls, and direct mail on voter turnout: a field experiment."
       American Political Science Review 94: 653-663.
    .. Gerber, Alan S. and Donald P. Green. 2005. "Correction to Gerber and
       Green (2000), replication of disputed findings, and reply to Imai
       (2005)." American Political Science Review 99: 301-313.
    .. Imai, Kosuke. 2005. "Do get-out-the-vote calls reduce turnout? The
       importance of statistical methods for field experiments." American
       Political Science Review 99: 283-300.
    """
    fin = _os.path.join(data_dir, 'GerberGreenImai.txt')
    data = pd.read_table(fin, sep='\s+')
    data.index = range(data.shape[0])
    return data
def table_convert(fmt="csv"): """Convert the SC data into different formats. To make available for download. """ # others netcdf, fits? # https://pandas.pydata.org/pandas-docs/stable/io.html if fmt not in ['tsv', 'csv', 'hdf']: raise NotImplementedError("Conversion format to {} not available.".format(fmt)) name = "data/sweet-cat.{}".format(fmt) if fmt is "tsv": # This is the standard pass else: df = pd.read_table('data/sweet-cat.tsv') if fmt == "hdf": df.to_hdf(name, key="sweetcat", mode="w", format='table') elif fmt == "csv": df.to_csv(name, sep=",", index=False)
def get_info_map(info_link=INFO_LINK):
    """
    Return a :class:`DataFrame` containing the information provided at
    *info_link*, a link to a tab delineated text file containing
    information for each USArray MT site.
    """
    df = PD.read_table(info_link,
                       sep='\t',
                       skiprows=1,
                       names=['vnet', 'net', 'sta', 'location', 'lat', 'lon',
                              'elev', 'start', 'end', 'status', 'install', 'cert'],
                       parse_dates=[7, 8],
                       index_col=2)
    return df
def predict(self, ifile, efile, ofile):
    # Load
    columns = ['documents']
    data = pd.read_table(ifile, header=None, names=columns)
    documents = data['documents']

    # Deserialize
    estimator = pickle.load(open(efile, 'rb'))

    # Predict
    probability = estimator.predict_proba(documents)
    data['labels'] = estimator.predict(documents)
    data['C1_pr'] = probability[:, 0]
    data['C2_pr'] = probability[:, 1]

    # Save
    columns = ['labels', 'C1_pr', 'C2_pr', 'documents']
    data.to_csv(ofile, sep='\t', columns=columns, index=False)
def expected_result_find_overlaps():
    contents = u"""Chromosome  Start  End  Peak Region
0        chr1      3    5     0   gene
1        chr1      3    5     0    tss
2        chr1     12   14     1   gene
3        chr1    200  300     2   exon
4        chr1    200  300     2   exon
5        chr1    200  300     2   gene
6        chr1    200  300     2    tes
7        chr1    200  300     2    tss
8        chr1    240  297     3   exon
9        chr1    240  297     3   gene
10       chr1    240  297     3    tes"""

    return pd.read_table(StringIO(contents), header=0, sep="\s+")
def expected_result():
    contents = """Sample  OtherGroup
0   GENE1_KO_ChIP_1    GENE2_KO
1   GENE1_KO_ChIP_1          WT
2   GENE1_KO_ChIP_2    GENE2_KO
3   GENE1_KO_ChIP_2          WT
4   GENE1_KO_ChIP_3    GENE2_KO
5   GENE1_KO_ChIP_3          WT
6   GENE2_KO_ChIP_1    GENE1_KO
7   GENE2_KO_ChIP_1          WT
8   GENE2_KO_ChIP_2    GENE1_KO
9   GENE2_KO_ChIP_2          WT
10  GENE2_KO_ChIP_3    GENE1_KO
11  GENE2_KO_ChIP_3          WT
12        WT_ChIP_1    GENE1_KO
13        WT_ChIP_1    GENE2_KO
14        WT_ChIP_2    GENE1_KO
15        WT_ChIP_2    GENE2_KO
16        WT_ChIP_3    GENE1_KO
17        WT_ChIP_3    GENE2_KO"""

    return pd.read_table(StringIO(contents), sep="\s+", index_col=0)
def main():
    uri, outfile, dataset = get_arguments()
    fd = tempfile.NamedTemporaryFile()
    progress = ProgressBar(widgets=[Percentage(), ' ', Bar(), ' ', ETA(),
                                    ' ', FileTransferSpeed()])

    def update(count, blockSize, totalSize):
        if progress.maxval is None:
            progress.maxval = totalSize
            progress.start()
        progress.update(min(count * blockSize, totalSize))

    urllib.urlretrieve(uri, fd.name, reporthook=update)

    if dataset == 'zinc12':
        df = pandas.read_csv(fd.name, delimiter='\t')
        df = df.rename(columns={'SMILES': 'structure'})
        df.to_hdf(outfile, 'table', format='table', data_columns=True)
    elif dataset == 'chembl22':
        df = pandas.read_table(fd.name, compression='gzip')
        df = df.rename(columns={'canonical_smiles': 'structure'})
        df.to_hdf(outfile, 'table', format='table', data_columns=True)
        pass
    else:
        df = pandas.read_csv(fd.name, delimiter='\t')
        df.to_hdf(outfile, 'table', format='table', data_columns=True)
def add_node_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
    """
    inFile - pedigree as .txt file
    pedgraph - Pedigree as a networkX graph object
    animal - column for the animal ID
    atCol - column for the attribute
    atName - name for the attribute
    """
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    #print ped_df
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    #print dic_ped
    correct_dic_ped = {str(k): int(v) for k, v in dic_ped.items()}
    #print correct_dic_ped
    for node, value in dic_ped.items():
        pedgraph.node[str(node)]["EBV"] = value

    return correct_dic_ped
def add_ebv_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
    """
    inFile - pedigree as .txt file
    pedgraph - Pedigree as a networkX graph object
    animal - column for the animal ID
    atCol - column for the attribute
    atName - name for the attribute
    """
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    #print ped_df
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    #print dic_ped
    correct_dic_ped = {str(k): int(-v) for k, v in dic_ped.items()}
    #print correct_dic_ped
    for node, value in dic_ped.items():
        pedgraph.node[str(node)]["EBV"] = value

    return correct_dic_ped
def _load_knownGene(filename):
    """ Load UCSC knownGene table.

    Parameters
    ----------
    filename : str
        path to knownGene file

    Returns
    -------
    df : pandas.DataFrame
        knownGene table if loading was successful, else None
    """
    if filename is None:
        return None

    try:
        df = pd.read_table(
            filename,
            names=['name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart',
                   'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'proteinID',
                   'alignID'],
            index_col=0)
        df['chrom'] = df['chrom'].str[3:]
        return df
    except Exception as err:
        print(err)
        return None
def _load_kgXref(filename):
    """ Load UCSC kgXref table.

    Parameters
    ----------
    filename : str
        path to kgXref file

    Returns
    -------
    df : pandas.DataFrame
        kgXref table if loading was successful, else None
    """
    if filename is None:
        return None

    try:
        df = pd.read_table(
            filename,
            names=['kgID', 'mRNA', 'spID', 'spDisplayID', 'geneSymbol',
                   'refseq', 'protAcc', 'description', 'rfamAcc', 'tRnaName'],
            index_col=0,
            dtype=object)
        return df
    except Exception as err:
        print(err)
        return None
def on_clipboard(self, button):
    kwargs = dict()
    text = self.clipboard.wait_for_text()
    lines = text[:10000].split('\n')[:-1][:10]
    counts = set([x.lstrip().count('\t') for x in lines])
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        kwargs['sep'] = '\t'
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = '\s+'
    try:
        self.data = pd.read_table(StringIO(text), **kwargs)
    except:
        print("Unexpected Error: ", sys.exc_info())
    else:
        self.verticalbox.remove(self.scrollable_treelist)
        self.add_treeview()
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    logging.debug("reading data from %s", filename)
    data = pandas.read_table(filename,
                             usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(numpy.float32),
                        (data['artist'].cat.codes.copy(),
                         data['user'].cat.codes.copy())))

    logging.debug("read data file in %s", time.time() - start)
    return data, plays
def main():
    # read and preprocess the movie data
    movie = pd.read_table('movies.dat', sep='::',
                          names=['movie_id', 'movie_name', 'tag'], engine='python')
    movie = movie_preprocessing(movie)

    # read the ratings data and merge it with movie data
    rating = pd.read_table("ratings.dat", sep="::",
                           names=["user_id", "movie_id", "rating", "timestamp"],
                           engine='python')
    data = pd.merge(rating, movie, on="movie_id")

    # extract feature from our data set
    streaming_batch, user_feature, actions, reward_list = feature_extraction(data)
    streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
    user_feature.to_csv("user_feature.csv", sep='\t')
    pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False)
    reward_list.to_csv("reward_list.csv", sep='\t', index=False)

    action_context = movie[movie['movie_id'].isin(actions)]
    action_context.to_csv("action_context.csv", sep='\t', index=False)
def get_dataframe_list(args, data_fields=('gene', 'raw_counts')):
    # get a list of dataframes
    dfs, files = [], args['files'] or []

    # create an index using the filenames
    # this will prevent having an overlong command line for 100's or 1000's of files
    if args['file_index']:
        with open(args['file_index']) as fp:
            files.extend(fp.readlines())
    files = sorted(filter(None, set([f.strip() for f in files])))

    # now iterate over the files and get the looooong list of dataframes
    for f in files:
        # Get only specific columns with usecols
        df = pd.read_table(f, usecols=data_fields)
        dfs.append(df)

    return dfs, files  # a list of dataframes and the files index
def _load_table(self, filepath):
    """
    Load table from file system.

    :param str filepath: Path to table in CSV, TSV, XLSX or Pandas pickle format.
    :return: Pandas table
    :rtype: pandas.core.frame.DataFrame
    """
    _, ext = os.path.splitext(filepath.lower())
    if ext == '.tsv':
        return pd.read_table(filepath, **self.kwargs)
    if ext == '.csv':
        return pd.read_csv(filepath, **self.kwargs)
    if ext == '.xlsx':
        return pd.read_excel(filepath, **self.kwargs)
    return pd.read_pickle(filepath, **self.kwargs)
def years(self):
    df_list = []
    k = [str(i) for i in range(1, 13)]
    print(k)
    j = [i for i in range(1, 13)]
    result = []
    for i in range(1, 13):
        filename = '2016-%s.xls' % str(i).zfill(2)
        #print filename
        t = pd.read_table(filename, encoding='gbk', dtype={u'????': np.str})
        fee = t[u'???'].sum() + t[u'???'].sum() + t[u'????'].sum()
        print(i, " fee: ")
        print(fee)
        df_list.append(t)
        result.append(fee)

    df = pd.concat(df_list, keys=k)
    #print df
    #df.to_excel('2016_delivery_order.xls')
    self.caculation(df)
    plt.plot(j, result)
    plt.show()
def __loadPar(self, parname):
    """
    Frealign files normally have 16 columns, with any number of comment
    lines that start with 'C'
    """
    # Ergh, cannot have trailing comments with np.loadtxt?
    self.parCol = [b"N", b"PSI", b"THETA", b"PHI", b"SHX", b"SHY", b"MAG",
                   b"FILM", b"DF1", b"DF2", b"ANGAST", b"OCC", b"LogP",
                   b"SIGMA", b"SCORE", b"CHANGE"]
    self.par = pandas.read_table(parname, engine='c', sep=' ', header=None,
                                 names=self.parCol, quotechar='C')
    #self.par.append( np.loadtxt( parname, comments=b'C' ) )
    # TODO: split into a dictionary?
    # TODO: read comments as well
    # TODO: use pandas instead?
    #self.parCol = {b"N":0, b"PSI":1, b"THETA":2, b"PHI":3, b"SHX":4, b"SHY":5, b"MAG":6, b"FILM":7, b"DF1":8, b"DF2":9,
    #               b"ANGAST":10, b"OCC":11, b"LogP":12, b"SIGMA":13, b"SCORE":14, b"CHANGE":15 }
    #self.parComments = np.loadtxt( parname, comments=b' ' )
def read_cufflinks(sample_path, isoforms=False):
    ''' Function for reading a Cufflinks quantification result.

    Returns
    -------
    A pandas.Series with the expression values in the sample.
    '''
    if isoforms:
        quant_file = sample_path + '/isoforms.fpkm_tracking'
    else:
        quant_file = sample_path + '/genes.fpkm_tracking'
    df = pd.read_table(quant_file, engine='c',
                       usecols=['tracking_id', 'FPKM'],
                       index_col=0,
                       dtype={'tracking_id': np.str, 'FPKM': np.float64})

    df['tracking_id'] = df.index
    df = df.groupby('tracking_id').sum()
    df['TPM'] = df['FPKM'] / df['FPKM'].sum() * 1e6
    df = df.rename(columns={'tracking_id': 'target_id'})

    return df['TPM']
def getFeaturesForGenome(genomeId, CDS_ONLY):
    """
    This method gets the features for a particular genomeId from PATRIC

    Parameters
    genomeId: UniqueId for the genome
    CDS_ONLY: retrieve only CDS features
    """
    data_table = pd.read_table(PatricURL + genomeId + '/' + genomeId + '.PATRIC.features.tab')

    print(data_table.shape)

    if CDS_ONLY:
        return data_table[(data_table.feature_type == 'CDS')]
    else:
        return data_table
def input_data():
    contents = u"""Chromosome      Bin      End  examples/test.bed
chr1   887600   887799  0
chr1   994600   994799  0
chr1  1041000  1041199  0
chr1  1325200  1325399  1
chr1  1541600  1541799  1
chr1  1599000  1599199  1
chr1  1770200  1770399  0
chr1  1820200  1820399  1
chr1  1995000  1995199  0
chr1  2063800  2063999  0
chr1  2129400  2129599  0
chr1  2239000  2239199  0
chr1  2318800  2318999  0
chr1  2448200  2448399  1
chr1  3006000  3006199  0
chr1  3046000  3046199  1
chr1  3089200  3089399  0
chr1  3093800  3093999  0
chr1  3096400  3096599  0"""

    return pd.read_table(StringIO(contents), sep="\s+", index_col=[0, 1, 2])
def expected_result():
    c = u"""Bin  Chromosome  ooo
887600   chr1  1
994600   chr1  1
1041000  chr1  1
1770200  chr1  1
1770400  chr1  1
1995000  chr1  1
2063800  chr1  1
2064000  chr1  1
2129200  chr1  1
2239000  chr1  1
2318800  chr1  1
3006000  chr1  1"""

    return pd.read_table(StringIO(c), sep="\s+", index_col=[1, 0])
def expected_result(input_bed_file):
    df = pd.read_table(
        StringIO(u"""Count  Chromosome        Bin
2        chr1   39036800
1        chr1   73781000
1        chr1   90059800
1        chr3   55648200
1        chr7   20246600
1        chr7   91135000
1       chr13  100938400
1       chr19   43528800
1       chr19   47108800"""),
        sep="\s+",
        dtype={"Count": int32, "Bin": int32})

    df.columns = [input_bed_file, "Chromosome", "Bin"]

    return df
def read_dfs(files):
    full_path = False
    if not len(files) == len(set([basename(f) for f in files])):
        logging.info("Matrix-files do not have a unique basename. Using full path in header!")
        full_path = True

    dfs = OrderedDict()
    for f in files:
        df = pd.read_table(f, header=0, sep=" ", index_col=[0, 1])
        df = df[~df.index.duplicated(keep='first')]
        columns = list(df.columns)
        file_nick = "Enriched_" + basename(f) if not full_path else "Enriched_" + f
        columns[0] = file_nick
        df.columns = columns
        logging.info("Calling " + f + " " + file_nick + " in matrix file.")
        dfs[f] = df

    return dfs
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    lc = []
    for split, filename in zip(['train', 'val'], [opts.train_lc, opts.val_lc]):
        _lc = pd.read_table(filename)
        _lc['split'] = split
        _lc['epoch'] = range(1, len(_lc) + 1)
        lc.append(_lc)
    lc = pd.concat(lc)
    plot = plot_lc(lc, metrics=opts.metrics, outputs=opts.outputs)
    plot.savefig(opts.out_file)

    log.info('Done!')
    return 0
def create_routing_table(bgp=None, ixp_prefixes=None, ixp_asns=None, bgp_compression='infer'):
    log.info('Creating IP2AS tool.')
    if bgp_compression == 'infer' and bgp.startswith('http'):
        bgp_compression = infer_compression(bgp, 'infer')
    if not isinstance(ixp_prefixes, pd.DataFrame):
        ixp_prefixes = set(pd.read_csv(ixp_prefixes, comment='#', index_col=0).index.unique()) if ixp_prefixes is not None else set()
    if not isinstance(ixp_asns, pd.DataFrame):
        ixp_asns = set(pd.read_csv(ixp_asns, comment='#', index_col=0).index.unique()) if ixp_asns is not None else set()
    if not isinstance(bgp, pd.DataFrame):
        bgp_original = pd.read_table(bgp, comment='#', names=['Address', 'Prefixlen', 'ASN'],
                                     compression=bgp_compression)
        bgp = bgp_original[~bgp_original.ASN.str.contains(',|_')].copy()
        bgp['ASN'] = pd.to_numeric(bgp.ASN)
    rt = RoutingTable()
    for address, prefixlen, asn in bgp[~bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_prefix(asn.item(), address, prefixlen)
    for address, prefixlen, asn in bgp[bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_ixp(address, prefixlen)
    for prefix in ixp_prefixes:
        rt.add_ixp(prefix)
    rt.add_private()
    rt.add_multicast()
    rt.add_default()
    return rt
def _mag_hires_helper(year, doy, local_dir, url, coords):
    fname = str(year)[2:] + doy + '_FGM_' + coords

    hdf_fname = '{}_{}.hdf'.format(year, doy)
    hdfloc = os.path.join(local_dir, hdf_fname)
    if os.path.isfile(hdfloc):
        return pd.read_hdf(hdfloc)

    f = helper.load(fname + '.TAB', local_dir, url)
    if 'error_message' in f.readline():
        f.close()
        os.remove(os.path.join(local_dir, fname + '.TAB'))
        raise RuntimeError(
            'No file named {} exists on remote server'.format(fname))

    df = pd.read_table(f, names=['Time', 'Bx', 'By', 'Bz'],
                       delim_whitespace=True,
                       parse_dates=[0], index_col=0)

    if use_hdf:
        df.to_hdf(hdfloc, key='data', mode='w')
    return df
def to_dataframe(lines, **kwargs):
    names = lines.readline().decode('utf-8').strip().split('\t')
    types = lines.readline().decode('utf-8').strip().split('\t')
    dtypes, parse_dates, converters = {}, [], {}
    for name, chtype in zip(names, types):
        dtype = CH2PD[chtype]
        if dtype == 'object':
            converters[name] = decode_escapes
        elif dtype.startswith('datetime'):
            parse_dates.append(name)
        else:
            dtypes[name] = dtype
    return pd.read_table(lines, header=None, names=names, dtype=dtypes,
                         parse_dates=parse_dates, converters=converters,
                         na_values=set(), keep_default_na=False, **kwargs)
def file_processor(data_file):
    print('Reading bitcoin market data file here: {}.'.format(data_file))
    d = pd.read_table(data_file, sep=',', header=None, index_col=0,
                      names=['price', 'volume'])
    d.index = d.index.map(lambda ts: datetime.datetime.fromtimestamp(int(ts)))
    d.index.names = ['DateTime_UTC']
    p = pd.DataFrame(d['price'].resample('5Min').ohlc())
    p.columns = ['price_open', 'price_high', 'price_low', 'price_close']
    v = pd.DataFrame(d['volume'].resample('5Min').sum())
    v.columns = ['volume']
    p['volume'] = v['volume']

    # drop NaN values.
    # for example sometimes we don't have data for like one hour in a row.
    # So we have NaN buckets of 5Min in this particular hour.
    # Our convention is to avoid those NaN values and drop them!
    p = p.dropna()
    p.to_csv('/tmp/bitcoin_coinbase_M5.csv', sep='\t')
    return p
def _metadata(self) -> pd.DataFrame:
    """
    Read the meta.txt file in the data set base directory containing general
    data set metadata.

    The meta.txt file is read only once and cached.

    Returns
    -------
    pandas.DataFrame
        The metadata contained in the meta.txt file as a pandas DataFrame

    Raises
    ------
    IOError
        If the data set cannot be parsed
    """
    if not self.can_parse():
        raise IOError("unable to parse DCASE dataset at {}".format(self._basedir))
    if self._metadata_cache is None:
        self._metadata_cache = pd.read_table(str(self._basedir / "meta.txt"), header=None)
    # noinspection PyTypeChecker
    return self._metadata_cache
def test_1000_sep(self):
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334, 13],
        'C': [5, 10.]
    })

    df = self.read_csv(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)
def test_custom_na_values(self):
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = [[1., nan, 3],
                [nan, 5, nan],
                [7, 8, nan]]

    df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
    tm.assert_almost_equal(df.values, expected)

    df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
                          skiprows=[1])
    tm.assert_almost_equal(df2.values, expected)

    df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
                          skiprows=[1])
    tm.assert_almost_equal(df3.values, expected)
def test_duplicate_columns(self):
    for engine in ['python', 'c']:
        data = """A,A,B,B,B
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        # check default behaviour
        df = self.read_table(StringIO(data), sep=',', engine=engine)
        self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])

        df = self.read_table(StringIO(data), sep=',', engine=engine,
                             mangle_dupe_cols=False)
        self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])

        df = self.read_table(StringIO(data), sep=',', engine=engine,
                             mangle_dupe_cols=True)
        self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
def test_no_header(self):
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    df = self.read_table(StringIO(data), sep=',', header=None)
    df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                              header=None)

    names = ['foo', 'bar', 'baz', 'quux', 'panda']
    df2 = self.read_table(StringIO(data), sep=',', names=names)
    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    tm.assert_almost_equal(df.values, expected)
    tm.assert_almost_equal(df.values, df2.values)

    self.assert_numpy_array_equal(df_pref.columns,
                                  ['X0', 'X1', 'X2', 'X3', 'X4'])
    self.assert_numpy_array_equal(df.columns, lrange(5))
    self.assert_numpy_array_equal(df2.columns, names)
def test_file(self):
    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")

    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, 'salary.table')
    local_table = self.read_table(localtable)

    try:
        url_table = self.read_table('file://localhost/' + localtable)
    except URLError:
        # fails on some systems
        raise nose.SkipTest("failing on %s" %
                            ' '.join(platform.uname()).strip())

    tm.assert_frame_equal(url_table, local_table)
def test_1000_sep_with_decimal(self):
    data = """A|B|C
1|2,334.01|5
10|13|10.
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    df = self.read_csv(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)
def test_trailing_spaces(self):
    data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n"
    expected = pd.DataFrame([[1., 2., 4.],
                             [5.1, np.nan, 10.]])
    # this should ignore six lines including lines with trailing
    # whitespace and blank lines.  issues 8661, 8679
    df = self.read_csv(StringIO(data.replace(',', ' ')),
                       header=None, delim_whitespace=True,
                       skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
    tm.assert_frame_equal(df, expected)
    df = self.read_table(StringIO(data.replace(',', ' ')),
                         header=None, delim_whitespace=True,
                         skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
    tm.assert_frame_equal(df, expected)

    # test skipping set of rows after a row with trailing spaces, issue
    # #8983
    expected = pd.DataFrame({"A": [1., 5.1], "B": [2., np.nan],
                             "C": [4., 10]})
    df = self.read_table(StringIO(data.replace(',', ' ')),
                         delim_whitespace=True, skiprows=[1, 2, 3, 5, 6],
                         skip_blank_lines=True)
    tm.assert_frame_equal(df, expected)
def test_fallback_to_python(self):
    # GH 6607
    data = 'a b c\n1 2 3'

    # specify C-unsupported options with python-unsupported option
    # (options will be ignored on fallback, raise)
    with tm.assertRaisesRegexp(ValueError, 'Falling back'):
        pd.read_table(StringIO(data), sep=None,
                      delim_whitespace=False, dtype={'a': float})
    with tm.assertRaisesRegexp(ValueError, 'Falling back'):
        pd.read_table(StringIO(data), sep='\s', dtype={'a': float})
    with tm.assertRaisesRegexp(ValueError, 'Falling back'):
        pd.read_table(StringIO(data), skip_footer=1, dtype={'a': float})

    # specify C-unsupported options without python-unsupported options
    with tm.assert_produces_warning(parsers.ParserWarning):
        pd.read_table(StringIO(data), sep=None, delim_whitespace=False)
    with tm.assert_produces_warning(parsers.ParserWarning):
        pd.read_table(StringIO(data), sep='\s')
    with tm.assert_produces_warning(parsers.ParserWarning):
        pd.read_table(StringIO(data), skip_footer=1)
def download_bacterial_genomes(outfile='outfile.txt'):
    assembly_summary_file = r'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
    if os.path.exists('assembly_summary.txt'):
        os.remove('assembly_summary.txt')
    #Download the file using wget system call
    subprocess.call("wget " + assembly_summary_file, shell=True)
    #Reformat the file to pandas-friendly format
    subprocess.call("sed -i '1d' assembly_summary.txt", shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary.txt", shell=True)
    #Read the file as a dataframe - using read_table
    #Use read_table if the column separator is tab
    assembly_sum = pd.read_table('assembly_summary.txt')
    #filter the dataframe and save the URLs of the complete genomes in a new file
    my_df = assembly_sum[(assembly_sum['version_status'] == 'latest') &
                         (assembly_sum['assembly_level'] == 'Complete Genome')]
    my_df = my_df[['ftp_path', 'assembly_accession', 'asm_name']]
    #output_file.write
    my_df.to_csv(outfile, mode='w', index=False, header=None)
    process_url_file(outfile)
    return

#function to download reference genomes
#this function downloads latest version human reference genome by default
def download_refseq_genome(taxid=9606, outfile='refseq_genome.txt'):
    assembly_summary_file = "ftp://ftp.ncbi.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
    if os.path.exists('assembly_summary_refseq.txt'):
        os.remove('assembly_summary_refseq.txt')
    #Download the file using wget system call
    subprocess.call("wget " + assembly_summary_file, shell=True)
    #Reformat the file to pandas-friendly format
    subprocess.call("sed -i '1d' assembly_summary_refseq.txt", shell=True)
    subprocess.call("sed -i 's/^# //' assembly_summary_refseq.txt", shell=True)
    #Read the file as a dataframe - using read_table
    #Use read_table if the column separator is tab
    assembly_sum = pd.read_table('assembly_summary_refseq.txt')
    my_df = assembly_sum[(assembly_sum['taxid'] == taxid) &
                         ((assembly_sum['refseq_category'] == 'reference genome') |
                          (assembly_sum['refseq_category'] == 'representative genome'))]
    my_df = my_df[['ftp_path', 'assembly_accession', 'asm_name']]
    #Process the newly created file and download genomes from NCBI website
    my_df.to_csv(outfile, mode='w', index=False, header=None)
    process_url_file(outfile)
    return

#format genbank files to generate kraken-friendly formatted fasta files
def inputOriginalfile():
    """Asks user to input the name of the original websearch input file and
    reads it into a dataframe

    Returns:
        dataframe, str: original websearch file as dataframe, original
        websearch filename as string
    """
    originalFile = input("Name of original masses/retention times file:-\n\n ")
    originalDF = pd.read_table(originalFile, sep=',')

    # rename MZ and Time from the lipidFinder output
    originalDF.rename(columns={'MZ': 'ORIGINAL_MASS'}, inplace=True)
    originalDF.rename(columns={'Time': 'RETENTION_TIME'}, inplace=True)

    return originalDF, originalFile
def categoryRename(mergeDF):
    """Lipid categories are renamed to the standard lipid category names as per
    LIPIDMAPS. The categories_map.csv is used to 'map' old category names to new
    category names

    Args:
        mergeDF (dataframe): input dataframe

    Returns:
        dataframe: output dataframe
    """
    categoryFileDF = pd.read_table(
        "categories_map.csv", sep=',', keep_default_na=False)

    # new way: make the categories df into a dictionary - much faster!!
    catMap = dict(list(zip(categoryFileDF.old_category, categoryFileDF.new_category)))
    mergeDF['CATEGORY'] = mergeDF['CATEGORY'].map(catMap)

    return mergeDF
def parse_star(starfile, keep_index=True):
    headers = []
    foundheader = False
    ln = 0
    with open(starfile, 'rU') as f:
        for l in f:
            if l.startswith("_rln"):
                foundheader = True
                lastheader = True
                if keep_index:
                    head = l.rstrip()
                else:
                    head = l.split('#')[0].rstrip().lstrip('_')
                headers.append(head)
            else:
                lastheader = False
            if foundheader and not lastheader:
                break
            ln += 1
    star = pd.read_table(starfile, skiprows=ln, delimiter='\s+', header=None)
    star.columns = headers
    return star
def parseIonoFile(in_file, compression='infer'):
    iono_columns = (
        "day", "year", "rec_latitude", "rec_longitude", "los_tec",
        "los_tec_err", "vertical_tec", "azimuth", "elevation",
        "mapping_function", "pp_latitude", "pp_longitude", "satellite",
        "site", "recBias", "recBiasErr"
    )

    data = pd.read_table(in_file, header=None, sep='\s+', names=iono_columns,
                         compression=compression)

    data['time'] = pd.to_datetime(data.loc[:, 'year'].apply(str) + '-01-01') \
        + pd.to_timedelta(data.iloc[:, 0], unit='day')

    data.set_index('time', inplace=True)
    data.sort_index(inplace=True)

    return data
def convert_tables(self):
    """
    Based on the confidence score, convert xmap file and two corresponding
    cmap files into "pandas table".
    """
    pd.set_option('display.width', 200)
    with open('%s.table' % self.name, 'a') as xmap_table:
        with open(self.xmap) as xmap:
            for line in xmap:
                if line.startswith('#h'):
                    header = line[3:]
                    xmap_table.write(header)
                if line[0] != '#':
                    xmap_table.write(line)
    with open('%s.rtable' % self.name, 'a') as rcmap_table:
        with open(self.rcmap) as rcmap:
            for line in rcmap:
                if line.startswith('#h'):
                    header = line[3:]
                    rcmap_table.write(header)
                if line[0] != '#':
                    rcmap_table.write(line)
    with open('%s.qtable' % self.name, 'a') as qcmap_table:
        with open(self.qcmap) as qcmap:
            for line in qcmap:
                if line.startswith('#h'):
                    header = line[3:]
                    qcmap_table.write(header)
                if line[0] != '#':
                    qcmap_table.write(line)
    self.XmapTable = pd.read_table('%s.table' % self.name)
    headers_x = ['RefContigID', 'RefStartPos', 'RefEndPos', 'QryContigID',
                 'QryStartPos', 'QryEndPos', 'Orientation', 'Confidence',
                 'QryLen', 'RefLen', 'Alignment']
    self.filtered_XmapTable = self.XmapTable[
        self.XmapTable['Confidence'] >= self.confidence_score][headers_x].reset_index(drop=True)
    headers_r = ['CMapId', 'ContigLength', 'NumSites', 'SiteID', 'Position']
    self.RcmapTable = pd.read_table('%s.rtable' % self.name)[headers_r]
    headers_q = ['CMapId', 'ContigLength', 'NumSites', 'SiteID', 'Position', 'Coverage']
    self.QcmapTable = pd.read_table('%s.qtable' % self.name)[headers_q]
    os.remove('%s.table' % self.name)
    os.remove('%s.rtable' % self.name)
    os.remove('%s.qtable' % self.name)
def dehejia_wahba():
    """
    Data from Dehejia and Wahba (1999, 2002) used to replicate and evaluate
    the matching results of Lalonde (1986).

    .. Dehejia, Rajeev and Sadek Wahba. 1999. "Causal effects in
       non-experimental studies: Reevaluating the evaluation of training
       programs." Journal of the American Statistical Association 94 (448):
       1053-1062.
    .. Dehejia, Rajeev and Sadek Wahba. 2002. "Propensity score matching
       methods for non-experimental causal studies." Review of Economics and
       Statistics 84: 151-161.
    .. LaLonde, Robert. 1986. "Evaluating the econometric evaluations of
       training programs with experimental data." American Economic Review
       76 (4): 604-620.
    """
    names = ['Treated', 'Age', 'Education', 'Black', 'Hispanic', 'Married',
             'Nodegree', 'RE74', 'RE75', 'RE78']
    fin_tr = _os.path.join(data_dir, 'nswre74_treated.txt')
    fin_ct = _os.path.join(data_dir, 'nswre74_control.txt')
    treated = pd.read_table(fin_tr, sep='\s+', header=None, names=names)
    control = pd.read_table(fin_ct, sep='\s+', header=None, names=names)
    data = pd.concat([treated, control])
    data.index = range(data.shape[0])
    return data
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed
    """
    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]
def uniprot_sites(uniprot_id):
    """Retrieve a list of UniProt sites parsed from the feature file

    Sites are defined here: http://www.uniprot.org/help/site
    and here: http://www.uniprot.org/help/function_section

    Args:
        uniprot_id: Valid UniProt ID

    Returns:

    """
    r = requests.post('http://www.uniprot.org/uniprot/%s.gff' % uniprot_id)
    gff = StringIO(r.content.decode('utf-8'))

    feats = list(GFF.parse(gff))
    if len(feats) > 1:
        log.warning('Too many sequences in GFF')
    else:
        return feats[0].features

    # try:
    #     gff_df = pd.read_table(gff, sep='\t', skiprows=2, header=None)
    # except ValueError as e:
    #     log.error('Error retrieving feature table')
    #     print(e)
    #     return pd.DataFrame()
    #
    # gff_df.drop([0, 1, 5, 6, 7, 9], axis=1, inplace=True)
    # gff_df.columns = ['type', 'seq_start', 'seq_end', 'notes']
    #
    # return gff_df