Python pandas module: to_pickle() example source code

We have extracted the following 21 code examples from open-source Python projects to illustrate how to use pandas.to_pickle().
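As a minimal sketch of the basic round trip before the project examples (the file name example.pkl is an arbitrary choice for illustration): pd.to_pickle() writes any picklable Python object to disk, and pd.read_pickle() reads it back.

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
pd.to_pickle(df, 'example.pkl')           # serialize the DataFrame to disk
restored = pd.read_pickle('example.pkl')  # load it back
assert df.equals(restored)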

Project: singlecell-dash    Author: czbiohub    | Project source | File source
def compute_cell_smushing(self):
        """Within each plate, find a 2d embedding of all cells"""
        grouped = self.genes.groupby(self.cell_metadata[self.SAMPLE_MAPPING])

        if os.path.exists(self.cell_smushed_cache_file):
            smusheds = pd.read_pickle(self.cell_smushed_cache_file)
            # if nothing is missing, return the cached version
            if not set(grouped.groups) - set(smusheds):
                return smusheds
        else:
            smusheds = {}

        for plate_name, genes_subset in grouped:
            if plate_name not in smusheds:
                cell_smusher = TSNE(metric='cosine', random_state=0)
                cell_smushed = pd.DataFrame(
                    cell_smusher.fit_transform(genes_subset),
                    index=genes_subset.index)
                smusheds[plate_name] = cell_smushed

        pd.to_pickle(smusheds, self.cell_smushed_cache_file)

        return smusheds
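The example above is a common cache-or-compute pattern: load the pickle if it exists, compute only the entries that are missing, then re-pickle the whole dict. A condensed sketch of the same idea (cached_compute, cache_file, and compute are hypothetical names, not part of the project):

import os
import pandas as pd

def cached_compute(keys, cache_file, compute):
    # Load previous results if cached, compute only the missing keys, re-save.
    results = pd.read_pickle(cache_file) if os.path.exists(cache_file) else {}
    missing = [k for k in keys if k not in results]
    for k in missing:
        results[k] = compute(k)  # compute() is a hypothetical placeholder
    if missing:
        pd.to_pickle(results, cache_file)
    return results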
Project: aesthetics    Author: shubhamchaudhary    | Project source | File source
def train(features):
    X, Y = ordered_dict_to_x_y(features)
    pd.DataFrame(X).to_csv('features.csv')

    clf = get_classification()
    clf.fit(X, Y)
    pd.to_pickle(clf, 'classification.pkl')
    return clf
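Since pd.to_pickle() accepts arbitrary picklable objects, the fitted scikit-learn classifier above is stored the same way a DataFrame would be. Loading it back for prediction is symmetric (a sketch; X_new is a hypothetical feature matrix, not defined in the project code):

import pandas as pd

clf = pd.read_pickle('classification.pkl')  # the classifier saved above
# y_pred = clf.predict(X_new)               # X_new: hypothetical new samples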
Project: SmartSlam    Author: Oneiroe    | Project source | File source
def save_dataset(dataset, output_path):
    """ Save the whole dataset as pickle file
    :param dataset: pandas DataFrame
    :param output_path: path and file name of the output
    """
    logging.info('Saving dataset to pickle file: ' + output_path)

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pandas.to_pickle(dataset, output_path)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def round_trip_pickle(self, obj, path=None):
        if path is None:
            path = u('__%s__.pickle' % rands(10))
        with ensure_clean(path) as path:
            pd.to_pickle(obj, path)
            return pd.read_pickle(path)

Project: OnePy_Old    Author: Chandlercjy    | Project source | File source
def optimizer(strategyclass,portfolioclass,feed,params_generator,pkl_name=None):
    log = {}
    if pkl_name is None:
        pkl_name = 'optimizer_log'

    pkl_path = os.path.join(sys.path[0],'%s.pkl' % pkl_name)
    pd.to_pickle(log, pkl_path)

    while True:
        try:
            p_list = params_generator.next()
        except:
            break
        else:
            backup = copy.deepcopy(feed)
            data = backup
            strategy = strategyclass(data,p_list)
            portfolio = portfolioclass(data)
            go = OnePiece(data, strategy, portfolio)

            def combine():
                go.sunny()
                print p_list
                log = pd.read_pickle(pkl_path)
                log[p_list] = go.get_all_holdings().iat[-1,-1]
                pd.to_pickle(log, pkl_path)
            p = multiprocessing.Process(target=combine)
            p.daemon=True
            p.start()
    p.join()
Project: OnePy_Old    Author: Chandlercjy    | Project source | File source
def tushare_clean(csv_path, override=True, pickle_name=None):
    """
    1. save to local csv
    2. save to local pickle
    """
    def clean(df):
        df.reset_index(drop=True, inplace=True)
        df['date'] = pd.DatetimeIndex(df['date'])
        df.set_index('date', inplace=True)
        return df

    walk_list = os.walk(csv_path).next()
    csv_list=[]
    pickle_dict ={}

    for i in walk_list[2]:
        if 'csv' in i:
            df = pd.read_csv(os.path.join(csv_path, '%s' % i),
                                            parse_dates=True,index_col=0)
            cleaned_df = clean(df)

            # override CSV
            if override:
                cleaned_df.to_csv(os.path.join(csv_path, '%s' % i))

            # create pickle
            if type(pickle_name) is str:
                symbol = i.replace('.csv','')
                pickle_dict[symbol] = cleaned_df

    # Save to pickle
    if type(pickle_name) is str:
        pd.to_pickle(pickle_dict, os.path.join(csv_path, '%s.pkl' % pickle_name))
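Reading that pickle back restores the same dict, mapping each symbol to its cleaned DataFrame (a sketch; the path and symbol key are hypothetical):

import pandas as pd

data = pd.read_pickle('data/tushare.pkl')  # hypothetical csv_path/pickle_name
# df = data['600000']                      # hypothetical symbol key -> cleaned DataFrame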
Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def svd(train,test,dims=20,it=15,file_name='tf_idf',path='data/'):
    svd=TruncatedSVD(n_iter=it,random_state=1123,n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train),path+'train_svd_'+str(dims)+'_'+file_name+'.pkl')
    pd.to_pickle(svd.transform(test),path+'test_svd_'+str(dims)+'_'+file_name+'.pkl')
    return 'Success'


Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def svd(train,test,dims=100,it=15,file_name='tf_idf',path='data/'):
    svd=TruncatedSVD(n_iter=it,random_state=1123,n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train),path+'train_svd_'+str(dims)+'_'+file_name+'.pkl')
    pd.to_pickle(svd.transform(test),path+'test_svd_'+str(dims)+'_'+file_name+'.pkl')
    return 'Success'


Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def toTsne(train,test,n_component=2,file_name='tf_idf',path='data/'):
    tsne=TSNE(n_components=n_component,random_state=1123,n_jobs=-1)
    lentrain=train.shape[0]
    X=np.vstack([train,test])
    tsne.fit(X)
    res=tsne.embedding_
    #print res
    pd.to_pickle(res[:lentrain],path+'train_svd_20_tsne_'+str(n_component)+'_'+file_name+'.pkl')
    pd.to_pickle(res[lentrain:],path+'test_svd_20_tsne_'+str(n_component)+'_'+file_name+'.pkl')
    return 'Success'
Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def svd(train,test,dims=6,it=15,file_name='tf_idf',path='data/'):
    svd=NMF(random_state=1123,n_components=dims)
    svd.fit(train)
    #print svd.transform(train).shape
    pd.to_pickle(svd.transform(train),path+'train_NMF_'+str(dims)+'_'+file_name+'.pkl')
    pd.to_pickle(svd.transform(test),path+'test_NMF_'+str(dims)+'_'+file_name+'.pkl')
    return 'Success'


Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def save_dict(self,outpath='./data/dictionary/'):
        pd.to_pickle(self.word_index, outpath + 'word_index.pkl')
        pd.to_pickle(self.index_word, outpath + 'index_word.pkl')
        if self.mode!='word':
            pd.to_pickle(self.char_index,outpath+'char_index.pkl')
            pd.to_pickle(self.index_char,outpath+'index_char.pkl')
Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def make_mf_lsvc_classification(X ,y, clf, X_test, n_folds=5,seed=1024,nb_epoch=50,max_features=0.75,name='xgb',path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:",epoch
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X,y)


        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te).ravel()
            score = accuracy_score(y_te, clf.predict(X_te).ravel())
            del X_tr
            del X_te

            mf_te += clf.predict_proba(X_test).ravel()

            print '\tpred[{}] score:{}'.format(epoch, score)
        mf_te/=n_folds
        pd.to_pickle(mf_tr.reshape(-1,1),path+'X_mf_%s_%s_random.pkl'%(name,epoch))
        pd.to_pickle(mf_te.reshape(-1,1),path+'X_t_mf_%s_%s_random.pkl'%(name,epoch))
Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def make_mf_regression(X ,y, clf, X_test, n_folds=5,seed=1024,nb_epoch=50,max_features=0.75,name='xgb',path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:",epoch
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X,y)


        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict(X_te)
            del X_tr
            del X_te

            l = 600000
            y_pred = []
            for batch in range(4):
                X_tmp = X_test[l*batch:l*(batch+1)]
                y_pred.append(clf.predict(X_tmp))
            y_pred = np.concatenate(y_pred)
            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print '\tpred[{}] score:{}'.format(epoch, score)
        mf_te/=n_folds
        pd.to_pickle(mf_tr,path+'X_mf_%s_%s_random.pkl'%(name,epoch))
        pd.to_pickle(mf_te,path+'X_t_mf_%s_%s_random.pkl'%(name,epoch))
Project: sportsball    Author: jgershen    | Project source | File source
def expand_file_data(infile, outfile, live=False):
  infile_data = pd.read_pickle(infile)
  expanded = expand_nba_data(infile_data=infile_data, live=live)
  discretized = discretize_data(expanded)
  pd.to_pickle(discretized, outfile)
  return discretized
Project: sportsball    Author: jgershen    | Project source | File source
def expand_file_data(infile, outfile, pitcher):
  infile_data = pd.read_pickle(infile)
  outfile_data = expand_mlb_data(infile_data=infile_data, pitcher=pitcher)
  pd.to_pickle(outfile_data, outfile)
  return outfile_data
Project: sportsball    Author: jgershen    | Project source | File source
def split_data(infile, train, test, attrfile, na_strategy, trainpct, split_randomly):
  expanded_data = strip_and_process_na(pd.read_pickle(infile), attrfile, na_strategy)
  train_example_count = int(len(expanded_data.index) * trainpct / 100.0)
  if split_randomly:
    train_indices = np.random.choice(expanded_data.index, size=train_example_count)
  else:
    train_indices = expanded_data.sort_values("Date").index[:train_example_count]
  train_data = expanded_data.loc[train_indices]
  test_data = expanded_data.drop(train_indices)

  pd.to_pickle(train_data, train)
  pd.to_pickle(test_data, test)
Project: omniduct    Author: airbnb    | Project source | File source
def serialize(cls, formatted_data, fh):
        # compat: if pandas is old, to_pickle does not accept file handles
        if LooseVersion(pd.__version__) <= LooseVersion('0.20.3'):
            fh.close()
            fh = fh.name
        return pd.to_pickle(formatted_data, fh)
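Per the version check above, pandas releases after 0.20.3 also accept an open binary file handle directly, so on modern pandas the shim reduces to a plain call (a sketch; the file name is arbitrary):

import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
with open('formatted.pkl', 'wb') as fh:
    pd.to_pickle(df, fh)  # newer pandas writes to the open handle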
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_round_trip_current(self):

        try:
            import cPickle as c_pickle

            def c_pickler(obj, path):
                with open(path, 'wb') as fh:
                    c_pickle.dump(obj, fh, protocol=-1)

            def c_unpickler(path):
                with open(path, 'rb') as fh:
                    fh.seek(0)
                    return c_pickle.load(fh)
        except:
            c_pickler = None
            c_unpickler = None

        import pickle as python_pickle

        def python_pickler(obj, path):
            with open(path, 'wb') as fh:
                python_pickle.dump(obj, fh, protocol=-1)

        def python_unpickler(path):
            with open(path, 'rb') as fh:
                fh.seek(0)
                return python_pickle.load(fh)

        for typ, dv in self.data.items():
            for dt, expected in dv.items():

                for writer in [pd.to_pickle, c_pickler, python_pickler]:
                    if writer is None:
                        continue

                    with tm.ensure_clean(self.path) as path:

                        # test writing with each pickler
                        writer(expected, path)

                        # test reading with each unpickler
                        result = pd.read_pickle(path)
                        self.compare_element(result, expected, typ)

                        if c_unpickler is not None:
                            result = c_unpickler(path)
                            self.compare_element(result, expected, typ)

                        result = python_unpickler(path)
                        self.compare_element(result, expected, typ)
Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def make_mf_classification(X ,y, clf, X_test, n_folds=5,seed=1024,nb_epoch=50,max_features=0.75,name='xgb',path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:",epoch
        mf_tr = np.zeros((X.shape[0],len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0],len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X,y)


        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            if ssp.issparse(X):
                clf.fit(X_tr.tocsc(), y_tr)    
                mf_tr[ind_te] += clf.predict_proba(X_te.tocsc())
            else:
                clf.fit(X_tr, y_tr)    
                mf_tr[ind_te] += clf.predict_proba(X_te)
            del X_tr
            del X_te

            l = 600000
            y_pred = []
            for batch in range(4):
                if batch!=3:
                    X_tmp = X_test[l*batch:l*(batch+1)]
                else:
                    X_tmp = X_test[l*batch:]
                if ssp.issparse(X):
                    y_pred.append(clf.predict_proba(X_tmp.tocsc()))
                else:
                    y_pred.append(clf.predict_proba(X_tmp))    
            y_pred = np.vstack(y_pred)
            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print '\tpred[{}] score:{}'.format(epoch, score)
        mf_te/=n_folds
        pd.to_pickle(mf_tr,path+'X_mf_%s_%s_random.pkl'%(name,epoch))
        pd.to_pickle(mf_te,path+'X_t_mf_%s_%s_random.pkl'%(name,epoch))
Project: sportsball    Author: jgershen    | Project source | File source
def dump_nba_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False):
  """
  Dump NBA statistical data to a file.
  :param str outfile: name of file to become pickled pandas datafile
  :param str start_date: don't include games from before this date when dumping data
  :param str end_date: don't include games from after this date when dumping data
  :param int max_count: maximum # of rows to dump
  :param bool use_random: whether to select rows at random (if False, choose most recent)
  :return:
  """
  if start_date:
    start_date = parser.parse(start_date)
  else:
    start_date = datetime.datetime(2010, 10, 1)
  if end_date:
    end_date = parser.parse(end_date)
  else:
    end_date = datetime.datetime.today()
  print 'Dump NBA data for %s to %s' % (start_date, end_date)
  print 'loading data...'
  all_game_rows = load_all_game_data()

  # Filter by date
  if start_date is not None:
    all_game_rows = all_game_rows[all_game_rows['date'] > start_date]
  if end_date is not None:
    all_game_rows = all_game_rows[all_game_rows['date'] < end_date]

  # Sample filtered data
  if max_count and max_count < len(all_game_rows):
    print 'sampling %d rows...' % max_count
    if use_random:
      # We seed to 0 when we call this from CLI to make sure that random splits are replicable.
      random.seed(0)
      kept_indices = random.sample(all_game_rows.index, max_count)
      selected = all_game_rows.loc[kept_indices]
    else:
      all_game_rows.sort("date")
      selected = all_game_rows.tail(max_count)
  else:
    selected = all_game_rows
  print 'saving...'
  pandas.to_pickle(selected, outfile)
  print 'Done!'
  return selected
Project: sportsball    Author: jgershen    | Project source | File source
def dump_mlb_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False, datatype='batting'):
  """
  Dump MLB statistical data to a file.
  :param str outfile: name of file to become pickled pandas datafile
  :param str start_date: don't include games from before this date when dumping data
  :param str end_date: don't include games from after this date when dumping data
  :param int max_count: maximum # of rows to dump
  :param bool use_random: whether to select rows at random (if False, choose most recent)
  :return:
  """
  print 'Dump MLB data for', datatype
  print 'loading data...'
  all_bsbr_logs = load_gamelogs(datatype=datatype)
  unindexed_dfs = []
  print 'reindexing data...'
  pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()])
  for player_id, dataframe in pbar(all_bsbr_logs.items()):
    uidf = dataframe.reset_index()
    # Add player ID as a column to the dataframe for future joining purposes!
    uidf['player_id'] = pandas.Series(data=player_id, index=uidf.index)
    unindexed_dfs.append(uidf)
  all_game_rows = pandas.concat(unindexed_dfs, ignore_index=True)

  # Filter by date
  if start_date is not None:
    all_game_rows = all_game_rows[all_game_rows['Date'] > start_date]
  if end_date is not None:
    all_game_rows = all_game_rows[all_game_rows['Date'] < end_date]

  # Don't use relief pitchers in our dataset
  if datatype == 'pitching':
    print 'restricting to starting pitchers only...'
    all_game_rows = all_game_rows[all_game_rows['player_id'].apply(brefid_is_starting_pitcher)]

  # Sample filtered data
  if max_count and max_count < len(all_game_rows):
    print 'sampling %d rows...' % max_count
    if use_random:
      kept_indices = random.sample(all_game_rows.index, max_count)
      selected = all_game_rows.loc[kept_indices]
    else:
      all_game_rows.sort("Date")
      selected = all_game_rows.tail(max_count)
  else:
    selected = all_game_rows
  print 'saving...'
  pandas.to_pickle(selected, outfile)
  print 'Done!'
  return selected