We extracted the following 21 code examples from open-source Python projects to illustrate how to use pandas.to_pickle().
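Before the project examples, here is a minimal round-trip sketch: pandas.to_pickle(obj, path) serializes any picklable object (a DataFrame, a fitted model, a plain dict) to a file, and pandas.read_pickle(path) loads it back. The DataFrame contents and the file name below are illustrative only, not taken from any of the projects.

import pandas as pd

# build a small DataFrame to serialize (illustrative data only)
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# first argument is the object to pickle, second is the target path
pd.to_pickle(df, 'example.pkl')

# read it back; the round trip preserves dtypes and the index
restored = pd.read_pickle('example.pkl')
assert restored.equals(df)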
def compute_cell_smushing(self):
    """Within each plate, find a 2d embedding of all cells"""
    grouped = self.genes.groupby(self.cell_metadata[self.SAMPLE_MAPPING])

    if os.path.exists(self.cell_smushed_cache_file):
        smusheds = pd.read_pickle(self.cell_smushed_cache_file)
        # if nothing is missing, return the cached version
        if not set(grouped.groups) - set(smusheds):
            return smusheds
    else:
        smusheds = {}

    for plate_name, genes_subset in grouped:
        if plate_name not in smusheds:
            cell_smusher = TSNE(metric='cosine', random_state=0)
            cell_smushed = pd.DataFrame(
                cell_smusher.fit_transform(genes_subset),
                index=genes_subset.index)
            smusheds[plate_name] = cell_smushed

    pd.to_pickle(smusheds, self.cell_smushed_cache_file)
    return smusheds
def train(features):
    X, Y = ordered_dict_to_x_y(features)
    pd.DataFrame(X).to_csv('features.csv')
    clf = get_classification()
    clf.fit(X, Y)
    pd.to_pickle(clf, 'classification.pkl')
    return clf
def save_dataset(dataset, output_path):
    """
    Save the whole dataset as a pickle file
    :param dataset: pandas DataFrame
    :param output_path: path and file name of output
    """
    logging.info('Saving dataset to pickle file: ' + output_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pandas.to_pickle(dataset, output_path)
def round_trip_pickle(self, obj, path=None):
    if path is None:
        path = u('__%s__.pickle' % rands(10))
    with ensure_clean(path) as path:
        pd.to_pickle(obj, path)
        return pd.read_pickle(path)

# https://docs.python.org/3/library/unittest.html#deprecated-aliases
def optimizer(strategyclass, portfolioclass, feed, params_generator, pkl_name=None):
    log = {}
    if pkl_name is None:
        pkl_name = 'optimizer_log'
    pkl_path = os.path.join(sys.path[0], '%s.pkl' % pkl_name)
    pd.to_pickle(log, pkl_path)

    while True:
        try:
            p_list = params_generator.next()
        except:
            break
        else:
            backup = copy.deepcopy(feed)
            data = backup
            strategy = strategyclass(data, p_list)
            portfolio = portfolioclass(data)
            go = OnePiece(data, strategy, portfolio)

            def combine():
                go.sunny()
                print p_list
                log = pd.read_pickle(pkl_path)
                log[p_list] = go.get_all_holdings().iat[-1, -1]
                pd.to_pickle(log, pkl_path)

            p = multiprocessing.Process(target=combine)
            p.daemon = True
            p.start()
            p.join()
def tushare_clean(csv_path, override=True, pickle_name=None):
    """
    1. save to local csv
    2. save to local pickle
    """
    def clean(df):
        df.reset_index(drop=True, inplace=True)
        df['date'] = pd.DatetimeIndex(df['date'])
        df.set_index('date', inplace=True)
        return df

    walk_list = os.walk(csv_path).next()
    csv_list = []
    pickle_dict = {}

    for i in walk_list[2]:
        if 'csv' in i:
            df = pd.read_csv(os.path.join(csv_path, '%s' % i),
                             parse_dates=True, index_col=0)
            cleaned_df = clean(df)

            # override CSV
            if override:
                cleaned_df.to_csv(os.path.join(csv_path, '%s' % i))

            # create pickle
            if type(pickle_name) is str:
                symbol = i.replace('.csv', '')
                pickle_dict[symbol] = cleaned_df

    # Save to pickle
    if type(pickle_name) is str:
        pd.to_pickle(pickle_dict, os.path.join(csv_path, '%s.pkl' % pickle_name))
def svd(train, test, dims=20, it=15, file_name='tf_idf', path='data/'):
    svd = TruncatedSVD(n_iter=it, random_state=1123, n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train), path + 'train_svd_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_svd_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'

# In[3]:
def svd(train, test, dims=100, it=15, file_name='tf_idf', path='data/'):
    svd = TruncatedSVD(n_iter=it, random_state=1123, n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train), path + 'train_svd_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_svd_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'

# In[12]:
def toTsne(train, test, n_component=2, file_name='tf_idf', path='data/'):
    tsne = TSNE(n_components=n_component, random_state=1123, njobs=-1)
    lentrain = train.shape[0]
    X = np.vstack([train, test])
    tsne.fit(X)
    res = tsne.embedding_
    # print res
    pd.to_pickle(res[:lentrain], path + 'train_svd_20_tsne_' + str(n_component) + '_' + file_name + '.pkl')
    pd.to_pickle(res[lentrain:], path + 'test_svd_20_tsne_' + str(n_component) + '_' + file_name + '.pkl')
    return 'Success'
def svd(train, test, dims=6, it=15, file_name='tf_idf', path='data/'):
    svd = NMF(random_state=1123, n_components=dims)
    svd.fit(train)
    # print svd.transform(train).shape
    pd.to_pickle(svd.transform(train), path + 'train_NMF_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_NMF_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'

# In[16]:
def save_dict(self, outpath='./data/dictionary/'):
    pd.to_pickle(self.word_index, outpath + 'word_index.pkl')
    pd.to_pickle(self.index_word, outpath + 'index_word.pkl')
    if self.mode != 'word':
        pd.to_pickle(self.char_index, outpath + 'char_index.pkl')
        pd.to_pickle(self.index_char, outpath + 'index_char.pkl')
def make_mf_lsvc_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50, max_features=0.75, name='xgb', path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:", epoch
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)

            mf_tr[ind_te] += clf.predict_proba(X_te).ravel()
            score = accuracy_score(y_te, clf.predict(X_te).ravel())
            del X_tr
            del X_te

            mf_te += clf.predict_proba(X_test).ravel()
            print '\tpred[{}] score:{}'.format(epoch, score)

        mf_te /= n_folds
        pd.to_pickle(mf_tr.reshape(-1, 1), path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te.reshape(-1, 1), path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def make_mf_regression(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50, max_features=0.75, name='xgb', path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:", epoch
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)

            mf_tr[ind_te] += clf.predict(X_te)
            del X_tr
            del X_te

            l = 600000
            y_pred = []
            for batch in range(4):
                X_tmp = X_test[l * batch:l * (batch + 1)]
                y_pred.append(clf.predict(X_tmp))
            y_pred = np.concatenate(y_pred)

            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print '\tpred[{}] score:{}'.format(epoch, score)

        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def expand_file_data(infile, outfile, live=False):
    infile_data = pd.read_pickle(infile)
    expanded = expand_nba_data(infile_data=infile_data, live=live)
    discretized = discretize_data(expanded)
    pd.to_pickle(discretized, outfile)
    return discretized
def expand_file_data(infile, outfile, pitcher):
    infile_data = pd.read_pickle(infile)
    outfile_data = expand_mlb_data(infile_data=infile_data, pitcher=pitcher)
    pd.to_pickle(outfile_data, outfile)
    return outfile_data
def split_data(infile, train, test, attrfile, na_strategy, trainpct, split_randomly):
    expanded_data = strip_and_process_na(pd.read_pickle(infile), attrfile, na_strategy)
    train_example_count = int(len(expanded_data.index) * trainpct / 100.0)
    if split_randomly:
        train_indices = np.random.choice(expanded_data.index, size=train_example_count)
    else:
        train_indices = expanded_data.sort("Date").index[:train_example_count]
    train_data = expanded_data.ix[train_indices]
    test_data = expanded_data.drop(train_indices)
    pd.to_pickle(train_data, train)
    pd.to_pickle(test_data, test)
def serialize(cls, formatted_data, fh):
    # compat: if pandas is old, to_pickle does not accept file handles
    if LooseVersion(pd.__version__) <= LooseVersion('0.20.3'):
        fh.close()
        fh = fh.name
    return pd.to_pickle(formatted_data, fh)
def test_round_trip_current(self):
    try:
        import cPickle as c_pickle

        def c_pickler(obj, path):
            with open(path, 'wb') as fh:
                c_pickle.dump(obj, fh, protocol=-1)

        def c_unpickler(path):
            with open(path, 'rb') as fh:
                fh.seek(0)
                return c_pickle.load(fh)
    except:
        c_pickler = None
        c_unpickler = None

    import pickle as python_pickle

    def python_pickler(obj, path):
        with open(path, 'wb') as fh:
            python_pickle.dump(obj, fh, protocol=-1)

    def python_unpickler(path):
        with open(path, 'rb') as fh:
            fh.seek(0)
            return python_pickle.load(fh)

    for typ, dv in self.data.items():
        for dt, expected in dv.items():
            for writer in [pd.to_pickle, c_pickler, python_pickler]:
                if writer is None:
                    continue

                with tm.ensure_clean(self.path) as path:
                    # test writing with each pickler
                    writer(expected, path)

                    # test reading with each unpickler
                    result = pd.read_pickle(path)
                    self.compare_element(result, expected, typ)

                    if c_unpickler is not None:
                        result = c_unpickler(path)
                        self.compare_element(result, expected, typ)

                    result = python_unpickler(path)
                    self.compare_element(result, expected, typ)
def make_mf_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50, max_features=0.75, name='xgb', path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:", epoch
        mf_tr = np.zeros((X.shape[0], len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0], len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]

            if ssp.issparse(X):
                clf.fit(X_tr.tocsc(), y_tr)
                mf_tr[ind_te] += clf.predict_proba(X_te.tocsc())
            else:
                clf.fit(X_tr, y_tr)
                mf_tr[ind_te] += clf.predict_proba(X_te)

            del X_tr
            del X_te

            l = 600000
            y_pred = []
            for batch in range(4):
                if batch != 3:
                    X_tmp = X_test[l * batch:l * (batch + 1)]
                else:
                    X_tmp = X_test[l * batch:]
                if ssp.issparse(X):
                    y_pred.append(clf.predict_proba(X_tmp.tocsc()))
                else:
                    y_pred.append(clf.predict_proba(X_tmp))
            y_pred = np.vstack(y_pred)

            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print '\tpred[{}] score:{}'.format(epoch, score)

        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))
def dump_nba_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False):
    """
    Dump NBA statistical data to a file.

    :param str outfile: name of file to become pickled pandas datafile
    :param str start_date: don't include games from before this date when dumping data
    :param str end_date: don't include games from after this date when dumping data
    :param int max_count: maximum # of rows to dump
    :param bool use_random: whether to select rows at random (if False, choose most recent)
    :return:
    """
    if start_date:
        start_date = parser.parse(start_date)
    else:
        start_date = datetime.datetime(2010, 10, 1)
    if end_date:
        end_date = parser.parse(end_date)
    else:
        end_date = datetime.datetime.today()
    print 'Dump NBA data for %s to %s' % (start_date, end_date)

    print 'loading data...'
    all_game_rows = load_all_game_data()

    # Filter by date
    if start_date is not None:
        all_game_rows = all_game_rows[all_game_rows['date'] > start_date]
    if end_date is not None:
        all_game_rows = all_game_rows[all_game_rows['date'] < end_date]

    # Sample filtered data
    if max_count and max_count < len(all_game_rows):
        print 'sampling %d rows...' % max_count
        if use_random:
            # We seed to 0 when we call this from CLI to make sure that random splits are replicable.
            random.seed(0)
            kept_indices = random.sample(all_game_rows.index, max_count)
            selected = all_game_rows.loc[kept_indices]
        else:
            all_game_rows.sort("date")
            selected = all_game_rows.tail(max_count)
    else:
        selected = all_game_rows

    print 'saving...'
    pandas.to_pickle(selected, outfile)
    print 'Done!'
    return selected
def dump_mlb_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False, datatype='batting'):
    """
    Dump MLB statistical data to a file.

    :param str outfile: name of file to become pickled pandas datafile
    :param str start_date: don't include games from before this date when dumping data
    :param str end_date: don't include games from after this date when dumping data
    :param int max_count: maximum # of rows to dump
    :param bool use_random: whether to select rows at random (if False, choose most recent)
    :return:
    """
    print 'Dump MLB data for', datatype
    print 'loading data...'
    all_bsbr_logs = load_gamelogs(datatype=datatype)

    unindexed_dfs = []
    print 'reindexing data...'
    pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ',
                                            progressbar.Bar(), ' ', progressbar.ETA()])
    for player_id, dataframe in pbar(all_bsbr_logs.items()):
        uidf = dataframe.reset_index()
        # Add player ID as a column to the dataframe for future joining purposes!
        uidf['player_id'] = pandas.Series(data=player_id, index=uidf.index)
        unindexed_dfs.append(uidf)
    all_game_rows = pandas.concat(unindexed_dfs, ignore_index=True)

    # Filter by date
    if start_date is not None:
        all_game_rows = all_game_rows[all_game_rows['Date'] > start_date]
    if end_date is not None:
        all_game_rows = all_game_rows[all_game_rows['Date'] < end_date]

    # Don't use relief pitchers in our dataset
    if datatype == 'pitching':
        print 'restricting to starting pitchers only...'
        all_game_rows = all_game_rows[all_game_rows['player_id'].apply(brefid_is_starting_pitcher)]

    # Sample filtered data
    if max_count and max_count < len(all_game_rows):
        print 'sampling %d rows...' % max_count
        if use_random:
            kept_indices = random.sample(all_game_rows.index, max_count)
            selected = all_game_rows.iloc[kept_indices]
        else:
            all_game_rows.sort("Date")
            selected = all_game_rows.tail(max_count)
    else:
        selected = all_game_rows

    print 'saving...'
    pandas.to_pickle(selected, outfile)
    print 'Done!'
    return selected