The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.read_pickle().
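Before the project-level examples, here is a minimal, self-contained sketch of the basic round trip; the DataFrame contents and the file name example.pkl are purely illustrative and not taken from any of the projects below.

    import os
    import pandas as pd

    # Build a small DataFrame and persist it as a pickle file (illustrative data).
    df = pd.DataFrame({'company_id': [1, 2, 3], 'overall_rating': [4.2, 3.1, 3.8]})
    df.to_pickle('example.pkl')

    # Reload it later; read_pickle restores the full DataFrame, including dtypes and index.
    restored = pd.read_pickle('example.pkl')
    assert restored.equals(df)

    os.remove('example.pkl')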
def load_pkl():
    '''
    loads a pickled DataFrame with the employers to scrape ratings for.
    INPUT:
        None
    OUTPUT:
        df: pandas DataFrame
        split: threshold of good/bad employer ratings
    '''
    df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    df['company_id'] = df['company_id'].astype(int)
    df['num_ratings'] = df['num_ratings'].astype(int)
    split = df['overall_rating'].mean()
    return df, split
def _load_o2p(self):
    if self.o2p:
        return
    path = self.flags.data_path
    p = "%s/o2p.pkl" % path
    if os.path.exists(p) == False:
        self._load_db()
        ops = self.pdDB.data['op_prior']
        ops = ops.append(self.pdDB.data['op_train'])
        o2p = ops.sort_values(['order_id', 'add_to_cart_order'])\
            .groupby('order_id')['product_id'].apply(list)
        o2p.to_pickle(p)
    else:
        o2p = pd.read_pickle(p)
    self.o2p = o2p
    print_mem_time("Loaded o2p %d" % len(o2p))
def compute_cell_smushing(self):
    """Within each plate, find a 2d embedding of all cells"""
    grouped = self.genes.groupby(self.cell_metadata[self.SAMPLE_MAPPING])

    if os.path.exists(self.cell_smushed_cache_file):
        smusheds = pd.read_pickle(self.cell_smushed_cache_file)
        # if nothing is missing, return the cached version
        if not set(grouped.groups) - set(smusheds):
            return smusheds
    else:
        smusheds = {}

    for plate_name, genes_subset in grouped:
        if plate_name not in smusheds:
            cell_smusher = TSNE(metric='cosine', random_state=0)
            cell_smushed = pd.DataFrame(
                cell_smusher.fit_transform(genes_subset),
                index=genes_subset.index)
            smusheds[plate_name] = cell_smushed

    pd.to_pickle(smusheds, self.cell_smushed_cache_file)

    return smusheds
def fit_behavioral_data():
    """Fit a model for all subjects."""
    df = pd.read_pickle('data.pkl')
    subjects = df.index.get_level_values('subject').unique()
    data = np.empty((subjects.size, 10))
    cues = (0, 1)
    for i, subject in enumerate(subjects):
        print('Fitting model for subject {}'.format(subject))
        df_s = df.loc[subject]
        for cue in cues:
            ml = ML(df_s[df_s['cue'] == cue])
            r = ml.ml_estimation()
            data[i, 2*cue:(2*cue + 2)] = r.x
            data[i, 2*cue + 4:2*cue + 6] = np.sqrt(np.diag(r.hess_inv.todense()))
            data[i, cue + 8] = r.fun
    model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
                         ['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
                          'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
                          'NLL_0', 'NLL_1'])
    return model
def fit_single_subject(subject=4):
    df = pd.read_pickle('data.pkl')
    print('Fitting model for subject {}'.format(subject))
    df_s = df.loc[subject]
    cues = (0, 1, 2)
    for cue in cues:
        ml = ML(df_s[df_s['cue'] == cue])
        r = ml.ml_estimation()
        H_inv = r.hess_inv.todense()
        print('\t cue:{:d}'.format(cue))
        print('\t\tr:\n\t\t\t{}\n'.format(r.x))
        print('\tInverse of Hessian:\n{}\n'.format(H_inv))
    globals().update(locals())
def _load_table(self, filepath):
    """
    Load table from file system.

    :param str filepath: Path to table in CSV, TSV, XLSX or Pandas pickle format.
    :return: Pandas table
    :rtype: pandas.core.frame.DataFrame
    """
    _, ext = os.path.splitext(filepath.lower())
    if ext == '.tsv':
        return pd.read_table(filepath, **self.kwargs)
    if ext == '.csv':
        return pd.read_csv(filepath, **self.kwargs)
    if ext == '.xlsx':
        return pd.read_excel(filepath, **self.kwargs)
    return pd.read_pickle(filepath, **self.kwargs)
def test_legacy_pickle(self):
    if PY3:
        raise nose.SkipTest("testing for legacy pickles not "
                            "support on py3")

    path = tm.get_data_path('multiindex_v1.pickle')
    obj = pd.read_pickle(path)

    obj2 = MultiIndex.from_tuples(obj.values)
    self.assertTrue(obj.equals(obj2))

    res = obj.get_indexer(obj)
    exp = np.arange(len(obj))
    assert_almost_equal(res, exp)

    res = obj.get_indexer(obj2[::-1])
    exp = obj.get_indexer(obj[::-1])
    exp2 = obj2.get_indexer(obj2[::-1])
    assert_almost_equal(res, exp)
    assert_almost_equal(exp, exp2)
def test_legacy_v2_unpickle(self):

    # 0.7.3 -> 0.8.0 format manage
    path = tm.get_data_path('mindex_073.pickle')
    obj = pd.read_pickle(path)

    obj2 = MultiIndex.from_tuples(obj.values)
    self.assertTrue(obj.equals(obj2))

    res = obj.get_indexer(obj)
    exp = np.arange(len(obj))
    assert_almost_equal(res, exp)

    res = obj.get_indexer(obj2[::-1])
    exp = obj.get_indexer(obj[::-1])
    exp2 = obj2.get_indexer(obj2[::-1])
    assert_almost_equal(res, exp)
    assert_almost_equal(exp, exp2)
def test_pickle_v0_14_1(self):

    # we have the name warning
    # 10482
    with tm.assert_produces_warning(UserWarning):
        cat = pd.Categorical(values=['a', 'b', 'c'],
                             categories=['a', 'b', 'c', 'd'],
                             name='foobar', ordered=False)
    pickle_path = os.path.join(tm.get_data_path(),
                               'categorical_0_14_1.pickle')
    # This code was executed once on v0.14.1 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
def test_pickle_v0_15_2(self):
    # ordered -> _ordered
    # GH 9347

    # we have the name warning
    # 10482
    with tm.assert_produces_warning(UserWarning):
        cat = pd.Categorical(values=['a', 'b', 'c'],
                             categories=['a', 'b', 'c', 'd'],
                             name='foobar', ordered=False)
    pickle_path = os.path.join(tm.get_data_path(),
                               'categorical_0_15_2.pickle')
    # This code was executed once on v0.15.2 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
def compare(self, vf, version):

    # py3 compat when reading py2 pickle
    try:
        data = pandas.read_pickle(vf)
    except (ValueError) as e:
        if 'unsupported pickle protocol:' in str(e):
            # trying to read a py3 pickle in py2
            return
        else:
            raise

    for typ, dv in data.items():
        for dt, result in dv.items():
            try:
                expected = self.data[typ][dt]
            except (KeyError):
                continue

            # use a specific comparator
            # if available
            comparator = getattr(self, "compare_{typ}_{dt}".format(
                typ=typ, dt=dt), self.compare_element)
            comparator(result, expected, typ, version)
    return data
def thunder():
    if os.path.exists('../dataset/thunder.pkl'):
        return pd.read_pickle('../dataset/thunder.pkl')

    thunder_df = pd.read_csv('../input/thunder.csv', names=[
        'datetime',  # observation timestamp
        'lat',       # latitude (decimal degrees)
        'lon',       # longitude (decimal degrees)
        'type'       # lightning type; CG: cloud-to-ground, IC: intra-cloud
    ])
    # parse the timestamp column into datetime objects
    thunder_df.datetime = pd.to_datetime(thunder_df.datetime)
    # observation_point_df.to_pickle('../dataset/observation_point.pkl')
    thunder_df = pd.concat([thunder_df, pd.get_dummies(thunder_df.type)], axis=1)
    thunder_df.to_pickle('../dataset/thunder_df.pkl')
    return thunder_df
def __init__(self, baseDir='../temp/repo'):
    '''baseDir: directory in which the pickled tables are stored'''
    self.dir = baseDir
    self.data = {}
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)
        logging.info('Created directory: %s' % self.dir)
    # load every pickled file already present in the repository directory
    for p in os.listdir(self.dir):
        if os.path.isfile(os.path.join(self.dir, p)):
            key = re.split(r'\.', p)[0]
            path = os.path.join(self.dir, p)
            t = pd.read_pickle(path)
            logging.info('Loaded %s as key %s.' % (path, key))
            self.data[key] = t
def read_models_from_dir(dir):
    models = glob.glob(dir + '/*/')
    selected_models = filter(lambda x: 'bag' not in x, models)
    print selected_models
    bagged_oobs = []
    bagged_preds = []
    for model in selected_models:
        pred_file = model + '/' + 'preds.csv'
        oob_file = model + '/' + 'oob.pkl'
        oob = pd.read_pickle(oob_file)
        preds = pd.read_csv(pred_file)
        preds['ut_ms'] = pd.to_datetime(preds['ut_ms'], unit='ms')
        preds = preds.set_index('ut_ms')
        bagged_oobs.append(oob)
        bagged_preds.append(preds)
    return bagged_oobs, bagged_preds, selected_models
def read_models_from_dir(dir):
    model_array = []
    models = glob.glob(dir + '/*/')
    selected_models = filter(lambda x: 'bag' not in x, models)
    print selected_models
    for model in selected_models:
        try:
            pred_file = model + '/' + 'preds.csv'
            oob_file = model + '/' + 'oob.pkl'
            oob = pd.read_pickle(oob_file)
            cols = [model + str(i) for i in oob.columns]
            print model, oob.shape
            preds = pd.read_csv(pred_file)
            preds['ut_ms'] = pd.to_datetime(preds['ut_ms'], unit='ms')
            preds = preds.set_index('ut_ms')
            model_array.append((Model(model, oob, preds,
                                      RMSE(target.loc[oob.index], oob))))
        except:
            print "Error! ", model
            pass
    return model_array
def load():
    global user_order, goods, pname2id, model

    user_order = pd.read_pickle('../input/mk/user_order.p')
    goods = pd.read_pickle('../input/mk/goods.p')

    pname2id = {}
    for k, v in zip(goods.product_name, goods.product_id):
        pname2id[k] = v

    model = load_instacart_vec()

    print('Activated utils.vec2pids, utils.pnames2ids')
    return
def make(T):
    """
    T = 0
    folder = 'trainT-0'
    """
    if T == -1:
        folder = 'test'
    else:
        folder = 'trainT-' + str(T)

    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))

    df = pd.merge(label[['order_id', 'product_id']],
                  tbl[['order_id', 'product_id', 'days_since_last_order_this_item']],
                  on=['order_id', 'product_id'], how='left')

    df.to_pickle('../feature/{}/f303_order-product.p'.format(folder))

#==============================================================================
# main
#==============================================================================
def concat_pred_item(T, dryrun=False):
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)

    df = utils.load_pred_item(name)

    df = pd.merge(df, pd.read_pickle('../feature/{}/f317_user-product.p'.format(name)),
                  on=['user_id', 'product_id'], how='left')
    gc.collect()

    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun == True:
        return df
    else:
        utils.to_pickles(df, '../feature/{}/all_apdx'.format(name), 20, inplace=True)
def trainModel(self):
    df = pd.read_pickle("./train_features.pkl")
    x_df = pd.concat([df.iloc[:, 4:6], df.iloc[:, 8]], axis=1)
    y_df = df.iloc[:, 9]
    print(x_df)
    print(len(x_df))
    print(len(y_df))
    train_no = int(0.8 * len(df))
    #train_no = 100000
    print(train_no)
    train_df = x_df.iloc[0:train_no, :]
    train_labels = y_df.iloc[0:train_no]
    test_df = x_df.iloc[train_no:, :]
    test_labels = y_df.iloc[train_no:]
    self.model = LogisticClassifier(3)
    self.model.trainModel(train_df, train_labels)
    self.model.validateModel(test_df, test_labels)
def get_answers_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()

    df = pd.read_pickle(data_path)
    answers = df[['multiple_choice_answer']].values.tolist()
    answer_matrix = np.zeros((len(answers), 1001))
    default_onehot = np.zeros(1001)
    default_onehot[1000] = 1.0

    for i, answer in enumerate(answers):
        answer_matrix[i] = answer_to_onehot_dict.get(answer[0].lower(), default_onehot)

    return answer_matrix
def get_questions_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()

    df = pd.read_pickle(data_path)
    questions = df[['question']].values.tolist()
    word_idx = ebd.load_idx()
    seq_list = []

    for question in questions:
        words = word_tokenize(question[0])
        seq = []
        for word in words:
            seq.append(word_idx.get(word, 0))
        seq_list.append(seq)
    question_matrix = pad_sequences(seq_list)

    return question_matrix
def get_result_by_last_three_weeks_mean():
    data = pd.read_pickle(static_params.DATA_PATH + 'user_pay_last_three_weeks.pkl')
    result = pd.DataFrame(data['iid'])
    date = '2016-11-'
    index = 1
    for index in range(1, 8):
        column = date + str(index)
        result[column] = data.loc[:, ['2016-10-' + str(index + 10),
                                      '2016-10-' + str(index + 17),
                                      '2016-10-' + str(index + 24)]].mean(1)
    data2 = result.copy()
    result = pd.merge(data2, result, on='iid')
    result.iloc[:, -4] = result.iloc[:, -4] * 1.2
    result = result.astype(int)
    result.to_csv(static_params.DATA_PATH + 'submission.csv', header=None, index=None)
def get_result_by_last_two_weeks_mean():
    # predict each day of the first week of November as the mean of the
    # same weekday over the last two weeks of October
    data = pd.read_pickle(static_params.DATA_PATH + 'user_pay_last_two_weeks.pkl')
    print data
    result = pd.DataFrame(data['iid'])
    date = '2016-11-'
    index = 1
    for index in range(1, 8):
        column = date + str(index)
        result[column] = data.loc[:, ['2016-10-' + str(index + 17),
                                      '2016-10-' + str(index + 24)]].mean(1)
    data2 = result.copy()
    result = pd.merge(data2, result, on='iid').astype(int)
    result.to_csv(static_params.DATA_PATH + 'submission.csv', header=None, index=None)
def user_view_split_by_shop():
    if not os.path.exists(static_params.DATA_USER_VIEW_BY_SHOP_PATH):
        os.mkdir(static_params.DATA_USER_VIEW_BY_SHOP_PATH)
    data = pd.read_pickle(static_params.DATA_PATH + 'user_view.pkl')
    print type(data)
    data.columns = ['uid', 'iid', 'time']
    print data
    data['iid'] = data['iid'].astype(str)
    data['time'] = data['time'].apply(get_date)
    grouped = data.groupby(['iid'], as_index=False)
    for name, group in grouped:
        f = open(static_params.DATA_USER_VIEW_BY_SHOP_PATH + str(name) + '.pkl', 'wb')
        cPickle.dump(group, f, -1)
        f.close()
def get_extra_train():
    ############################## extra features ##############################
    train_simhash_features = pd.read_csv('data/extra_feature/train_simhash_features.csv')
    train_selftrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/train_selftrained_w2v_sim_dist.pkl')
    train_selftrained_glove_sim_dist = pd.read_pickle('data/extra_feature/train_selftrained_glove_sim_dist.pkl')
    train_pretrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/train_pretrained_w2v_sim_dist.pkl')
    train_distinct_word_stats_selftrained_glove = pd.read_csv('data/extra_feature/train_distinct_word_stats_selftrained_glove.csv')
    train_distinct_word_stats_pretrained = pd.read_csv('data/extra_feature/train_distinct_word_stats_pretrained.csv')
    train_distinct_word_stats = pd.read_csv('data/extra_feature/train_distinct_word_stats.csv')

    X_train = np.hstack([train_simhash_features,
                         train_selftrained_w2v_sim_dist,
                         train_selftrained_glove_sim_dist,
                         train_pretrained_w2v_sim_dist,
                         train_distinct_word_stats_selftrained_glove,
                         train_distinct_word_stats_pretrained,
                         train_distinct_word_stats])
    print X_train.shape
    return X_train
def get_extra_test():
    ############################## extra features ##############################
    test_simhash_features = pd.read_csv('data/extra_feature/test_simhash_features.csv')
    test_selftrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/test_selftrained_w2v_sim_dist.pkl')
    test_selftrained_glove_sim_dist = pd.read_pickle('data/extra_feature/test_selftrained_glove_sim_dist.pkl')
    test_pretrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/test_pretrained_w2v_sim_dist.pkl')
    test_distinct_word_stats_selftrained_glove = pd.read_csv('data/extra_feature/test_distinct_word_stats_selftrained_glove.csv')
    test_distinct_word_stats_pretrained = pd.read_csv('data/extra_feature/test_distinct_word_stats_pretrained.csv')
    test_distinct_word_stats = pd.read_csv('data/extra_feature/test_distinct_word_stats.csv')

    X_test = np.hstack([test_simhash_features,
                        test_selftrained_w2v_sim_dist,
                        test_selftrained_glove_sim_dist,
                        test_pretrained_w2v_sim_dist,
                        test_distinct_word_stats_selftrained_glove,
                        test_distinct_word_stats_pretrained,
                        test_distinct_word_stats])
    print X_test.shape
    return X_test
def get_feature_importance(feature):
    import scipy.stats as sps
    import pandas as pd
    y_train = pd.read_csv('../data/train.csv')['is_duplicate']
    return sps.spearmanr(feature, y_train)[0]

# import pickle
# pickle.dump(X_train, open("data_train.pkl", 'wb'), protocol=2)
#
# data_file=['test_deptree','test_glove_sim_dist','test_pca_glove',
#            'test_pca_pattern','test_w2w','test_pos','test_pca_char']
#
# path='../test/'
# for it in range(6):
#     tmp=[]
#     flist=[item+str(it) for item in data_file]
#     test=np.empty((400000,0))
#     if it==5:
#         test=np.empty((345796,0))
#     for f in flist:
#         test=np.hstack([test,pd.read_pickle(path+f+'.pkl')])
#     pickle.dump(test,open('data_test{0}.pkl'.format(it),'wb'),protocol=2)
def split_cli():
    p = ArgumentParser()
    p.add_argument("expanded", default="expanded.pickle",
                   help="Expanded pickle file targets.")
    p.add_argument("stripped", default="test.pickle",
                   help="stripped data filename")
    p.add_argument("train", default="train.pickle",
                   help="training filename")
    p.add_argument("test", default="test.pickle",
                   help="test filename")
    p.add_argument("attrfile", default="attrs.txt",
                   help="attrs to care about for NA purposes")
    p.add_argument("--na-strategy", default="drop",
                   help="what to do with NA rows (default is drop them)")
    p.add_argument("--trainpct", default=70, type=int,
                   help="percentage of data to put into training set")
    p.add_argument("--random", action='store_true',
                   help="split train/test sets randomly (default is by time)")
    cfg = p.parse_args()
    strip_and_process_to_files(expanded_file=pd.read_pickle(cfg.expanded),
                               stripped_file=cfg.stripped,
                               attrfile=cfg.attrfile,
                               na_strategy=cfg.na_strategy)
    split_to_files(trainfile=cfg.train, testfile=cfg.test,
                   stripped=cfg.stripped, trainpct=cfg.trainpct,
                   split_randomly=cfg.random)
def load_nf_histplayerinfo(sport, identifiers_to_load):
    """
    Load previously saved dataframes of numberfire prediction data.
    :param str sport: which sport!
    :param list[str] identifiers_to_load: id of players to load
    :return dict[str, DataFrame]: dict of player -> prediction data for player
    """
    loaded = 0
    histplayerinfo_dict = {}
    for identifier in identifiers_to_load:
        target_file = get_histplayerinfo_filename(sport, identifier)
        if os.path.exists(target_file):
            histplayerinfo_dict[identifier] = pandas.read_pickle(target_file)
            # Attempt to convert the index to time based if possible
            if histplayerinfo_dict[identifier] is not None and 'date' in histplayerinfo_dict[identifier].columns:
                histplayerinfo_dict[identifier].set_index('date', inplace=True)
            loaded += 1
    return histplayerinfo_dict
def load_nf_salaryinfo(sport, players):
    """
    Load previously saved dataframes of numberfire salary data
    :param list[str] players: players to load
    :return dict[str, DataFrame]: dict of player -> salary data for player
    """
    loaded = 0
    player_dict = {}
    for player in players:
        target_file = get_salary_filename(sport, player)
        if os.path.exists(target_file):
            player_dict[player] = pandas.read_pickle(target_file)
            # Attempt to convert the index to time based if possible
            if player_dict[player] is not None and 'date' in player_dict[player].columns:
                player_dict[player].set_index('date', inplace=True)
            loaded += 1
    return player_dict
def combine_dataframe_into_pickle_file(dataframe, outfile, overwrite=False):
    """
    Save the provided pandas dataframe as a pickle to the provided file path.
    If a file is already present at that location, unpickle it, combine the
    dataframes, and save the result as a pickle (overwriting the file but
    keeping the data).
    Uses combine_first, prioritizing new data but keeping data from before.
    Obviously this will blow up catastrophically if there is a file at outfile
    which is not a DataFrame, and the data will get super gross if it *is* a
    DataFrame but the indices do not match.
    :param pandas.DataFrame dataframe: input dataframe
    :param str outfile: output file
    :return None:
    """
    if os.path.exists(outfile) and not overwrite:
        target_df = pandas.read_pickle(outfile)
        merged_df = dataframe.combine_first(target_df)
        merged_df.to_pickle(outfile)
    else:
        dataframe.to_pickle(outfile)
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    # data = pd.read_csv(file_in, dtype='str')
    # data['DateTime'] = pd.to_datetime(
    #     data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
    #     format='%Y%m%d%H%M%S')
    # data = data.set_index('DateTime')
    # data = pd.Series(data['<CLOSE>']).map(float)
    # data = data.resample('M').fillna(method='pad')
    # data = preprocessing.minmax_scale(data)
    # data_t = data[6:]
    # data_f = data.reshape(-1, 6)
    # data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
    # np.save(file_out[0], data_f[:len(data_f) - 1])
    # np.save(file_out[1], data_t)
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    data = data.reshape(-1, 24)
    data_m = np.array([[data[i + x][0] for x in range(5)]
                       for i in range(len(data) - 5 + 1)])
    data_m = data_m.reshape(-1, 5)
    data_s = np.array([data[i + 5][0] for i in range(len(data) - 5)])
    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
def get_fs_t_5(file_in, file_out, i):
    data = pd.read_pickle(file_in)['close']
    data = data.reshape(-1, 24)
    data = np.float32([[data[i + x][-1] for x in range(5 * i) if x % i == 0]
                       for i in range(len(data) - 5 * i + 1)])
    data = data.reshape(-1, 5)
    data_t = {
        'change': np.float32(
            [(data[i + i][-1] - data[i + i][0]) / data[i + i][0] * 100
             for i in range(data.shape[0] - i)]),
        'target_open': np.float32([data[i + i][0]
                                   for i in range(data.shape[0] - i)]),
        'real_target': np.float32([data[i + i][-1]
                                   for i in range(data.shape[0] - i)])
    }
    data_t = pd.DataFrame(data_t)
    np.save(file_out[0], data[:len(data) - i])
    data_t.to_pickle(file_out[1])
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    # data = pd.read_csv(file_in, dtype='str')
    # data['DateTime'] = pd.to_datetime(
    #     data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
    #     format='%Y%m%d%H%M%S')
    # data = data.set_index('DateTime')
    # data = pd.Series(data['<CLOSE>']).map(float)
    # data = data.resample('M').fillna(method='pad')
    # data = preprocessing.minmax_scale(data)
    # data_t = data[6:]
    # data_f = data.reshape(-1, 6)
    # data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
    # np.save(file_out[0], data_f[:len(data_f) - 1])
    # np.save(file_out[1], data_t)
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    data_m = np.array([[data[i + x * 24 * 24] for x in range(6)]
                       for i in range(len(data) - 6 * 24 * 24 + 1)])
    data_m = data_m.reshape(-1, 6)
    data_s = np.array([data[i + 6 * 24 * 24]
                       for i in range(len(data) - 6 * 24 * 24)])
    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
def _build(self, flags, files):
    path = flags.input_path
    Table = namedtuple('Table', 'name fname dtype')
    fnames = "adult.data,adult.test".split(',')
    names = "train,test".split(',')
    TABLES = [Table(i, "%s/%s" % (path, j), None)
              for i, j in zip(names, fnames) if files == "all" or i in files]
    print()
    self.flags = flags
    path = flags.data_path
    data = {}
    columns = [
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital_status", "occupation", "relationship", "race", "gender",
        "capital_gain", "capital_loss", "hours_per_week", "native_country",
        "income_bracket"
    ]
    for table in TABLES:
        name = table.name
        fname = table.fname
        dtype = table.dtype
        pname = "%s/%s.pkl" % (path, name.split('/')[-1].split('.')[0])
        if os.path.exists(pname):
            data[name] = pd.read_pickle(pname)
        else:
            if name == 'train':
                data[name] = pd.read_csv(fname, dtype=dtype, header=None,
                                         skipinitialspace=True, names=columns)
            if name == 'test':
                data[name] = pd.read_csv(fname, dtype=dtype, header=None,
                                         skipinitialspace=True, skiprows=1,
                                         names=columns)
            data[name]['target'] = data[name]["income_bracket"].apply(
                lambda x: ">50K" in x).astype(int)
            data[name].drop('income_bracket', axis=1, inplace=True)
            data[name].to_pickle(pname)
        print_mem_time("Loaded {} {}".format(fname.split('/')[-1], data[name].shape))
    self.data = data  # no copy, pass the reference
    print()
def read_data(name):
    train_pk = name.replace('.csv', '.pkl')
    if os.path.exists(train_pk) == False:
        train = pd.read_csv(name)
        if "va" not in name and "test" not in name:
            train.to_pickle(train_pk)
    else:
        train = pd.read_pickle(train_pk)
    return train
def _load_u2o(self):
    if self.u2o:
        return
    path = self.flags.data_path
    p = "%s/u2o.pkl" % path
    if os.path.exists(p) == False:
        self._load_db()
        u2o = self.pdDB.data['orders'].groupby('user_id')['order_id'].apply(list)
        u2o.to_pickle(p)
    else:
        u2o = pd.read_pickle(p)
    self.u2o = u2o
    print_mem_time("Loaded u2o %d" % len(u2o))
def _build(self, flags, files):
    fnames, names = self.fnames, self.names
    path = self.path
    Table = namedtuple('Table', 'name fname dtype')
    tables = [Table(i, "%s/%s" % (path, j), {})
              for i, j in zip(names, fnames) if files == "all" or i in files]
    print()
    self.flags = flags
    path = flags.data_path
    data = {}
    for table in tables:
        name, fname, dtype = table.name, table.fname, table.dtype
        pname = "%s/%s_%s.pkl" % (path, self.name, name.split('/')[-1].split('.')[0])
        if os.path.exists(pname):
            data[name] = pd.read_pickle(pname)
        else:
            if '_text' in name:
                data[name] = pd.read_csv(fname, header=None, sep="\|\|",
                                         skiprows=1, names=['ID', 'Text'])
            else:
                data[name] = pd.read_csv(fname)
            data[name].to_pickle(pname)
        print_mem_time("Loaded {} {}".format(fname.split('/')[-1], data[name].shape))
    self.data = data  # no copy, pass the reference
    if "training_variants" in self.data:
        y = self.data["training_variants"]['Class'] - 1
        from utils.np_utils.encoder import onehot_encode
        self.y = onehot_encode(y, self.flags.classes)
    print()
def combine_data(paths):
    '''
    Function to combine dataframes from pickled form
    INPUT:
        paths: Iterable of filepaths for pickled DataFrames
    OUTPUT:
        ratings_df: Single pandas DataFrame with all ratings
    '''
    ratings_df = pd.read_pickle(paths[0])
    for path in paths[1:]:
        ratings_df = ratings_df.append(pd.read_pickle(path))
    return ratings_df
def check_review_counts(ratings_df):
    '''
    Function to check that enough data was collected. Compares number of
    reviews for each target employer with the number of reviews collected
    INPUT:
        ratings_df: Pandas DataFrame containing scraped review text
    OUTPUT:
        good_er_ids, bad_er_ids: Lists of tuples to rescrape from glassdoor
    '''
    clean_df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    target_ratings = clean_df[['company_name', 'company_id',
                               'num_ratings', 'overall_rating']]
    company_ratings = ratings_df['company_name'].value_counts()
    company_ratings = company_ratings.to_frame(name='ratings_collected')
    company_ratings.reset_index(inplace=True)
    check_df = target_ratings.merge(company_ratings, how='left',
                                    left_on='company_name', right_on='index')
    check_df['company_id'] = check_df['company_id'].astype(int)
    check_df.drop('index', axis=1, inplace=True)
    check_df['delta'] = check_df['num_ratings'] - check_df['ratings_collected']
    check_df['delta_pct'] = check_df['delta'] / check_df['num_ratings']
    rescrape = check_df[check_df['delta_pct'] > 0.5]
    good_rescrape = rescrape[rescrape['overall_rating'] > 3.5]
    bad_rescrape = rescrape[rescrape['overall_rating'] < 3.5]
    good_er_ids = zip(good_rescrape['company_name'], good_rescrape['company_id'])
    bad_er_ids = zip(bad_rescrape['company_name'], bad_rescrape['company_id'])
    pickle.dump(good_er_ids, open(os.path.join('data', 'rescrape_pros.pkl'), 'wb'))
    pickle.dump(bad_er_ids, open(os.path.join('data', 'rescrape_cons.pkl'), 'wb'))
    return good_er_ids, bad_er_ids
def plot(result_dict_file, is_show, plot_save_file):
    """
    Draw result DataFrame
    """
    import pandas as pd
    from rqalpha.plot import plot_result

    result_dict = pd.read_pickle(result_dict_file)
    if is_show:
        plot_result(result_dict)
    if plot_save_file:
        plot_result(result_dict, show_windows=False, savefile=plot_save_file)
def report(result_pickle_file_path, target_report_csv_path):
    """
    Generate report from backtest output file
    """
    import pandas as pd
    result_dict = pd.read_pickle(result_pickle_file_path)

    from rqalpha.utils.report import generate_report
    generate_report(result_dict, target_report_csv_path)
def plot(result_dict_file, show, plot_save_file):
    """
    [sys_analyser] draw result DataFrame
    """
    import pandas as pd
    from .plot import plot_result

    result_dict = pd.read_pickle(result_dict_file)
    plot_result(result_dict, show, plot_save_file)
def report(result_pickle_file_path, target_report_csv_path):
    """
    [sys_analyser] Generate report from backtest output file
    """
    import pandas as pd
    result_dict = pd.read_pickle(result_pickle_file_path)

    from .report import generate_report
    generate_report(result_dict, target_report_csv_path)
def display_proposals():
    '''print out a list of the proposal names which were generated and stored
    in the dill folder by the build_program_files script

    no inputs
    '''
    print('proposal list:')
    print(list(pd.read_pickle('dill/proposal_names.pkl').proposals))
def plot_actions(cue=0):
    mpl.rcParams['axes.labelsize'] = 'large'
    d_map = {3: 1, 8: 2, 14: 3, 23: 4}
    df = pd.read_pickle('data.pkl').reset_index()
    df = df.loc[df['cue'] == cue]
    g = sns.FacetGrid(df, col='subject', col_wrap=6, size=1.5,
                      ylim=(0, 5), aspect=1.5)
    g.map(plt.plot, 'action')
    g.set(xticks=[], yticks=[0, 1, 2, 3],
          yticklabels=['3', '8', '14', '23'])
    g.set(ylim=(-0.5, 4))
    g.set_ylabels('choice')
    g.fig.tight_layout()
    g.fig.subplots_adjust(top=0.93)

    subjects = df['subject'].unique()
    for ax, subject in zip(g.axes, subjects):
        df_subject = df.loc[df['subject'] == subject]
        df_subject.reset_index(inplace=True)
        df_wins = df_subject.loc[df_subject['reward'] > 0]
        df_lose = df_subject.loc[df_subject['reward'] < 0]
        pos_win = df_wins.loc[df_wins['subject'] == subject].index
        pos_lose = df_lose.loc[df_lose['subject'] == subject].index
        ax.eventplot(pos_win, lineoffsets=3.5, linelength=0.75,
                     linewidths=0.4)
        ax.eventplot(pos_lose, lineoffsets=3.5, linelength=0.75,
                     color='r', linewidths=0.4)
    plt.tight_layout()
    plt.savefig('actions_0.pdf')
    plt.show()
    globals().update(locals())
def appendDfToPickle(df, filePath):
    import os
    import pandas as pd
    if not os.path.isfile(filePath):
        df.to_pickle(filePath)
    else:
        tempDF = pd.read_pickle(filePath)
        tempDF = tempDF.append(df, ignore_index=True)
        tempDF.to_pickle(filePath)
def load_dataset(key):
    """ Function to load datasets included in the chainladder package.

    Arguments:
        key: str
            The name of the dataset, e.g. RAA, ABC, UKMotor, GenIns, etc.

    Returns:
        pandas.DataFrame of the loaded dataset.
    """
    path = os.path.dirname(os.path.abspath(__file__))
    return read_pickle(os.path.join(path, 'data', key))