The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.read_pickle().
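Before the project-level examples, here is a minimal, self-contained sketch of the basic round trip; the DataFrame contents and the file name example.pkl are purely illustrative and not taken from any of the projects below.

    import os
    import pandas as pd

    # Build a small DataFrame and persist it as a pickle file (illustrative data).
    df = pd.DataFrame({'company_id': [1, 2, 3], 'overall_rating': [4.2, 3.1, 3.8]})
    df.to_pickle('example.pkl')

    # Reload it later; read_pickle restores the full DataFrame, including dtypes and index.
    restored = pd.read_pickle('example.pkl')
    assert restored.equals(df)

    os.remove('example.pkl')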
def load_pkl():
    '''
    loads a pickled DataFrame with the employers to scrape ratings for.
    INPUT:
        None
    OUTPUT:
        df: pandas DataFrame
        split: threshold of good/bad employer ratings
    '''
    df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    df['company_id'] = df['company_id'].astype(int)
    df['num_ratings'] = df['num_ratings'].astype(int)
    split = df['overall_rating'].mean()
    return df, split
def _load_o2p(self):
    if self.o2p:
        return
    path = self.flags.data_path
    p = "%s/o2p.pkl" % path
    if os.path.exists(p) == False:
        self._load_db()
        ops = self.pdDB.data['op_prior']
        ops = ops.append(self.pdDB.data['op_train'])
        o2p = ops.sort_values(['order_id', 'add_to_cart_order'])\
            .groupby('order_id')['product_id'].apply(list)
        o2p.to_pickle(p)
    else:
        o2p = pd.read_pickle(p)
    self.o2p = o2p
    print_mem_time("Loaded o2p %d" % len(o2p))
def compute_cell_smushing(self):
    """Within each plate, find a 2d embedding of all cells"""
    grouped = self.genes.groupby(self.cell_metadata[self.SAMPLE_MAPPING])

    if os.path.exists(self.cell_smushed_cache_file):
        smusheds = pd.read_pickle(self.cell_smushed_cache_file)
        # if nothing is missing, return the cached version
        if not set(grouped.groups) - set(smusheds):
            return smusheds
    else:
        smusheds = {}

    for plate_name, genes_subset in grouped:
        if plate_name not in smusheds:
            cell_smusher = TSNE(metric='cosine', random_state=0)
            cell_smushed = pd.DataFrame(
                cell_smusher.fit_transform(genes_subset),
                index=genes_subset.index)
            smusheds[plate_name] = cell_smushed

    pd.to_pickle(smusheds, self.cell_smushed_cache_file)

    return smusheds
def fit_behavioral_data():
    """Fit a model for all subjects."""
    df = pd.read_pickle('data.pkl')
    subjects = df.index.get_level_values('subject').unique()
    data = np.empty((subjects.size, 10))
    cues = (0, 1)
    for i, subject in enumerate(subjects):
        print('Fitting model for subject {}'.format(subject))
        df_s = df.loc[subject]
        for cue in cues:
            ml = ML(df_s[df_s['cue'] == cue])
            r = ml.ml_estimation()
            data[i, 2*cue:(2*cue + 2)] = r.x
            data[i, 2*cue + 4:2*cue + 6] = np.sqrt(np.diag(r.hess_inv.todense()))
            data[i, cue + 8] = r.fun
    model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
                         ['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
                          'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
                          'NLL_0', 'NLL_1'])
    return model
def fit_single_subject(subject=4):
    df = pd.read_pickle('data.pkl')
    print('Fitting model for subject {}'.format(subject))
    df_s = df.loc[subject]
    cues = (0, 1, 2)
    for cue in cues:
        ml = ML(df_s[df_s['cue'] == cue])
        r = ml.ml_estimation()
        H_inv = r.hess_inv.todense()
        print('\t cue:{:d}'.format(cue))
        print('\t\tr:\n\t\t\t{}\n'.format(r.x))
        print('\tInverse of Hessian:\n{}\n'.format(H_inv))
    globals().update(locals())
def _load_table(self, filepath):
    """
    Load table from file system.

    :param str filepath: Path to table in CSV, TSV, XLSX or Pandas pickle format.
    :return: Pandas table
    :rtype: pandas.core.frame.DataFrame
    """
    _, ext = os.path.splitext(filepath.lower())
    if ext == '.tsv':
        return pd.read_table(filepath, **self.kwargs)
    if ext == '.csv':
        return pd.read_csv(filepath, **self.kwargs)
    if ext == '.xlsx':
        return pd.read_excel(filepath, **self.kwargs)
    return pd.read_pickle(filepath, **self.kwargs)
def test_legacy_pickle(self):
    if PY3:
        raise nose.SkipTest("testing for legacy pickles not "
                            "support on py3")

    path = tm.get_data_path('multiindex_v1.pickle')
    obj = pd.read_pickle(path)

    obj2 = MultiIndex.from_tuples(obj.values)
    self.assertTrue(obj.equals(obj2))

    res = obj.get_indexer(obj)
    exp = np.arange(len(obj))
    assert_almost_equal(res, exp)

    res = obj.get_indexer(obj2[::-1])
    exp = obj.get_indexer(obj[::-1])
    exp2 = obj2.get_indexer(obj2[::-1])
    assert_almost_equal(res, exp)
    assert_almost_equal(exp, exp2)
def test_legacy_v2_unpickle(self):

    # 0.7.3 -> 0.8.0 format manage
    path = tm.get_data_path('mindex_073.pickle')
    obj = pd.read_pickle(path)

    obj2 = MultiIndex.from_tuples(obj.values)
    self.assertTrue(obj.equals(obj2))

    res = obj.get_indexer(obj)
    exp = np.arange(len(obj))
    assert_almost_equal(res, exp)

    res = obj.get_indexer(obj2[::-1])
    exp = obj.get_indexer(obj[::-1])
    exp2 = obj2.get_indexer(obj2[::-1])
    assert_almost_equal(res, exp)
    assert_almost_equal(exp, exp2)
def test_pickle_v0_14_1(self):

    # we have the name warning
    # 10482
    with tm.assert_produces_warning(UserWarning):
        cat = pd.Categorical(values=['a', 'b', 'c'],
                             categories=['a', 'b', 'c', 'd'],
                             name='foobar', ordered=False)
    pickle_path = os.path.join(tm.get_data_path(),
                               'categorical_0_14_1.pickle')
    # This code was executed once on v0.14.1 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
def test_pickle_v0_15_2(self):
    # ordered -> _ordered
    # GH 9347

    # we have the name warning
    # 10482
    with tm.assert_produces_warning(UserWarning):
        cat = pd.Categorical(values=['a', 'b', 'c'],
                             categories=['a', 'b', 'c', 'd'],
                             name='foobar', ordered=False)
    pickle_path = os.path.join(tm.get_data_path(),
                               'categorical_0_15_2.pickle')
    # This code was executed once on v0.15.2 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
def compare(self, vf, version):

    # py3 compat when reading py2 pickle
    try:
        data = pandas.read_pickle(vf)
    except (ValueError) as e:
        if 'unsupported pickle protocol:' in str(e):
            # trying to read a py3 pickle in py2
            return
        else:
            raise

    for typ, dv in data.items():
        for dt, result in dv.items():
            try:
                expected = self.data[typ][dt]
            except (KeyError):
                continue

            # use a specific comparator
            # if available
            comparator = getattr(self, "compare_{typ}_{dt}".format(
                typ=typ, dt=dt), self.compare_element)
            comparator(result, expected, typ, version)
    return data
def thunder():
    if os.path.exists('../dataset/thunder.pkl'):
        return pd.read_pickle('../dataset/thunder.pkl')

    thunder_df = pd.read_csv('../input/thunder.csv', names=[
        'datetime',  # observation timestamp
        'lat',       # latitude (decimal degrees)
        'lon',       # longitude (decimal degrees)
        'type'       # lightning type; CG: cloud-to-ground, IC: intra-cloud
    ])
    # parse the timestamp column into datetime objects
    thunder_df.datetime = pd.to_datetime(thunder_df.datetime)
    # observation_point_df.to_pickle('../dataset/observation_point.pkl')
    thunder_df = pd.concat([thunder_df, pd.get_dummies(thunder_df.type)], axis=1)
    thunder_df.to_pickle('../dataset/thunder_df.pkl')
    return thunder_df
def __init__(self, baseDir='../temp/repo'):
    '''baseDir: directory in which the pickled tables are stored'''
    self.dir = baseDir
    self.data = {}
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)
        logging.info('Created directory: %s' % self.dir)
    # load every pickled file already present in the repository directory
    for p in os.listdir(self.dir):
        if os.path.isfile(os.path.join(self.dir, p)):
            key = re.split(r'\.', p)[0]
            path = os.path.join(self.dir, p)
            t = pd.read_pickle(path)
            logging.info('Loaded %s as key %s.' % (path, key))
            self.data[key] = t
def read_models_from_dir(dir):
    models = glob.glob(dir + '/*/')
    selected_models = filter(lambda x: 'bag' not in x, models)
    print selected_models
    bagged_oobs = []
    bagged_preds = []
    for model in selected_models:
        pred_file = model + '/' + 'preds.csv'
        oob_file = model + '/' + 'oob.pkl'
        oob = pd.read_pickle(oob_file)
        preds = pd.read_csv(pred_file)
        preds['ut_ms'] = pd.to_datetime(preds['ut_ms'], unit='ms')
        preds = preds.set_index('ut_ms')
        bagged_oobs.append(oob)
        bagged_preds.append(preds)
    return bagged_oobs, bagged_preds, selected_models
def read_models_from_dir(dir):
    model_array = []
    models = glob.glob(dir + '/*/')
    selected_models = filter(lambda x: 'bag' not in x, models)
    print selected_models
    for model in selected_models:
        try:
            pred_file = model + '/' + 'preds.csv'
            oob_file = model + '/' + 'oob.pkl'
            oob = pd.read_pickle(oob_file)
            cols = [model + str(i) for i in oob.columns]
            print model, oob.shape
            preds = pd.read_csv(pred_file)
            preds['ut_ms'] = pd.to_datetime(preds['ut_ms'], unit='ms')
            preds = preds.set_index('ut_ms')
            model_array.append((Model(model, oob, preds,
                                      RMSE(target.loc[oob.index], oob))))
        except:
            print "Error! ", model
            pass
    return model_array
def load():
    global user_order, goods, pname2id, model

    user_order = pd.read_pickle('../input/mk/user_order.p')
    goods = pd.read_pickle('../input/mk/goods.p')

    pname2id = {}
    for k, v in zip(goods.product_name, goods.product_id):
        pname2id[k] = v

    model = load_instacart_vec()

    print('Activated utils.vec2pids, utils.pnames2ids')
    return
def make(T):
    """
    T = 0
    folder = 'trainT-0'
    """
    if T == -1:
        folder = 'test'
    else:
        folder = 'trainT-' + str(T)

    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))

    df = pd.merge(label[['order_id', 'product_id']],
                  tbl[['order_id', 'product_id', 'days_since_last_order_this_item']],
                  on=['order_id', 'product_id'], how='left')

    df.to_pickle('../feature/{}/f303_order-product.p'.format(folder))

#==============================================================================
# main
#==============================================================================
def concat_pred_item(T, dryrun=False):
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)

    df = utils.load_pred_item(name)

    df = pd.merge(df, pd.read_pickle('../feature/{}/f317_user-product.p'.format(name)),
                  on=['user_id', 'product_id'], how='left')
    gc.collect()

    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun == True:
        return df
    else:
        utils.to_pickles(df, '../feature/{}/all_apdx'.format(name), 20, inplace=True)
def trainModel(self):
    df = pd.read_pickle("./train_features.pkl")
    x_df = pd.concat([df.iloc[:, 4:6], df.iloc[:, 8]], axis=1)
    y_df = df.iloc[:, 9]
    print(x_df)
    print(len(x_df))
    print(len(y_df))
    train_no = int(0.8 * len(df))
    #train_no = 100000
    print(train_no)
    train_df = x_df.iloc[0:train_no, :]
    train_labels = y_df.iloc[0:train_no]
    test_df = x_df.iloc[train_no:, :]
    test_labels = y_df.iloc[train_no:]
    self.model = LogisticClassifier(3)
    self.model.trainModel(train_df, train_labels)
    self.model.validateModel(test_df, test_labels)
def get_answers_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()

    df = pd.read_pickle(data_path)
    answers = df[['multiple_choice_answer']].values.tolist()
    answer_matrix = np.zeros((len(answers), 1001))
    default_onehot = np.zeros(1001)
    default_onehot[1000] = 1.0

    for i, answer in enumerate(answers):
        answer_matrix[i] = answer_to_onehot_dict.get(answer[0].lower(), default_onehot)

    return answer_matrix
def get_questions_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()

    df = pd.read_pickle(data_path)
    questions = df[['question']].values.tolist()
    word_idx = ebd.load_idx()
    seq_list = []

    for question in questions:
        words = word_tokenize(question[0])
        seq = []
        for word in words:
            seq.append(word_idx.get(word, 0))
        seq_list.append(seq)
    question_matrix = pad_sequences(seq_list)

    return question_matrix
def get_result_by_last_three_weeks_mean():
    data = pd.read_pickle(static_params.DATA_PATH + 'user_pay_last_three_weeks.pkl')
    result = pd.DataFrame(data['iid'])
    date = '2016-11-'
    index = 1
    for index in range(1, 8):
        column = date + str(index)
        result[column] = data.loc[:, ['2016-10-' + str(index + 10),
                                      '2016-10-' + str(index + 17),
                                      '2016-10-' + str(index + 24)]].mean(1)
    data2 = result.copy()
    result = pd.merge(data2, result, on='iid')
    result.iloc[:, -4] = result.iloc[:, -4] * 1.2
    result = result.astype(int)
    result.to_csv(static_params.DATA_PATH + 'submission.csv', header=None, index=None)
def get_result_by_last_two_weeks_mean():
    # predict each day of the first week of November as the mean of the
    # same weekday over the last two weeks of October
    data = pd.read_pickle(static_params.DATA_PATH + 'user_pay_last_two_weeks.pkl')
    print data
    result = pd.DataFrame(data['iid'])
    date = '2016-11-'
    index = 1
    for index in range(1, 8):
        column = date + str(index)
        result[column] = data.loc[:, ['2016-10-' + str(index + 17),
                                      '2016-10-' + str(index + 24)]].mean(1)
    data2 = result.copy()
    result = pd.merge(data2, result, on='iid').astype(int)
    result.to_csv(static_params.DATA_PATH + 'submission.csv', header=None, index=None)
def user_view_split_by_shop():
    if not os.path.exists(static_params.DATA_USER_VIEW_BY_SHOP_PATH):
        os.mkdir(static_params.DATA_USER_VIEW_BY_SHOP_PATH)
    data = pd.read_pickle(static_params.DATA_PATH + 'user_view.pkl')
    print type(data)
    data.columns = ['uid', 'iid', 'time']
    print data
    data['iid'] = data['iid'].astype(str)
    data['time'] = data['time'].apply(get_date)
    grouped = data.groupby(['iid'], as_index=False)
    for name, group in grouped:
        f = open(static_params.DATA_USER_VIEW_BY_SHOP_PATH + str(name) + '.pkl', 'wb')
        cPickle.dump(group, f, -1)
        f.close()
def get_extra_train():
    ############################## extra features ##############################
    train_simhash_features = pd.read_csv('data/extra_feature/train_simhash_features.csv')
    train_selftrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/train_selftrained_w2v_sim_dist.pkl')
    train_selftrained_glove_sim_dist = pd.read_pickle('data/extra_feature/train_selftrained_glove_sim_dist.pkl')
    train_pretrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/train_pretrained_w2v_sim_dist.pkl')
    train_distinct_word_stats_selftrained_glove = pd.read_csv('data/extra_feature/train_distinct_word_stats_selftrained_glove.csv')
    train_distinct_word_stats_pretrained = pd.read_csv('data/extra_feature/train_distinct_word_stats_pretrained.csv')
    train_distinct_word_stats = pd.read_csv('data/extra_feature/train_distinct_word_stats.csv')

    X_train = np.hstack([train_simhash_features,
                         train_selftrained_w2v_sim_dist,
                         train_selftrained_glove_sim_dist,
                         train_pretrained_w2v_sim_dist,
                         train_distinct_word_stats_selftrained_glove,
                         train_distinct_word_stats_pretrained,
                         train_distinct_word_stats])
    print X_train.shape
    return X_train
def get_extra_test():
    ############################## extra features ##############################
    test_simhash_features = pd.read_csv('data/extra_feature/test_simhash_features.csv')
    test_selftrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/test_selftrained_w2v_sim_dist.pkl')
    test_selftrained_glove_sim_dist = pd.read_pickle('data/extra_feature/test_selftrained_glove_sim_dist.pkl')
    test_pretrained_w2v_sim_dist = pd.read_pickle('data/extra_feature/test_pretrained_w2v_sim_dist.pkl')
    test_distinct_word_stats_selftrained_glove = pd.read_csv('data/extra_feature/test_distinct_word_stats_selftrained_glove.csv')
    test_distinct_word_stats_pretrained = pd.read_csv('data/extra_feature/test_distinct_word_stats_pretrained.csv')
    test_distinct_word_stats = pd.read_csv('data/extra_feature/test_distinct_word_stats.csv')

    X_test = np.hstack([test_simhash_features,
                        test_selftrained_w2v_sim_dist,
                        test_selftrained_glove_sim_dist,
                        test_pretrained_w2v_sim_dist,
                        test_distinct_word_stats_selftrained_glove,
                        test_distinct_word_stats_pretrained,
                        test_distinct_word_stats])
    print X_test.shape
    return X_test
def get_feature_importance(feature):
    import scipy.stats as sps
    import pandas as pd
    y_train = pd.read_csv('../data/train.csv')['is_duplicate']
    return sps.spearmanr(feature, y_train)[0]

# import pickle
# pickle.dump(X_train, open("data_train.pkl", 'wb'), protocol=2)
#
# data_file=['test_deptree','test_glove_sim_dist','test_pca_glove',
#            'test_pca_pattern','test_w2w','test_pos','test_pca_char']
#
# path='../test/'
# for it in range(6):
#     tmp=[]
#     flist=[item+str(it) for item in data_file]
#     test=np.empty((400000,0))
#     if it==5:
#         test=np.empty((345796,0))
#     for f in flist:
#         test=np.hstack([test,pd.read_pickle(path+f+'.pkl')])
#     pickle.dump(test,open('data_test{0}.pkl'.format(it),'wb'),protocol=2)
def split_cli():
    p = ArgumentParser()
    p.add_argument("expanded", default="expanded.pickle",
                   help="Expanded pickle file targets.")
    p.add_argument("stripped", default="test.pickle",
                   help="stripped data filename")
    p.add_argument("train", default="train.pickle",
                   help="training filename")
    p.add_argument("test", default="test.pickle",
                   help="test filename")
    p.add_argument("attrfile", default="attrs.txt",
                   help="attrs to care about for NA purposes")
    p.add_argument("--na-strategy", default="drop",
                   help="what to do with NA rows (default is drop them)")
    p.add_argument("--trainpct", default=70, type=int,
                   help="percentage of data to put into training set")
    p.add_argument("--random", action='store_true',
                   help="split train/test sets randomly (default is by time)")
    cfg = p.parse_args()
    strip_and_process_to_files(expanded_file=pd.read_pickle(cfg.expanded),
                               stripped_file=cfg.stripped,
                               attrfile=cfg.attrfile,
                               na_strategy=cfg.na_strategy)
    split_to_files(trainfile=cfg.train, testfile=cfg.test,
                   stripped=cfg.stripped, trainpct=cfg.trainpct,
                   split_randomly=cfg.random)
def load_nf_histplayerinfo(sport, identifiers_to_load):
    """
    Load previously saved dataframes of numberfire prediction data.
    :param str sport: which sport!
    :param list[str] identifiers_to_load: id of players to load
    :return dict[str, DataFrame]: dict of player -> prediction data for player
    """
    loaded = 0
    histplayerinfo_dict = {}
    for identifier in identifiers_to_load:
        target_file = get_histplayerinfo_filename(sport, identifier)
        if os.path.exists(target_file):
            histplayerinfo_dict[identifier] = pandas.read_pickle(target_file)
            # Attempt to convert the index to time based if possible
            if histplayerinfo_dict[identifier] is not None and 'date' in histplayerinfo_dict[identifier].columns:
                histplayerinfo_dict[identifier].set_index('date', inplace=True)
            loaded += 1
    return histplayerinfo_dict
def load_nf_salaryinfo(sport, players):
    """
    Load previously saved dataframes of numberfire salary data
    :param list[str] players: players to load
    :return dict[str, DataFrame]: dict of player -> salary data for player
    """
    loaded = 0
    player_dict = {}
    for player in players:
        target_file = get_salary_filename(sport, player)
        if os.path.exists(target_file):
            player_dict[player] = pandas.read_pickle(target_file)
            # Attempt to convert the index to time based if possible
            if player_dict[player] is not None and 'date' in player_dict[player].columns:
                player_dict[player].set_index('date', inplace=True)
            loaded += 1
    return player_dict
def combine_dataframe_into_pickle_file(dataframe, outfile, overwrite=False):
    """
    Save the provided pandas dataframe as a pickle to the provided file path.
    If a file is already present at that location, unpickle it, combine the
    dataframes, and save the result as a pickle (overwriting the file but
    keeping the data).
    Uses combine_first, prioritizing new data but keeping data from before.
    Obviously this will blow up catastrophically if there is a file at outfile
    which is not a DataFrame, and the data will get super gross if it *is* a
    DataFrame but the indices do not match.
    :param pandas.DataFrame dataframe: input dataframe
    :param str outfile: output file
    :return None:
    """
    if os.path.exists(outfile) and not overwrite:
        target_df = pandas.read_pickle(outfile)
        merged_df = dataframe.combine_first(target_df)
        merged_df.to_pickle(outfile)
    else:
        dataframe.to_pickle(outfile)
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    # data = pd.read_csv(file_in, dtype='str')
    # data['DateTime'] = pd.to_datetime(
    #     data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
    #     format='%Y%m%d%H%M%S')
    # data = data.set_index('DateTime')
    # data = pd.Series(data['<CLOSE>']).map(float)
    # data = data.resample('M').fillna(method='pad')
    # data = preprocessing.minmax_scale(data)
    # data_t = data[6:]
    # data_f = data.reshape(-1, 6)
    # data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
    # np.save(file_out[0], data_f[:len(data_f) - 1])
    # np.save(file_out[1], data_t)
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    data = data.reshape(-1, 24)
    data_m = np.array([[data[i + x][0] for x in range(5)]
                       for i in range(len(data) - 5 + 1)])
    data_m = data_m.reshape(-1, 5)
    data_s = np.array([data[i + 5][0] for i in range(len(data) - 5)])
    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
def get_fs_t_5(file_in, file_out, i):
    data = pd.read_pickle(file_in)['close']
    data = data.reshape(-1, 24)
    data = np.float32([[data[i + x][-1] for x in range(5 * i) if x % i == 0]
                       for i in range(len(data) - 5 * i + 1)])
    data = data.reshape(-1, 5)
    data_t = {
        'change': np.float32(
            [(data[i + i][-1] - data[i + i][0]) / data[i + i][0] * 100
             for i in range(data.shape[0] - i)]),
        'target_open': np.float32([data[i + i][0]
                                   for i in range(data.shape[0] - i)]),
        'real_target': np.float32([data[i + i][-1]
                                   for i in range(data.shape[0] - i)])
    }
    data_t = pd.DataFrame(data_t)
    np.save(file_out[0], data[:len(data) - i])
    data_t.to_pickle(file_out[1])
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    # data = pd.read_csv(file_in, dtype='str')
    # data['DateTime'] = pd.to_datetime(
    #     data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
    #     format='%Y%m%d%H%M%S')
    # data = data.set_index('DateTime')
    # data = pd.Series(data['<CLOSE>']).map(float)
    # data = data.resample('M').fillna(method='pad')
    # data = preprocessing.minmax_scale(data)
    # data_t = data[6:]
    # data_f = data.reshape(-1, 6)
    # data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
    # np.save(file_out[0], data_f[:len(data_f) - 1])
    # np.save(file_out[1], data_t)
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    data_m = np.array([[data[i + x * 24 * 24] for x in range(6)]
                       for i in range(len(data) - 6 * 24 * 24 + 1)])
    data_m = data_m.reshape(-1, 6)
    data_s = np.array([data[i + 6 * 24 * 24]
                       for i in range(len(data) - 6 * 24 * 24)])
    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
def _build(self, flags, files):
    path = flags.input_path
    Table = namedtuple('Table', 'name fname dtype')
    fnames = "adult.data,adult.test".split(',')
    names = "train,test".split(',')
    TABLES = [Table(i, "%s/%s" % (path, j), None)
              for i, j in zip(names, fnames) if files == "all" or i in files]
    print()
    self.flags = flags
    path = flags.data_path
    data = {}
    columns = [
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital_status", "occupation", "relationship", "race", "gender",
        "capital_gain", "capital_loss", "hours_per_week", "native_country",
        "income_bracket"
    ]
    for table in TABLES:
        name = table.name
        fname = table.fname
        dtype = table.dtype
        pname = "%s/%s.pkl" % (path, name.split('/')[-1].split('.')[0])
        if os.path.exists(pname):
            data[name] = pd.read_pickle(pname)
        else:
            if name == 'train':
                data[name] = pd.read_csv(fname, dtype=dtype, header=None,
                                         skipinitialspace=True, names=columns)
            if name == 'test':
                data[name] = pd.read_csv(fname, dtype=dtype, header=None,
                                         skipinitialspace=True, skiprows=1,
                                         names=columns)
            data[name]['target'] = data[name]["income_bracket"].apply(
                lambda x: ">50K" in x).astype(int)
            data[name].drop('income_bracket', axis=1, inplace=True)
            data[name].to_pickle(pname)
        print_mem_time("Loaded {} {}".format(fname.split('/')[-1], data[name].shape))
    self.data = data  # no copy, pass the reference
    print()
def read_data(name):
    train_pk = name.replace('.csv', '.pkl')
    if os.path.exists(train_pk) == False:
        train = pd.read_csv(name)
        if "va" not in name and "test" not in name:
            train.to_pickle(train_pk)
    else:
        train = pd.read_pickle(train_pk)
    return train
def _load_u2o(self):
    if self.u2o:
        return
    path = self.flags.data_path
    p = "%s/u2o.pkl" % path
    if os.path.exists(p) == False:
        self._load_db()
        u2o = self.pdDB.data['orders'].groupby('user_id')['order_id'].apply(list)
        u2o.to_pickle(p)
    else:
        u2o = pd.read_pickle(p)
    self.u2o = u2o
    print_mem_time("Loaded u2o %d" % len(u2o))
def _build(self, flags, files):
    fnames, names = self.fnames, self.names
    path = self.path
    Table = namedtuple('Table', 'name fname dtype')
    tables = [Table(i, "%s/%s" % (path, j), {})
              for i, j in zip(names, fnames) if files == "all" or i in files]
    print()
    self.flags = flags
    path = flags.data_path
    data = {}
    for table in tables:
        name, fname, dtype = table.name, table.fname, table.dtype
        pname = "%s/%s_%s.pkl" % (path, self.name, name.split('/')[-1].split('.')[0])
        if os.path.exists(pname):
            data[name] = pd.read_pickle(pname)
        else:
            if '_text' in name:
                data[name] = pd.read_csv(fname, header=None, sep="\|\|",
                                         skiprows=1, names=['ID', 'Text'])
            else:
                data[name] = pd.read_csv(fname)
            data[name].to_pickle(pname)
        print_mem_time("Loaded {} {}".format(fname.split('/')[-1], data[name].shape))
    self.data = data  # no copy, pass the reference
    if "training_variants" in self.data:
        y = self.data["training_variants"]['Class'] - 1
        from utils.np_utils.encoder import onehot_encode
        self.y = onehot_encode(y, self.flags.classes)
    print()
def combine_data(paths):
    '''
    Function to combine dataframes from pickled form
    INPUT:
        paths: Iterable of filepaths for pickled DataFrames
    OUTPUT:
        ratings_df: Single pandas DataFrame with all ratings
    '''
    ratings_df = pd.read_pickle(paths[0])
    for path in paths[1:]:
        ratings_df = ratings_df.append(pd.read_pickle(path))
    return ratings_df
def check_review_counts(ratings_df):
    '''
    Function to check that enough data was collected. Compares number of
    reviews for each target employer with the number of reviews collected
    INPUT:
        ratings_df: Pandas DataFrame containing scraped review text
    OUTPUT:
        good_er_ids, bad_er_ids: Lists of tuples to rescrape from glassdoor
    '''
    clean_df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    target_ratings = clean_df[['company_name', 'company_id',
                               'num_ratings', 'overall_rating']]
    company_ratings = ratings_df['company_name'].value_counts()
    company_ratings = company_ratings.to_frame(name='ratings_collected')
    company_ratings.reset_index(inplace=True)
    check_df = target_ratings.merge(company_ratings, how='left',
                                    left_on='company_name', right_on='index')
    check_df['company_id'] = check_df['company_id'].astype(int)
    check_df.drop('index', axis=1, inplace=True)
    check_df['delta'] = check_df['num_ratings'] - check_df['ratings_collected']
    check_df['delta_pct'] = check_df['delta'] / check_df['num_ratings']
    rescrape = check_df[check_df['delta_pct'] > 0.5]
    good_rescrape = rescrape[rescrape['overall_rating'] > 3.5]
    bad_rescrape = rescrape[rescrape['overall_rating'] < 3.5]
    good_er_ids = zip(good_rescrape['company_name'], good_rescrape['company_id'])
    bad_er_ids = zip(bad_rescrape['company_name'], bad_rescrape['company_id'])
    pickle.dump(good_er_ids, open(os.path.join('data', 'rescrape_pros.pkl'), 'wb'))
    pickle.dump(bad_er_ids, open(os.path.join('data', 'rescrape_cons.pkl'), 'wb'))
    return good_er_ids, bad_er_ids
def plot(result_dict_file, is_show, plot_save_file):
    """
    Draw result DataFrame
    """
    import pandas as pd
    from rqalpha.plot import plot_result

    result_dict = pd.read_pickle(result_dict_file)
    if is_show:
        plot_result(result_dict)
    if plot_save_file:
        plot_result(result_dict, show_windows=False, savefile=plot_save_file)
def report(result_pickle_file_path, target_report_csv_path):
    """
    Generate report from backtest output file
    """
    import pandas as pd
    result_dict = pd.read_pickle(result_pickle_file_path)

    from rqalpha.utils.report import generate_report
    generate_report(result_dict, target_report_csv_path)
def plot(result_dict_file, show, plot_save_file):
    """
    [sys_analyser] draw result DataFrame
    """
    import pandas as pd
    from .plot import plot_result

    result_dict = pd.read_pickle(result_dict_file)
    plot_result(result_dict, show, plot_save_file)
def report(result_pickle_file_path, target_report_csv_path):
    """
    [sys_analyser] Generate report from backtest output file
    """
    import pandas as pd
    result_dict = pd.read_pickle(result_pickle_file_path)

    from .report import generate_report
    generate_report(result_dict, target_report_csv_path)
def display_proposals():
    '''print out a list of the proposal names which were generated and stored
    in the dill folder by the build_program_files script

    no inputs
    '''
    print('proposal list:')
    print(list(pd.read_pickle('dill/proposal_names.pkl').proposals))
def plot_actions(cue=0):
    mpl.rcParams['axes.labelsize'] = 'large'
    d_map = {3: 1, 8: 2, 14: 3, 23: 4}
    df = pd.read_pickle('data.pkl').reset_index()
    df = df.loc[df['cue'] == cue]
    g = sns.FacetGrid(df, col='subject', col_wrap=6, size=1.5,
                      ylim=(0, 5), aspect=1.5)
    g.map(plt.plot, 'action')
    g.set(xticks=[], yticks=[0, 1, 2, 3],
          yticklabels=['3', '8', '14', '23'])
    g.set(ylim=(-0.5, 4))
    g.set_ylabels('choice')
    g.fig.tight_layout()
    g.fig.subplots_adjust(top=0.93)

    subjects = df['subject'].unique()
    for ax, subject in zip(g.axes, subjects):
        df_subject = df.loc[df['subject'] == subject]
        df_subject.reset_index(inplace=True)
        df_wins = df_subject.loc[df_subject['reward'] > 0]
        df_lose = df_subject.loc[df_subject['reward'] < 0]
        pos_win = df_wins.loc[df_wins['subject'] == subject].index
        pos_lose = df_lose.loc[df_lose['subject'] == subject].index
        ax.eventplot(pos_win, lineoffsets=3.5, linelength=0.75,
                     linewidths=0.4)
        ax.eventplot(pos_lose, lineoffsets=3.5, linelength=0.75,
                     color='r', linewidths=0.4)
    plt.tight_layout()
    plt.savefig('actions_0.pdf')
    plt.show()
    globals().update(locals())
def appendDfToPickle(df, filePath):
    import os
    import pandas as pd
    if not os.path.isfile(filePath):
        df.to_pickle(filePath)
    else:
        tempDF = pd.read_pickle(filePath)
        tempDF = tempDF.append(df, ignore_index=True)
        tempDF.to_pickle(filePath)
def load_dataset(key):
    """ Function to load datasets included in the chainladder package.

    Arguments:
        key: str
            The name of the dataset, e.g. RAA, ABC, UKMotor, GenIns, etc.

    Returns:
        pandas.DataFrame of the loaded dataset.
    """
    path = os.path.dirname(os.path.abspath(__file__))
    return read_pickle(os.path.join(path, 'data', key))