We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.read_hdf().
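Before the project examples, here is a minimal, self-contained sketch of the basic round trip. The file name data.h5 and the key 'df' are illustrative only, not taken from any of the examples below, and pandas needs the PyTables package installed for HDF5 support.

import pandas as pd

# Write a small frame to HDF5 under an explicit key, then read it back.
# 'data.h5' and 'df' are hypothetical names for this sketch.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
df.to_hdf('data.h5', key='df', mode='w')

# The key can be omitted when the file contains a single object.
roundtripped = pd.read_hdf('data.h5', 'df')
print(roundtripped.equals(df))  # True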
def _load_table(self, src, fmt, components=None, *args, **kwargs):
    """ Load a data frame from table formats: csv, hdf5, feather """
    if fmt == 'csv':
        _data = pd.read_csv(src, *args, **kwargs)
    elif fmt == 'feather':
        _data = feather.read_dataframe(src, *args, **kwargs)  # pylint: disable=redefined-variable-type
    elif fmt == 'hdf5':
        _data = pd.read_hdf(src, *args, **kwargs)  # pylint: disable=redefined-variable-type

    # Put into this batch only part of it (defined by index)
    if isinstance(_data, pd.DataFrame):
        _data = _data.loc[self.indices]
    elif isinstance(_data, dd.DataFrame):
        # dask.DataFrame.loc supports advanced indexing only with lists
        _data = _data.loc[list(self.indices)].compute()

    components = tuple(components or self.components)
    for i, comp in enumerate(components):
        setattr(self, comp, _data.iloc[:, i].values)
def load_data(dataset):
    """Load data from a given dataset

    Parameters
    ----------
    dataset : str
        Searches for dataset.h5 in this file's directory

    Returns
    -------
    DataFrame
        Hourly temperature data
    """
    p = Path(os.path.dirname(os.path.realpath(__file__))) / 'data'
    fname = p / f'{dataset}.h5'
    try:
        return pd.read_hdf(str(fname))
    except FileNotFoundError:
        sources = {f.stem for f in p.iterdir()
                   if f.is_file() and f.name.endswith('h5')}
        raise RuntimeError(f"Could not find {dataset!r}. Existing "
                           f"datasets are {sources}")
def get_data_opt10081(self, code, date='20161231'):
    try:
        data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
        start = str(data.index[-2])
    except (FileNotFoundError, IndexError) as e:
        start = "20010101"
    print("get 81 data from %s" % start)
    self.kiwoom.start_date = datetime.strptime(start, "%Y%m%d")
    self.kiwoom.data_opt10081 = [] * 15
    self.kiwoom.set_input_value("종목코드", code)
    self.kiwoom.set_input_value("기준일자", date)
    self.kiwoom.set_input_value("수정주가구분", 255)
    self.kiwoom.comm_rq_data("주식일봉차트조회요청", "opt10081", 0, "0101")
    while self.kiwoom.inquiry == '2':
        time.sleep(TR_REQ_TIME_INTERVAL)
        self.kiwoom.set_input_value("종목코드", code)
        self.kiwoom.set_input_value("기준일자", date)
        self.kiwoom.set_input_value("수정주가구분", 255)
        self.kiwoom.comm_rq_data("주식일봉차트조회요청", "opt10081", 2, "0101")
    self.kiwoom.data_opt10081.index = self.kiwoom.data_opt10081.loc[:, '일자']
    return self.kiwoom.data_opt10081.loc[:, ['현재가', '거래량', '거래대금', '시가', '고가', '저가']]
def get_data_opt10086(self, code, date):
    try:
        data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
        start = str(data.index[-2])
    except (FileNotFoundError, IndexError) as e:
        start = "20010101"
    print("get 86 data from %s" % start)
    self.kiwoom.start_date = datetime.strptime(start, "%Y%m%d")
    self.kiwoom.data_opt10086 = [] * 23
    self.kiwoom.set_input_value("종목코드", code)
    self.kiwoom.set_input_value("조회일자", date)
    self.kiwoom.set_input_value("표시구분", 1)
    self.kiwoom.comm_rq_data("일별주가요청", "opt10086", 0, "0101")
    while self.kiwoom.inquiry == '2':
        time.sleep(TR_REQ_TIME_INTERVAL)
        self.kiwoom.set_input_value("종목코드", code)
        self.kiwoom.set_input_value("조회일자", date)
        self.kiwoom.set_input_value("표시구분", 1)
        self.kiwoom.comm_rq_data("일별주가요청", "opt10086", 2, "0101")
    self.kiwoom.data_opt10086.index = self.kiwoom.data_opt10086.loc[:, '일자']
    return self.kiwoom.data_opt10086
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates
        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset- runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
def test_append_hierarchical(self):
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['foo', 'bar'])
    df = DataFrame(np.random.randn(10, 3), index=index,
                   columns=['A', 'B', 'C'])

    with ensure_clean_store(self.path) as store:
        store.append('mi', df)
        result = store.select('mi')
        tm.assert_frame_equal(result, df)

        # GH 3748
        result = store.select('mi', columns=['A', 'B'])
        expected = df.reindex(columns=['A', 'B'])
        tm.assert_frame_equal(result, expected)

    with ensure_clean_path('test.hdf') as path:
        df.to_hdf(path, 'df', format='table')
        result = read_hdf(path, 'df', columns=['A', 'B'])
        expected = df.reindex(columns=['A', 'B'])
        tm.assert_frame_equal(result, expected)
def test_duplicate_column_name(self):
    df = DataFrame(columns=["a", "a"], data=[[0, 0]])

    with ensure_clean_path(self.path) as path:
        self.assertRaises(ValueError, df.to_hdf, path, 'df', format='fixed')

        df.to_hdf(path, 'df', format='table')
        other = read_hdf(path, 'df')

        tm.assert_frame_equal(df, other)
        self.assertTrue(df.equals(other))
        self.assertTrue(other.equals(df))
def test_colums_multiindex_modified(self):
    # BUG: 7212
    # read_hdf store.select modified the passed columns parameters
    # when multi-indexed.

    df = DataFrame(np.random.rand(4, 5),
                   index=list('abcd'),
                   columns=list('ABCDE'))
    df.index.name = 'letters'
    df = df.set_index(keys='E', append=True)

    data_columns = df.index.names + df.columns.tolist()
    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df',
                  mode='a',
                  append=True,
                  data_columns=data_columns,
                  index=False)
        cols2load = list('BCD')
        cols2load_original = list(cols2load)
        df_loaded = read_hdf(path, 'df', columns=cols2load)  # noqa
        self.assertTrue(cols2load_original == cols2load)
def test_read_hdf_open_store(self):
    # GH10330
    # No check for non-string path_or-buf, and no test of open store

    df = DataFrame(np.random.rand(4, 5),
                   index=list('abcd'),
                   columns=list('ABCDE'))
    df.index.name = 'letters'
    df = df.set_index(keys='E', append=True)

    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df', mode='w')
        direct = read_hdf(path, 'df')
        store = HDFStore(path, mode='r')
        indirect = read_hdf(store, 'df')
        tm.assert_frame_equal(direct, indirect)
        self.assertTrue(store.is_open)
        store.close()
def test_complex_fixed(self):
    df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
                   index=list('abcd'),
                   columns=list('ABCDE'))

    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df')
        reread = read_hdf(path, 'df')
        assert_frame_equal(df, reread)

    df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
                   index=list('abcd'),
                   columns=list('ABCDE'))
    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df')
        reread = read_hdf(path, 'df')
        assert_frame_equal(df, reread)
def test_complex_table(self):
    df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
                   index=list('abcd'),
                   columns=list('ABCDE'))

    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df', format='table')
        reread = read_hdf(path, 'df')
        assert_frame_equal(df, reread)

    df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
                   index=list('abcd'),
                   columns=list('ABCDE'))

    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df', format='table', mode='w')
        reread = read_hdf(path, 'df')
        assert_frame_equal(df, reread)
def updateResults(filename, initial, timeScore):
    """
    Filename (without extension). Open filename.hdf and store the results,
    then write the results to an HTML file.
    """
    hdfFile = filename + '.hdf'
    if not os.path.exists(hdfFile):
        copyfile('./data/default.hdf', hdfFile)
    data = pd.read_hdf(hdfFile).reset_index()
    # add new entry to data frame
    data.loc[len(data)] = [initial, timeScore]
    # rank best unique user scores
    sortData = data.groupby(['INITIALS']).min().head(10)
    # save new data
    sortData.to_hdf(hdfFile, 'test', mode='w')
    sortData = sortData.sort_values(by='TIME', ascending=True).reset_index()
    htmlTable(sortData, filename + '.html')
def read_test_train(train_size):
    print("Load train.csv")
    train = pd.read_hdf("../modified_data/train_original.csv.hdf", 'table')
    null_count = train.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = train.isnull().any(axis=0)
        print(cols[cols == True])
        rows = train.isnull().any(axis=1)
        print(rows[rows == True])
        print('NANs in train, please check it!')
        exit()
    split = round((1 - train_size) * len(train.index))
    train = train[split:]

    print("Load test.csv")
    test = pd.read_hdf("../modified_data/test.hdf", 'table')
    null_count = test.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = test.isnull().any(axis=0)
        print(cols[cols == True])
        print('NANs in test, please check it!')
        exit()

    features = get_features(train, test)
    return train, test, features
def preprocess_day(a, b):
    a = pd.read_csv('data/restaurants_train_data.tsv', delimiter='\t')
    b = pd.read_csv('data/restaurants_test_data.tsv', delimiter='\t')
    print(a['text'][10])
    a['text'] = a['text'].apply(clean)
    b['text'] = b['text'].apply(clean)

    # save pre-processed data as HDF5 files
    a.to_hdf('data/restaurants_train_data_processed.h5', 'table')
    b.to_hdf('data/restaurants_test_data_processed.h5', 'table')

    # load pre-processed HDF5 data
    a = pd.read_hdf('data/restaurants_train_data_processed.h5', 'table')
    a['text'] = a['text'].apply(ast.literal_eval)
    b = pd.read_hdf('data/restaurants_test_data_processed.h5', 'table')
    b['text'] = b['text'].apply(ast.literal_eval)
    print(a['text'][10])
def LoadParseData(filename):
    data_name = filename.split('_')[0]
    pd_data = pd.read_hdf(CODE_FOLDER + "data/" + filename)
    cols_features = pd_data.drop(['ID', 'target'], 1).columns.tolist()

    pd_train = pd_data[pd_data.target >= 0]
    pd_test = pd_data[pd_data.target == -1]

    Y = pd_train['target'].values.astype(int)
    test_idx = pd_test['ID'].values.astype(int)

    X = np.array(pd_train.drop(['ID', 'target'], 1))
    X_test = np.array(pd_test.drop(['ID', 'target'], 1))

    return X, Y, X_test, test_idx, pd_data, data_name, cols_features
def update_data_using_real_data(self):
    # This test is slow, and so is excluded from the standard test suite.
    current_data = pd.read_hdf(self.TEST_CURRENT_DATA_FILE)
    new_data = pd.read_hdf(self.TEST_NEW_DATA_FILE)
    expected_updated_data = pd.read_hdf(self.TEST_UPDATED_DATA_FILE)

    resultant_updated_data = DataStorer.update_data(
        current_data.copy(), new_data.copy())
    self.assertEqual(
        expected_updated_data.shape, resultant_updated_data.shape)
    self.assertTrue(
        expected_updated_data.equals(resultant_updated_data))

    repeatedly_updated_data = DataStorer.update_data(
        resultant_updated_data.copy(), new_data.copy())
    self.assertEqual(
        expected_updated_data.shape, repeatedly_updated_data.shape)
    self.assertTrue(
        expected_updated_data.equals(repeatedly_updated_data))
def __init__(self, table=None, filename=''):
    """
    table: the pandas DataFrame that records rankable objects' competition records
    filename: the hdf5 filename that stores the DataFrame.
              The DataFrame must be indexed by 'item_pair_rate'.
    """
    if table is None:
        table = pd.read_hdf(filename, "item_pair_rate")
    table = table[['primary', 'secondary', 'rate1', 'rate2', 'weight']]
    self.table = table
    # itemid to index table
    idx = self._extract_list(self.table)
    self.itemlist = idx
    temptable = table.iloc[:, :2].values
    pair = np.fromfunction(np.vectorize(lambda i, j: idx[temptable[i, j]]),
                           temptable.shape)
    pair = np.require(pair, dtype=np.int32)
    self.pair = pair
def delete_conversion_data():
    train_data = pd.read_hdf(FilePath + 'train_0613_nodelconvert')
    print 'read finish'
    advertiser_conversion_list = find_delete_advertiser()
    print len(advertiser_conversion_list)
    for item in advertiser_conversion_list:
        t = threading.Thread(target=get_index_to_delete, args=(train_data, item))
        t.start()
    while len(result_list) < len(advertiser_conversion_list):
        pass
    train_data.drop(delete_list, axis=0, inplace=True)
    train_data = train_data.reset_index()
    del train_data['index']
    print 'train write begin'
    train_data.to_hdf(FilePath + 'train_0613', 'all')
    delete_list = Series(delete_list)
    delete_list.to_csv(FilePath + 'delete_negsample_index_oftrain0613.csv', mode='a', index=False)
def time_delta_fentong():
    train_data = pd.read_hdf('../../gen/train_0626')
    test_data = pd.read_hdf('../../gen/test_0626')
    print 'read finish'
    train_data['time_delta_user_creative_next_fentong'] = train_data['time_delta_user_creative_next'].map(time_delta_map)
    test_data['time_delta_user_creative_next_fentong'] = test_data['time_delta_user_creative_next'].map(time_delta_map)
    train_data['time_delta_user_creative_fentong'] = train_data['time_delta_user_creative'].map(time_delta_map)
    test_data['time_delta_user_creative_fentong'] = test_data['time_delta_user_creative'].map(time_delta_map)
    train_data['time_delta_user_app_next_fentong'] = train_data['time_delta_user_app_next'].map(time_delta_map)
    test_data['time_delta_user_app_next_fentong'] = test_data['time_delta_user_app_next'].map(time_delta_map)
    train_data['time_delta_user_app_fentong'] = train_data['time_delta_user_app'].map(time_delta_map)
    test_data['time_delta_user_app_fentong'] = test_data['time_delta_user_app'].map(time_delta_map)
    train_data['time_delta_user_next_fentong'] = train_data['time_delta_user_next'].map(time_delta_map)
    test_data['time_delta_user_next_fentong'] = test_data['time_delta_user_next'].map(time_delta_map)
    train_data['time_delta_user_fentong'] = train_data['time_delta_user'].map(time_delta_map)
    test_data['time_delta_user_fentong'] = test_data['time_delta_user'].map(time_delta_map)
    print test_data
    train_data.to_hdf('../../gen/train_0626_delta_fentong', 'all')
    test_data.to_hdf('../../gen/test_0626_delta_fentong', 'all')
def __iter__(self, gen_type='train', batch_size=None, shuffle_block=False,
             random_sample=False, split_fields=False, on_disk=True,
             squeeze_output=False, **kwargs):
    gen_type = gen_type.lower()

    if on_disk:
        print('on disk...')
        for hdf_X, hdf_y in self._files_iter_(gen_type=gen_type, shuffle_block=shuffle_block):
            # num_of_lines = pd.HDFStore(hdf_y, mode='r').get_storer('fixed').shape[0]
            X_all = pd.read_hdf(hdf_X, mode='r').as_matrix()
            y_all = pd.read_hdf(hdf_y, mode='r').as_matrix()
            gen = self.generator(X_all, y_all, batch_size, shuffle=random_sample)
            for X, y in gen:
                if split_fields:
                    X = np.split(X, self.max_length, axis=1)
                    for i in range(self.max_length):
                        X[i] -= self.feat_min[i]
                if squeeze_output:
                    y = y.squeeze()
                yield X, y
    else:
        print('not implemented')
def bin_count(hdf_data_dir, file_prefix, num_of_parts):
    """
    count positive/negative samples
    :param hdf_data_dir:
    :param file_prefix: see this param in feature_to_hdf()
    :param num_of_parts:
    :return: size of a dataset, positive samples, negative samples, positive ratio
    """
    size = 0
    num_of_pos = 0
    num_of_neg = 0
    for part in range(num_of_parts):
        _y = pd.read_hdf(os.path.join(hdf_data_dir, file_prefix + '_output_part_' + str(part) + '.h5'),
                         mode='r')
        part_pos_num = _y.loc[_y.iloc[:, 0] == 1].shape[0]
        part_neg_num = _y.shape[0] - part_pos_num
        size += _y.shape[0]
        num_of_pos += part_pos_num
        num_of_neg += part_neg_num
    pos_ratio = 1.0 * num_of_pos / (num_of_pos + num_of_neg)
    return size, num_of_pos, num_of_neg, pos_ratio
def try_pandas(data_path: str) -> bool:
    """Guesses if a file is a pandas file.

    Parameters
    ----------
    data_path
        Path to file.

    Returns
    -------
    bool
        True if the file is pandas.
    """
    try:
        pandas.read_hdf(data_path)
    except ValueError:
        return False

    return True
def _reader(self):
    if not self.does_exist():
        return
    return pd.read_hdf(self.data_file, 'data')
def retrieveCommonDatesHDF(support_data_filename, key_list, in_date_list):
    '''
    Get a list of all dates that have data available

    @param support_data_filename: Filename of support data
    @param key_list: List of station keys to check
    @param in_date_list: Input date list to check

    @return Dictionary of dates with data
    '''
    valid_dates = OrderedDict()
    support_full_path = resource_filename('skdaccess', os.path.join('support', support_data_filename))
    for key in key_list:
        try:
            available_dates = pd.read_hdf(support_full_path, key)
        except KeyError:
            print('Unknown station:', key)

        common_dates = list(set(in_date_list).intersection(set(available_dates)))
        common_dates.sort()
        valid_dates[key] = common_dates

    return valid_dates
def train(self):
    """"""
    start = time.time()

    print('size before truncated outliers is %d ' % len(self.TrainData))
    TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) &
                               (self.TrainData['logerror'] < self._up)]
    print('size after truncated outliers is %d ' % len(self.TrainData))

    TrainData['longitude'] -= -118600000
    TrainData['latitude'] -= 34220000

    #extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')
    #self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis=1)], axis=1)

    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)

    lr = LassoLars(alpha=self._lr_alpha, max_iter=self._lr_iter, verbose=True)
    self._model = lr.fit(X, Y)
    end = time.time()

    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    #with open(self._f_eval_train_model, 'wb') as o_file:
    #    pickle.dump(self._model, o_file, -1)
    #o_file.close()

    #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
    #                           ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def evaluate(self):
    """"""
    ## not truncate outliers
    pred_valid = pd.DataFrame(index=self.ValidData.index)
    pred_valid['parcelid'] = self.ValidData['parcelid']

    truth_valid = pd.DataFrame(index=self.ValidData.index)
    truth_valid['parcelid'] = self.ValidData['parcelid']

    start = time.time()

    for d in self._l_valid_predict_columns:
        l_valid_columns = ['%s%s' % (c, d) if (c in ['lastgap', 'monthyear', 'buildingage']) else c
                           for c in self._l_train_columns]
        extra_va = pd.read_hdf(path_or_buf='%s/p21/eval_valid_%s.hdf' % (self.InputDir, d), key='valid')
        #ValidData = self.ValidData.join(extra_va, on='parcelid', how='left')
        ValidData = pd.concat([self.ValidData, extra_va.drop('parcelid', axis=1)], axis=1)
        x_valid = ValidData[l_valid_columns]
        x_valid = x_valid.values.astype(np.float32, copy=False)
        pred_valid[d] = self._model.predict(x_valid)  # * 0.99 + 0.011 * 0.01
        df_tmp = ValidData[ValidData['transactiondate'].dt.month == int(d[-2:])]
        truth_valid.loc[df_tmp.index, d] = df_tmp['logerror']

    score = 0.0
    ae = np.abs(pred_valid - truth_valid)
    for col in ae.columns:
        score += np.sum(ae[col])
    score /= len(pred_valid)  ##!! divided by number of instances, not the number of 'cells'
    print('============================= ')
    print('Local MAE is %.6f' % score)
    print('=============================')

    end = time.time()

    del self.ValidData
    gc.collect()

    print('time elapsed %ds' % (end - start))
def train(self):
    """"""
    start = time.time()

    extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')

    print('size before truncated outliers is %d ' % len(self.TrainData))
    self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) &
                                    (self.TrainData['logerror'] < self._up)]
    #self.TrainData = self.TrainData.join(extra_tr, on='parcelid', how='left')
    self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis=1)], axis=1)
    print('size after truncated outliers is %d ' % len(self.TrainData))

    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)

    lr = Lasso(alpha=self._lr_alpha, max_iter=self._lr_iter, tol=1e-4, random_state=2017,
               selection=self._lr_sel)
    self._model = lr.fit(X, Y)
    end = time.time()

    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    o_file.close()

    #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
    #                           ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def LoadFromHdfFile(InputDir, mode='train'):
    if (mode == 'train'):
        data = pd.read_hdf(path_or_buf='%s/train.hdf' % InputDir, key='train')
    elif (mode == 'valid'):
        data = pd.read_hdf(path_or_buf='%s/valid.hdf' % InputDir, key='valid')
    else:
        data = pd.read_hdf(path_or_buf='%s/test.hdf' % InputDir, key='test')

    return data

## class method, load data with pkl format
def recover_matrix(config, directory='.'):
    """Recover a matrix by either its config or uuid.

    Parameters
    ----------
    config: str or dict
        config metadata for the matrix or uuid
    directory: str
        path to search for the matrix

    Returns
    -------
    df_matrix: DataFrame
        DataFrame of specified matrix
    None:
        If no matrix is found
    """
    if isinstance(config, dict):
        uuid = generate_uuid(config)
    else:
        uuid = config

    fname = directory + '/' + uuid

    if os.path.isfile(fname + '.h5'):
        df_matrix = pd.read_hdf(fname + '.h5')
        return df_matrix
    elif os.path.isfile(fname + '.csv'):
        df_matrix = pd.read_csv(fname + '.csv')
        return df_matrix
    else:
        return None
def get_matrix_and_metadata(matrix_path, metadata_path):
    """Retrieve a matrix in hdf format and metadata about the matrix in yaml format

    Returns: (tuple) matrix, metadata
    """
    matrix = pandas.read_hdf(matrix_path)
    with open(metadata_path) as f:
        metadata = yaml.load(f)
    return matrix, metadata
def read_smiles_data(filename):
    import pandas as pd
    h5f = pd.read_hdf(filename, 'table')
    data = h5f['structure'][:]
    # import gzip
    # data = [line.split()[0].strip() for line in gzip.open(filename) if line]
    return data
def __init__(self, labels_fname, regions_fname=None, max_n_rows=None, load_cached=True):
    self.labels_fname = labels_fname
    self.regions_fname = regions_fname
    self.max_n_rows = max_n_rows
    self._hash = None
    self.load_cached = load_cached

    # extract the sample names from the header
    #assert labels_fname.endswith("labels.tsv.gz"), \
    #    "Unrecognized labels filename '%s'" % labels_fname
    self._init_header_data(labels_fname)

    # extract the factor from the filename
    self.factor = os.path.basename(labels_fname).split('.')[0]

    # if we want to use a cached version...
    if self.load_cached is True:
        try:
            print "Loading '%s'" % self.cached_fname
            self.h5store = h5py.File(self.cached_fname)
            self.data = pd.read_hdf(self.cached_fname, 'data')
        except KeyError:
            self.data = self._build_dataframe()
            self.data.to_hdf(self.cached_fname, 'data')
            print self.h5store
    else:
        self.data = self._build_dataframe()

    return
def load_or_build_motif_scores(self, fasta_fname):
    try:
        self.motif_scores = pd.read_hdf(self.cached_fname, 'motif_scores')
        self.motif_scores.index = self.data.index
    except KeyError:
        self.motif_scores = self.build_motif_scores(fasta_fname)
        self.motif_scores.to_hdf(self.cached_fname, 'motif_scores')
    return self.motif_scores
def load_or_build_dnase_fc_scores(self):
    try:
        self.dnase_fc_scores = pd.read_hdf(self.cached_fname, 'dnase_scores')
    except KeyError:
        self.dnase_fc_scores = self.build_dnase_fc_scores()
        self.dnase_fc_scores.to_hdf(self.cached_fname, 'dnase_scores')
    except IOError:
        self.dnase_fc_scores = self.build_dnase_fc_scores()
    return self.dnase_fc_scores
def __init__(self, path):
    self._panel = pd.read_hdf(path)
def main(batch_size=10000):
    posts_df = pd.read_hdf('nw_posts.hdf5', 'posts')
    index_posts_in_elastic(posts_df, batch_size=batch_size)
def load_wv_pandas(fname):
    return pd.read_hdf(fname, 'data')
def get_availableExchanges():
    SymbolsDF = pd.read_hdf(Constants.InputFolder + 'Symbols.hdf', 'Symbols')
    return SymbolsDF.EXCHANGE.drop_duplicates().values
def get_availableSymbols(SymbolFilter=None):
    SymbolsDF = pd.read_hdf(Constants.InputFolder + 'Symbols.hdf', 'Symbols')
    if SymbolFilter == None:
        DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == 'NYSE', :]
        return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values
    if not ('Exchange' in SymbolFilter.keys()):
        DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == 'NYSE', :]
        return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values
    DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == SymbolFilter['Exchange'], :]
    return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values
def get_availableSymbols(SymbolFilter=None):
    DF = pd.read_hdf(Constants.InputFolder + 'Symbols.hdf', 'OANDA')
    return DF.instrument.values
def main():
    DF = pd.read_hdf('/home/lc1bfrbl/Database/Oanda.hdf', 'WTICO_USD_H1')
    TTT = CalcTaylorCycle(DF)
    Index = (TTT.index.year == 2017) & (TTT.index.month == 6)
    TTT[Index].MO.plot()
    TTT[Index].MLo.plot()
    TTT[Index].MHi.plot()
    TTT[Index].High.plot()
    TTT[Index].Low.plot()
def save_table(self, code, date):
    TR_REQ_TIME_INTERVAL = 4
    time.sleep(TR_REQ_TIME_INTERVAL)
    data_81 = self.wrapper.get_data_opt10081(code, date)
    time.sleep(TR_REQ_TIME_INTERVAL)
    data_86 = self.wrapper.get_data_opt10086(code, date)
    col_86 = ['전일비', '등락률', '금액(백만)', '신용비', '개인', '기관',
              '외인수량', '외국계', '프로그램', '외인비', '체결강도', '외인보유',
              '외인비중', '외인순매수', '기관순매수', '개인순매수', '신용잔고율']
    data = pd.concat([data_81, data_86.loc[:, col_86]], axis=1)
    #con = sqlite3.connect("../data/stock.db")
    try:
        data = data.loc[data.index > int(self.kiwoom.start_date.strftime("%Y%m%d"))]
        #orig_data = pd.read_sql("SELECT * FROM '%s'" % code, con, index_col='일자').sort_index()
        orig_data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
        end_date = orig_data.index[-1]
        orig_data = orig_data.loc[orig_data.index < end_date]
        data = data.loc[data.index >= end_date]
        data = pd.concat([orig_data, data], axis=0)
    except (FileNotFoundError, IndexError) as e:
        print(e)
        pass
    finally:
        data.index.name = '일자'
        if len(data) != 0:
            #data.to_sql(code, con, if_exists='replace')
            data.to_hdf('../data/hdf/%s.hdf' % code, 'day', mode='w')
def read_h5():
    code_list = glob.glob('../data/stock/*.h5')
    for code in code_list[:10]:
        data = pd.read_hdf(code, 'table').sort_index()
        data = data.loc[data.index >= str(20160101)]
        data = data.loc[data.index <= str(20160630)]
        print(data.head())
def superReadFile(filepath, **kwargs):
    """
    Uses pandas.read_excel (on excel files) and returns a dataframe of the first sheet
    (unless a sheet is specified in kwargs).
    Uses superReadText (on .txt, .tsv, or .csv files) and returns a dataframe of the data.
    One function to read almost all types of data files.
    """
    if isinstance(filepath, pd.DataFrame):
        return filepath
    ext = os.path.splitext(filepath)[1].lower()

    if ext in ['.xlsx', '.xls']:
        kwargs.pop('dtype', None)
        return pd.read_excel(filepath, **kwargs)
    elif ext in ['.txt', '.tsv', '.csv']:
        return superReadText(filepath, **kwargs)
    elif ext in ['.gz', '.bz2', '.zip', '.xz']:
        return superReadCSV(filepath, **kwargs)
    elif ext in ['.h5']:
        return pd.read_hdf(filepath)
    else:
        raise NotImplementedError("Unable to read '{}' files".format(ext))
def test_conv_read_write(self):
    path = create_tempfile(self.path)
    try:
        def roundtrip(key, obj, **kwargs):
            obj.to_hdf(path, key, **kwargs)
            return read_hdf(path, key)

        o = tm.makeTimeSeries()
        assert_series_equal(o, roundtrip('series', o))

        o = tm.makeStringSeries()
        assert_series_equal(o, roundtrip('string_series', o))

        o = tm.makeDataFrame()
        assert_frame_equal(o, roundtrip('frame', o))

        o = tm.makePanel()
        assert_panel_equal(o, roundtrip('panel', o))

        # table
        df = DataFrame(dict(A=lrange(5), B=lrange(5)))
        df.to_hdf(path, 'table', append=True)
        result = read_hdf(path, 'table', where=['index>2'])
        assert_frame_equal(df[df.index > 2], result)
    finally:
        safe_remove(path)
def test_round_trip_equals(self):
    # GH 9330
    df = DataFrame({"B": [1, 2], "A": ["x", "y"]})

    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df', format='table')
        other = read_hdf(path, 'df')
        tm.assert_frame_equal(df, other)
        self.assertTrue(df.equals(other))
        self.assertTrue(other.equals(df))
def test_to_hdf_with_object_column_names(self):
    # GH9057
    # Writing HDF5 table format should only work for string-like
    # column types

    types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex,
                         tm.makeDateIndex, tm.makeTimedeltaIndex,
                         tm.makePeriodIndex]
    types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex]

    if compat.PY3:
        types_should_run.append(tm.makeUnicodeIndex)
    else:
        types_should_fail.append(tm.makeUnicodeIndex)

    for index in types_should_fail:
        df = DataFrame(np.random.randn(10, 2), columns=index(2))
        with ensure_clean_path(self.path) as path:
            with self.assertRaises(
                    ValueError, msg=("cannot have non-object label "
                                     "DataIndexableCol")):
                df.to_hdf(path, 'df', format='table', data_columns=True)

    for index in types_should_run:
        df = DataFrame(np.random.randn(10, 2), columns=index(2))
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table', data_columns=True)
            result = pd.read_hdf(
                path, 'df', where="index = [{0}]".format(df.index[0]))
            assert(len(result))
def test_read_hdf_errors(self):
    df = DataFrame(np.random.rand(4, 5),
                   index=list('abcd'),
                   columns=list('ABCDE'))

    with ensure_clean_path(self.path) as path:
        self.assertRaises(IOError, read_hdf, path, 'key')
        df.to_hdf(path, 'df')
        store = HDFStore(path, mode='r')
        store.close()
        self.assertRaises(IOError, read_hdf, store, 'df')
        with open(path, mode='r') as store:
            self.assertRaises(NotImplementedError, read_hdf, store, 'df')
def test_read_nokey(self):
    df = DataFrame(np.random.rand(4, 5),
                   index=list('abcd'),
                   columns=list('ABCDE'))
    with ensure_clean_path(self.path) as path:
        df.to_hdf(path, 'df', mode='a')
        reread = read_hdf(path)
        assert_frame_equal(df, reread)
        df.to_hdf(path, 'df2', mode='a')
        self.assertRaises(ValueError, read_hdf, path)