Python pandas module: read_hdf() example source code
The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.read_hdf().
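Before the project-specific examples, here is a minimal round-trip sketch: a DataFrame is written with DataFrame.to_hdf() and read back with pandas.read_hdf(). The file name demo.h5 and the key 'df' are placeholders chosen for illustration, and writing HDF5 requires the optional PyTables dependency (package name "tables").
import pandas as pd
df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
# Write in 'table' format so the stored object can later be queried with where=/columns=
df.to_hdf("demo.h5", key="df", mode="w", format="table")
# Read the whole object back; with a single object in the file, key may even be omitted
same = pd.read_hdf("demo.h5", key="df")
print(same.equals(df))  # True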
def _load_table(self, src, fmt, components=None, *args, **kwargs):
""" Load a data frame from table formats: csv, hdf5, feather """
if fmt == 'csv':
_data = pd.read_csv(src, *args, **kwargs)
elif fmt == 'feather':
_data = feather.read_dataframe(src, *args, **kwargs) # pylint: disable=redefined-variable-type
elif fmt == 'hdf5':
_data = pd.read_hdf(src, *args, **kwargs) # pylint: disable=redefined-variable-type
# Put into this batch only part of it (defined by index)
if isinstance(_data, pd.DataFrame):
_data = _data.loc[self.indices]
elif isinstance(_data, dd.DataFrame):
# dask.DataFrame.loc supports advanced indexing only with lists
_data = _data.loc[list(self.indices)].compute()
components = tuple(components or self.components)
for i, comp in enumerate(components):
setattr(self, comp, _data.iloc[:, i].values)
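The _load_table method above reads the whole HDF5 object and then keeps only the rows in self.indices in memory. When the file was written with format='table', a row filter can instead be pushed into the read itself through read_hdf's where= and columns= arguments; a small sketch under that assumption (the file, key, and column names are made up):
import pandas as pd
# Assumes table.h5 was written with format='table' and data_columns=['id'],
# e.g. df.to_hdf("table.h5", key="df", format="table", data_columns=["id"])
subset = pd.read_hdf("table.h5", key="df",
                     where="id >= 100 & id < 200", columns=["id", "value"])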
def load_data(dataset):
"""Load data from a given dataset
Parameters
----------
dataset : str
Name of the dataset; searches for dataset.h5 in the data directory next to this file
Returns
-------
DataFrame
Hourly temperature data
"""
p = Path(os.path.dirname(os.path.realpath(__file__))) / 'data'
fname = p / f'{dataset}.h5'
try:
return pd.read_hdf(str(fname))
except FileNotFoundError:
sources = {f.stem for f in p.iterdir() if
f.is_file() and f.name.endswith('h5')}
raise RuntimeError(f"Could not not find {dataset!r}. Existing "
f"datasets are {sources}")
def get_data_opt10081(self, code, date='20161231'):
try:
data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
start = str(data.index[-2])
except (FileNotFoundError, IndexError) as e:
start = "20010101"
print("get 81 data from %s" % start)
self.kiwoom.start_date = datetime.strptime(start, "%Y%m%d")
self.kiwoom.data_opt10081 = [] * 15
self.kiwoom.set_input_value("????", code)
self.kiwoom.set_input_value("????", date)
self.kiwoom.set_input_value("??????", 255)
self.kiwoom.comm_rq_data("??????????", "opt10081", 0, "0101")
while self.kiwoom.inquiry == '2':
time.sleep(TR_REQ_TIME_INTERVAL)
self.kiwoom.set_input_value("????", code)
self.kiwoom.set_input_value("????", date)
self.kiwoom.set_input_value("??????", 255)
self.kiwoom.comm_rq_data("??????????", "opt10081", 2, "0101")
self.kiwoom.data_opt10081.index = self.kiwoom.data_opt10081.loc[:, '??']
return self.kiwoom.data_opt10081.loc[:, ['???', '???', '????', '??', '??', '??']]
def get_data_opt10086(self, code, date):
try:
data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
start = str(data.index[-2])
except (FileNotFoundError, IndexError) as e:
start = "20010101"
print("get 86 data from %s" % start)
self.kiwoom.start_date = datetime.strptime(start, "%Y%m%d")
self.kiwoom.data_opt10086 = [] * 23
self.kiwoom.set_input_value("????", code)
self.kiwoom.set_input_value("????", date)
self.kiwoom.set_input_value("????", 1)
self.kiwoom.comm_rq_data("??????", "opt10086", 0, "0101")
while self.kiwoom.inquiry == '2':
time.sleep(TR_REQ_TIME_INTERVAL)
self.kiwoom.set_input_value("????", code)
self.kiwoom.set_input_value("????", date)
self.kiwoom.set_input_value("????", 1)
self.kiwoom.comm_rq_data("??????", "opt10086", 2, "0101")
self.kiwoom.data_opt10086.index = self.kiwoom.data_opt10086.loc[:, '??']
return self.kiwoom.data_opt10086
def get_twitter_sentiment_multilabel_classification_dataset():
file_name = os.path.join('tests', 'twitter_sentiment.h5')
try:
df_twitter = pd.read_hdf(file_name)
except Exception as e:
print('Error')
print(e)
dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
# Do not write the index that pandas automatically creates
df_twitter.to_hdf(file_name, key='df', format='fixed')
# Grab only 10% of the dataset- runs much faster this way
df_twitter = df_twitter.sample(frac=0.1)
df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)
df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
return df_twitter_train, df_twitter_test
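The function above follows a common caching pattern: try the fast HDF5 copy first and rebuild it from the slower CSV source on a miss. A stripped-down sketch of the same idea, with placeholder paths and the key 'df' chosen for illustration:
import pandas as pd
def load_with_hdf_cache(csv_path, cache_path="cache.h5"):
    """Load csv_path, reusing an HDF5 cache when it already exists."""
    try:
        return pd.read_hdf(cache_path, key="df")
    except (FileNotFoundError, OSError):
        df = pd.read_csv(csv_path)
        # 'fixed' format is the fastest to write and read when no querying is needed
        df.to_hdf(cache_path, key="df", format="fixed")
        return df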
def test_append_hierarchical(self):
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['foo', 'bar'])
df = DataFrame(np.random.randn(10, 3), index=index,
columns=['A', 'B', 'C'])
with ensure_clean_store(self.path) as store:
store.append('mi', df)
result = store.select('mi')
tm.assert_frame_equal(result, df)
# GH 3748
result = store.select('mi', columns=['A', 'B'])
expected = df.reindex(columns=['A', 'B'])
tm.assert_frame_equal(result, expected)
with ensure_clean_path('test.hdf') as path:
df.to_hdf(path, 'df', format='table')
result = read_hdf(path, 'df', columns=['A', 'B'])
expected = df.reindex(columns=['A', 'B'])
tm.assert_frame_equal(result, expected)
def test_duplicate_column_name(self):
df = DataFrame(columns=["a", "a"], data=[[0, 0]])
with ensure_clean_path(self.path) as path:
self.assertRaises(ValueError, df.to_hdf,
path, 'df', format='fixed')
df.to_hdf(path, 'df', format='table')
other = read_hdf(path, 'df')
tm.assert_frame_equal(df, other)
self.assertTrue(df.equals(other))
self.assertTrue(other.equals(df))
def test_colums_multiindex_modified(self):
# BUG: 7212
# read_hdf store.select modified the passed columns parameters
# when multi-indexed.
df = DataFrame(np.random.rand(4, 5),
index=list('abcd'),
columns=list('ABCDE'))
df.index.name = 'letters'
df = df.set_index(keys='E', append=True)
data_columns = df.index.names + df.columns.tolist()
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df',
mode='a',
append=True,
data_columns=data_columns,
index=False)
cols2load = list('BCD')
cols2load_original = list(cols2load)
df_loaded = read_hdf(path, 'df', columns=cols2load) # noqa
self.assertTrue(cols2load_original == cols2load)
def test_read_hdf_open_store(self):
# GH10330
# No check for non-string path_or_buf, and no test of open store
df = DataFrame(np.random.rand(4, 5),
index=list('abcd'),
columns=list('ABCDE'))
df.index.name = 'letters'
df = df.set_index(keys='E', append=True)
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df', mode='w')
direct = read_hdf(path, 'df')
store = HDFStore(path, mode='r')
indirect = read_hdf(store, 'df')
tm.assert_frame_equal(direct, indirect)
self.assertTrue(store.is_open)
store.close()
def test_complex_fixed(self):
df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
index=list('abcd'),
columns=list('ABCDE'))
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df')
reread = read_hdf(path, 'df')
assert_frame_equal(df, reread)
df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
index=list('abcd'),
columns=list('ABCDE'))
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df')
reread = read_hdf(path, 'df')
assert_frame_equal(df, reread)
def test_complex_table(self):
df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
index=list('abcd'),
columns=list('ABCDE'))
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df', format='table')
reread = read_hdf(path, 'df')
assert_frame_equal(df, reread)
df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
index=list('abcd'),
columns=list('ABCDE'))
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df', format='table', mode='w')
reread = read_hdf(path, 'df')
assert_frame_equal(df, reread)
def updateResults(filename, initial, timeScore):
""" Filename (tihout extension). open the filename.hdf and store results.
then write the results to a HTML file
"""
hdfFile = filename+'.hdf'
if not os.path.exists(hdfFile):
copyfile('./data/default.hdf', hdfFile)
data = pd.read_hdf(hdfFile).reset_index()
# add new entry to data frame
data.loc[len(data)] = [initial, timeScore]
# rank best unique user scores
sortData = data.groupby(['INITIALS']).min().head(10)
# save new data
sortData.to_hdf(hdfFile, 'test', mode='w')
sortData = sortData.sort_values(by='TIME',ascending=True).reset_index()
htmlTable(sortData, filename+'.html')
def read_test_train(train_size):
print("Load train.csv")
train = pd.read_hdf("../modified_data/train_original.csv.hdf", 'table')
null_count = train.isnull().sum().sum()
if null_count > 0:
print('Nans:', null_count)
cols = train.isnull().any(axis=0)
print(cols[cols == True])
rows = train.isnull().any(axis=1)
print(rows[rows == True])
print('NANs in train, please check it!')
exit()
split = round((1-train_size)*len(train.index))
train = train[split:]
print("Load test.csv")
test = pd.read_hdf("../modified_data/test.hdf", 'table')
null_count = test.isnull().sum().sum()
if null_count > 0:
print('Nans:', null_count)
cols = test.isnull().any(axis=0)
print(cols[cols == True])
print('NANs in test, please check it!')
exit()
features = get_features(train, test)
return train, test, features
def preprocess_day(a, b):
a = pd.read_csv('data/restaurants_train_data.tsv', delimiter='\t')
b = pd.read_csv('data/restaurants_test_data.tsv', delimiter='\t')
print(a['text'][10])
a['text'] = a['text'].apply(clean)
b['text'] = b['text'].apply(clean)
# save pre-processed data as pickle file
a.to_hdf('data/restaurants_train_data_processed.h5', 'table')
b.to_hdf('data/restaurants_test_data_processed.h5', 'table')
# load pre-processed pickle data
a = pd.read_hdf('data/restaurants_train_data_processed.h5', 'table')
a['text'] = a['text'].apply(ast.literal_eval)
b = pd.read_hdf('data/restaurants_test_data_processed.h5', 'table')
b['text'] = b['text'].apply(ast.literal_eval)
print(a['text'][10])
def LoadParseData(filename):
data_name = filename.split('_')[0]
pd_data = pd.read_hdf(CODE_FOLDER + "data/" + filename)
cols_features = pd_data.drop(['ID', 'target'], 1).columns.tolist()
pd_train = pd_data[pd_data.target >= 0]
pd_test = pd_data[pd_data.target == -1]
Y = pd_train['target'].values.astype(int)
test_idx = pd_test['ID'].values.astype(int)
X = np.array(pd_train.drop(['ID', 'target'],1))
X_test = np.array(pd_test.drop(['ID','target'], 1))
return X, Y, X_test, test_idx, pd_data, data_name, cols_features
def update_data_using_real_data(self):
# This test is slow, and so is excluded from the standard test suite.
current_data = pd.read_hdf(self.TEST_CURRENT_DATA_FILE)
new_data = pd.read_hdf(self.TEST_NEW_DATA_FILE)
expected_updated_data = pd.read_hdf(self.TEST_UPDATED_DATA_FILE)
resultant_updated_data = DataStorer.update_data(
current_data.copy(), new_data.copy())
self.assertEqual(
expected_updated_data.shape, resultant_updated_data.shape)
self.assertTrue(
expected_updated_data.equals(resultant_updated_data))
repeatedly_updated_data = DataStorer.update_data(
resultant_updated_data.copy(), new_data.copy())
self.assertEqual(
expected_updated_data.shape, repeatedly_updated_data.shape)
self.assertTrue(
expected_updated_data.equals(repeatedly_updated_data))
def __init__(self, table=None, filename=''):
"""
table: the pandas DataFrame that records rankable objects competition
record
filename: the hdf5 filename that stores the DataFrame. The DataFrame
must be indexed by 'item_pair_rate'.
"""
if table is None:
table = pd.read_hdf(filename, "item_pair_rate")
table = table[['primary','secondary','rate1','rate2','weight']]
self.table = table
# itemid to index table
idx = self._extract_list(self.table)
self.itemlist = idx
temptable = table.iloc[:,:2].values
pair = np.fromfunction(np.vectorize(lambda i, j: idx[temptable[i,j]]),
temptable.shape)
pair = np.require(pair, dtype=np.int32)
self.pair = pair
def delete_conversion_data():
train_data = pd.read_hdf(FilePath + 'train_0613_nodelconvert')
print 'read finish'
advertiser_conversion_list = find_delete_advertiser()
print len(advertiser_conversion_list)
for item in advertiser_conversion_list:
t = threading.Thread(target=get_index_to_delete,args=(train_data,item))
t.start()
while len(result_list)<len(advertiser_conversion_list):
pass
train_data.drop(delete_list, axis=0, inplace=True)
train_data = train_data.reset_index()
del train_data['index']
print 'train write begin'
train_data.to_hdf(FilePath + 'train_0613', 'all')
delete_list = Series(delete_list)
delete_list.to_csv(FilePath + 'delete_negsample_index_oftrain0613.csv', mode='a', index=False)
def time_delta_fentong():
train_data = pd.read_hdf('../../gen/train_0626')
test_data = pd.read_hdf('../../gen/test_0626')
print 'read finish'
train_data['time_delta_user_creative_next_fentong'] = train_data['time_delta_user_creative_next'].map(time_delta_map)
test_data['time_delta_user_creative_next_fentong'] = test_data['time_delta_user_creative_next'].map(time_delta_map)
train_data['time_delta_user_creative_fentong'] = train_data['time_delta_user_creative'].map(time_delta_map)
test_data['time_delta_user_creative_fentong'] = test_data['time_delta_user_creative'].map(time_delta_map)
train_data['time_delta_user_app_next_fentong'] = train_data['time_delta_user_app_next'].map(time_delta_map)
test_data['time_delta_user_app_next_fentong'] = test_data['time_delta_user_app_next'].map(time_delta_map)
train_data['time_delta_user_app_fentong'] = train_data['time_delta_user_app'].map(time_delta_map)
test_data['time_delta_user_app_fentong'] = test_data['time_delta_user_app'].map(time_delta_map)
train_data['time_delta_user_next_fentong'] = train_data['time_delta_user_next'].map(time_delta_map)
test_data['time_delta_user_next_fentong'] = test_data['time_delta_user_next'].map(time_delta_map)
train_data['time_delta_user_fentong'] = train_data['time_delta_user'].map(time_delta_map)
test_data['time_delta_user_fentong'] = test_data['time_delta_user'].map(time_delta_map)
print test_data
train_data.to_hdf('../../gen/train_0626_delta_fentong','all')
test_data.to_hdf('../../gen/test_0626_delta_fentong','all')
def __iter__(self, gen_type='train', batch_size=None, shuffle_block=False, random_sample=False, split_fields=False,
on_disk=True, squeeze_output=False, **kwargs):
gen_type = gen_type.lower()
if on_disk:
print('on disk...')
for hdf_X, hdf_y in self._files_iter_(gen_type=gen_type, shuffle_block=shuffle_block):
# num_of_lines = pd.HDFStore(hdf_y, mode='r').get_storer('fixed').shape[0]
X_all = pd.read_hdf(hdf_X, mode='r').as_matrix()
y_all = pd.read_hdf(hdf_y, mode='r').as_matrix()
gen = self.generator(X_all, y_all, batch_size, shuffle=random_sample)
for X, y in gen:
if split_fields:
X = np.split(X, self.max_length, axis=1)
for i in range(self.max_length):
X[i] -= self.feat_min[i]
if squeeze_output:
y = y.squeeze()
yield X, y
else:
print('not implemented')
def bin_count(hdf_data_dir, file_prefix, num_of_parts):
"""
count positive/negative samples
:param hdf_data_dir:
:param file_prefix: see this param in feature_to_hdf()
:param num_of_parts:
:return: size of a dataset, positive samples, negative samples, positive ratio
"""
size = 0
num_of_pos = 0
num_of_neg = 0
for part in range(num_of_parts):
_y = pd.read_hdf(os.path.join(hdf_data_dir, file_prefix + '_output_part_' + str(part) + '.h5'), mode='r')
part_pos_num = _y.loc[_y.iloc[:, 0] == 1].shape[0]
part_neg_num = _y.shape[0] - part_pos_num
size += _y.shape[0]
num_of_pos += part_pos_num
num_of_neg += part_neg_num
pos_ratio = 1.0 * num_of_pos / (num_of_pos + num_of_neg)
return size, num_of_pos, num_of_neg, pos_ratio
def try_pandas(data_path: str) -> bool:
"""Guesses if a file is a pandas file.
Parameters
----------
data_path
Path to file.
Returns
-------
bool
True if the file is pandas.
"""
try:
pandas.read_hdf(data_path)
except ValueError:
return False
return True
def _reader(self):
if not self.does_exist():
return
return pd.read_hdf(self.data_file, 'data')
def retrieveCommonDatesHDF(support_data_filename, key_list, in_date_list):
'''
Get a list of all dates that have data available
@support_data_filename: Filename of support data
@in_date_list: Input date list to check
@return dictionary of dates with data
'''
valid_dates = OrderedDict()
support_full_path = resource_filename('skdaccess',os.path.join('support',support_data_filename))
for key in key_list:
try:
available_dates = pd.read_hdf(support_full_path, key)
except KeyError:
print('Unknown station:',key)
common_dates = list(set(in_date_list).intersection(set(available_dates)))
common_dates.sort()
valid_dates[key] = common_dates
return valid_dates
def train(self):
""""""
start = time.time()
print('size before truncated outliers is %d ' % len(self.TrainData))
TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
print('size after truncated outliers is %d ' % len(self.TrainData))
TrainData['longitude'] -= -118600000
TrainData['latitude'] -= 34220000
#extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')
#self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis= 1)], axis = 1)
X = self.TrainData.drop(self._l_drop_cols, axis=1)
Y = self.TrainData['logerror']
self._l_train_columns = X.columns
X = X.values.astype(np.float32, copy=False)
lr = LassoLars(alpha= self._lr_alpha, max_iter= self._lr_iter, verbose= True)
self._model = lr.fit(X, Y)
end = time.time()
print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))
self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
datetime.now().strftime('%Y%m%d-%H:%M:%S'))
#with open(self._f_eval_train_model, 'wb') as o_file:
# pickle.dump(self._model, o_file, -1)
#o_file.close()
#self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
# ignore_index=True) ## ignore_index will reset the index or index will be overlaped
return
def evaluate(self):
""""""
## not truncate outliers
pred_valid = pd.DataFrame(index=self.ValidData.index)
pred_valid['parcelid'] = self.ValidData['parcelid']
truth_valid = pd.DataFrame(index=self.ValidData.index)
truth_valid['parcelid'] = self.ValidData['parcelid']
start = time.time()
for d in self._l_valid_predict_columns:
l_valid_columns = ['%s%s' % (c, d) if (c in ['lastgap', 'monthyear', 'buildingage']) else c for c in
self._l_train_columns]
extra_va = pd.read_hdf(path_or_buf='%s/p21/eval_valid_%s.hdf' % (self.InputDir, d), key='valid')
#ValidData = self.ValidData.join(extra_va, on= 'parcelid', how= 'left')
ValidData = pd.concat([self.ValidData, extra_va.drop('parcelid', axis= 1)], axis= 1)
x_valid = ValidData[l_valid_columns]
x_valid = x_valid.values.astype(np.float32, copy=False)
pred_valid[d] = self._model.predict(x_valid) # * 0.99 + 0.011 * 0.01
df_tmp = ValidData[ValidData['transactiondate'].dt.month == int(d[-2:])]
truth_valid.loc[df_tmp.index, d] = df_tmp['logerror']
score = 0.0
ae = np.abs(pred_valid - truth_valid)
for col in ae.columns:
score += np.sum(ae[col])
score /= len(pred_valid) ##!! divided by number of instances, not the number of 'cells'
print('============================= ')
print('Local MAE is %.6f' % score)
print('=============================')
end = time.time()
del self.ValidData
gc.collect()
print('time elapsed %ds' % (end - start))
def train(self):
""""""
start = time.time()
extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')
print('size before truncated outliers is %d ' % len(self.TrainData))
self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
#self.TrainData = self.TrainData.join(extra_tr, on='parcelid', how= 'left')
self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis= 1)], axis = 1)
print('size after truncated outliers is %d ' % len(self.TrainData))
X = self.TrainData.drop(self._l_drop_cols, axis=1)
Y = self.TrainData['logerror']
self._l_train_columns = X.columns
X = X.values.astype(np.float32, copy=False)
lr = Lasso(alpha= self._lr_alpha, max_iter= self._lr_iter, tol= 1e-4, random_state= 2017, selection= self._lr_sel)
self._model = lr.fit(X, Y)
end = time.time()
print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))
self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
datetime.now().strftime('%Y%m%d-%H:%M:%S'))
with open(self._f_eval_train_model, 'wb') as o_file:
pickle.dump(self._model, o_file, -1)
o_file.close()
#self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
# ignore_index=True) ## ignore_index will reset the index or index will be overlaped
return
def evaluate(self):
""""""
## not truncate outliers
pred_valid = pd.DataFrame(index=self.ValidData.index)
pred_valid['parcelid'] = self.ValidData['parcelid']
truth_valid = pd.DataFrame(index=self.ValidData.index)
truth_valid['parcelid'] = self.ValidData['parcelid']
start = time.time()
for d in self._l_valid_predict_columns:
l_valid_columns = ['%s%s' % (c, d) if (c in ['lastgap', 'monthyear', 'buildingage']) else c for c in
self._l_train_columns]
extra_va = pd.read_hdf(path_or_buf='%s/p21/eval_valid_%s.hdf' % (self.InputDir, d), key='valid')
#ValidData = self.ValidData.join(extra_va, on= 'parcelid', how= 'left')
ValidData = pd.concat([self.ValidData, extra_va.drop('parcelid', axis= 1)], axis= 1)
x_valid = ValidData[l_valid_columns]
x_valid = x_valid.values.astype(np.float32, copy=False)
pred_valid[d] = self._model.predict(x_valid) # * 0.99 + 0.011 * 0.01
df_tmp = ValidData[ValidData['transactiondate'].dt.month == int(d[-2:])]
truth_valid.loc[df_tmp.index, d] = df_tmp['logerror']
score = 0.0
ae = np.abs(pred_valid - truth_valid)
for col in ae.columns:
score += np.sum(ae[col])
score /= len(pred_valid) ##!! divided by number of instances, not the number of 'cells'
print('============================= ')
print('Local MAE is %.6f' % score)
print('=============================')
end = time.time()
del self.ValidData
gc.collect()
print('time elapsed %ds' % (end - start))
def LoadFromHdfFile(InputDir, mode = 'train'):
if(mode == 'train'):
data = pd.read_hdf(path_or_buf= '%s/train.hdf' % InputDir, key='train')
elif(mode == 'valid'):
data = pd.read_hdf(path_or_buf= '%s/valid.hdf' % InputDir, key='valid')
else:
data = pd.read_hdf(path_or_buf= '%s/test.hdf' % InputDir, key='test')
return data
## class method, load data with pkl format
def recover_matrix(config, directory='.'):
"""Recover a matrix by either its config or uuid.
Parameters
----------
config: str or dict
config metadata for the matrix or uuid
directory: str
path to search for the matrix
Returns
-------
df_matrix: DataFrame
DataFrame of specified matrix
None:
If no matrix is found
"""
if isinstance(config, dict):
uuid = generate_uuid(config)
else:
uuid = config
fname = directory + '/' + uuid
if os.path.isfile(fname + '.h5'):
df_matrix = pd.read_hdf(fname + '.h5')
return df_matrix
elif os.path.isfile(fname + '.csv'):
df_matrix = pd.read_csv(fname + '.csv')
return df_matrix
else:
return None
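A hedged usage sketch for recover_matrix above; the config dict keys and the directory name are invented for illustration, and generate_uuid is the project's own helper:
# config metadata as used elsewhere in this project (keys are illustrative)
config = {"feature": "age", "start": "2015-01-01"}
df = recover_matrix(config, directory="matrices")
if df is None:
    print("no matrix found for this config")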
def get_matrix_and_metadata(matrix_path, metadata_path):
"""Retrieve a matrix in hdf format and
metadata about the matrix in yaml format
Returns: (tuple) matrix, metadata
"""
matrix = pandas.read_hdf(matrix_path)
with open(metadata_path) as f:
metadata = yaml.load(f)
return matrix, metadata
def read_smiles_data(filename):
import pandas as pd
h5f = pd.read_hdf(filename, 'table')
data = h5f['structure'][:]
# import gzip
# data = [line.split()[0].strip() for line in gzip.open(filename) if line]
return data
def __init__(self,
labels_fname,
regions_fname=None,
max_n_rows=None,
load_cached=True):
self.labels_fname = labels_fname
self.regions_fname = regions_fname
self.max_n_rows = max_n_rows
self._hash = None
self.load_cached = load_cached
# extract the sample names from the header
#assert labels_fname.endswith("labels.tsv.gz"), \
# "Unrecognized labels filename '%s'" % labels_fname
self._init_header_data(labels_fname)
# extract the factor from the filename
self.factor = os.path.basename(labels_fname).split('.')[0]
# if we want to use a cached version...
if self.load_cached is True:
try:
print "Loading '%s'" % self.cached_fname
self.h5store = h5py.File(self.cached_fname)
self.data = pd.read_hdf(self.cached_fname, 'data')
except KeyError:
self.data = self._build_dataframe()
self.data.to_hdf(self.cached_fname, 'data')
print self.h5store
else:
self.data = self._build_dataframe()
return
def load_or_build_motif_scores(self, fasta_fname):
try:
self.motif_scores = pd.read_hdf(self.cached_fname, 'motif_scores')
self.motif_scores.index = self.data.index
except KeyError:
self.motif_scores = self.build_motif_scores(fasta_fname)
self.motif_scores.to_hdf(self.cached_fname, 'motif_scores')
return self.motif_scores
def load_or_build_dnase_fc_scores(self):
try:
self.dnase_fc_scores = pd.read_hdf(self.cached_fname, 'dnase_scores')
except KeyError:
self.dnase_fc_scores = self.build_dnase_fc_scores()
self.dnase_fc_scores.to_hdf(self.cached_fname, 'dnase_scores')
except IOError:
self.dnase_fc_scores = self.build_dnase_fc_scores()
return self.dnase_fc_scores
def main(batch_size=10000):
posts_df = pd.read_hdf('nw_posts.hdf5', 'posts')
index_posts_in_elastic(posts_df, batch_size=batch_size)
def load_wv_pandas(fname):
return pd.read_hdf(fname, 'data')
def get_availableExchanges():
SymbolsDF = pd.read_hdf(Constants.InputFolder + 'Symbols.hdf', 'Symbols')
return SymbolsDF.EXCHANGE.drop_duplicates().values
def get_availableSymbols(SymbolFilter=None):
SymbolsDF = pd.read_hdf(Constants.InputFolder+'Symbols.hdf', 'Symbols')
if SymbolFilter is None:
DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == 'NYSE', :]
return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values
if not ('Exchange' in SymbolFilter.keys()):
DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == 'NYSE', :]
return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values
DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == SymbolFilter['Exchange'], :]
return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values
def get_availableSymbols(SymbolFilter=None):
DF=pd.read_hdf(Constants.InputFolder+'Symbols.hdf', 'OANDA')
return DF.instrument.values
def main():
DF = pd.read_hdf('/home/lc1bfrbl/Database/Oanda.hdf', 'WTICO_USD_H1')
TTT=CalcTaylorCycle(DF)
Index = (TTT.index.year == 2017) & (TTT.index.month == 6)
TTT[Index].MO.plot()
TTT[Index].MLo.plot()
TTT[Index].MHi.plot()
TTT[Index].High.plot()
TTT[Index].Low.plot()
def save_table(self, code, date):
TR_REQ_TIME_INTERVAL = 4
time.sleep(TR_REQ_TIME_INTERVAL)
data_81 = self.wrapper.get_data_opt10081(code, date)
time.sleep(TR_REQ_TIME_INTERVAL)
data_86 = self.wrapper.get_data_opt10086(code, date)
col_86 = ['???', '???', '??(??)', '???', '??', '??', '????', '???', '????',
'???', '????', '????', '????', '?????', '?????', '?????', '?????']
data = pd.concat([data_81, data_86.loc[:, col_86]], axis=1)
#con = sqlite3.connect("../data/stock.db")
try:
data = data.loc[data.index > int(self.kiwoom.start_date.strftime("%Y%m%d"))]
#orig_data = pd.read_sql("SELECT * FROM '%s'" % code, con, index_col='??').sort_index()
orig_data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
end_date = orig_data.index[-1]
orig_data = orig_data.loc[orig_data.index < end_date]
data = data.loc[data.index >= end_date]
data = pd.concat([orig_data, data], axis=0)
except (FileNotFoundError, IndexError) as e:
print(e)
pass
finally:
data.index.name = '??'
if len(data) != 0:
#data.to_sql(code, con, if_exists='replace')
data.to_hdf('../data/hdf/%s.hdf'%code, 'day', mode='w')
def read_h5():
code_list = glob.glob('../data/stock/*.h5')
for code in code_list[:10]:
data = pd.read_hdf(code, 'table').sort_index()
data = data.loc[data.index >= str(20160101)]
data = data.loc[data.index <= str(20160630)]
print(data.head())
def superReadFile(filepath,**kwargs):
"""
Uses pandas.read_excel (on excel files) and returns a dataframe of the first sheet (unless sheet is specified in kwargs)
Uses superReadText (on .txt,.tsv, or .csv files) and returns a dataframe of the data.
One function to read almost all types of data files.
"""
if isinstance(filepath, pd.DataFrame):
return filepath
ext = os.path.splitext(filepath)[1].lower()
if ext in ['.xlsx', '.xls']:
kwargs.pop('dtype', None)
return pd.read_excel(filepath,**kwargs)
elif ext in ['.txt','.tsv','.csv']:
return superReadText(filepath, **kwargs)
elif ext in ['.gz', '.bz2', '.zip', '.xz']:
return superReadCSV(filepath, **kwargs)
elif ext in ['.h5']:
return pd.read_hdf(filepath)
else:
raise NotImplementedError("Unable to read '{}' files".format(ext))
def test_conv_read_write(self):
path = create_tempfile(self.path)
try:
def roundtrip(key, obj, **kwargs):
obj.to_hdf(path, key, **kwargs)
return read_hdf(path, key)
o = tm.makeTimeSeries()
assert_series_equal(o, roundtrip('series', o))
o = tm.makeStringSeries()
assert_series_equal(o, roundtrip('string_series', o))
o = tm.makeDataFrame()
assert_frame_equal(o, roundtrip('frame', o))
o = tm.makePanel()
assert_panel_equal(o, roundtrip('panel', o))
# table
df = DataFrame(dict(A=lrange(5), B=lrange(5)))
df.to_hdf(path, 'table', append=True)
result = read_hdf(path, 'table', where=['index>2'])
assert_frame_equal(df[df.index > 2], result)
finally:
safe_remove(path)
def test_round_trip_equals(self):
# GH 9330
df = DataFrame({"B": [1, 2], "A": ["x", "y"]})
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df', format='table')
other = read_hdf(path, 'df')
tm.assert_frame_equal(df, other)
self.assertTrue(df.equals(other))
self.assertTrue(other.equals(df))
def test_to_hdf_with_object_column_names(self):
# GH9057
# Writing HDF5 table format should only work for string-like
# column types
types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex,
tm.makeDateIndex, tm.makeTimedeltaIndex,
tm.makePeriodIndex]
types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex]
if compat.PY3:
types_should_run.append(tm.makeUnicodeIndex)
else:
types_should_fail.append(tm.makeUnicodeIndex)
for index in types_should_fail:
df = DataFrame(np.random.randn(10, 2), columns=index(2))
with ensure_clean_path(self.path) as path:
with self.assertRaises(
ValueError, msg=("cannot have non-object label "
"DataIndexableCol")):
df.to_hdf(path, 'df', format='table', data_columns=True)
for index in types_should_run:
df = DataFrame(np.random.randn(10, 2), columns=index(2))
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df', format='table', data_columns=True)
result = pd.read_hdf(
path, 'df', where="index = [{0}]".format(df.index[0]))
assert(len(result))
def test_read_hdf_errors(self):
df = DataFrame(np.random.rand(4, 5),
index=list('abcd'),
columns=list('ABCDE'))
with ensure_clean_path(self.path) as path:
self.assertRaises(IOError, read_hdf, path, 'key')
df.to_hdf(path, 'df')
store = HDFStore(path, mode='r')
store.close()
self.assertRaises(IOError, read_hdf, store, 'df')
with open(path, mode='r') as store:
self.assertRaises(NotImplementedError, read_hdf, store, 'df')
def test_read_nokey(self):
df = DataFrame(np.random.rand(4, 5),
index=list('abcd'),
columns=list('ABCDE'))
with ensure_clean_path(self.path) as path:
df.to_hdf(path, 'df', mode='a')
reread = read_hdf(path)
assert_frame_equal(df, reread)
df.to_hdf(path, 'df2', mode='a')
self.assertRaises(ValueError, read_hdf, path)