The following code examples, extracted from open-source Python projects, illustrate how to use pandas.HDFStore().
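Before the project examples, here is a minimal, self-contained sketch of the basic HDFStore read/write cycle. The file name example_store.h5, the sample DataFrame, and the metadata attribute are illustrative assumptions, not taken from any of the projects below; PyTables must be installed for HDFStore to work.

import pandas as pd

# Minimal sketch of the HDFStore workflow (illustrative only; the file name
# and sample frame are assumptions, not from the projects below).
df = pd.DataFrame({'timestamp': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})

# Write: using the store as a context manager guarantees the file is closed.
with pd.HDFStore('example_store.h5', mode='w', complib='blosc', complevel=9) as store:
    store.put('train', df, format='table')            # 'table' format enables store.select() queries
    store.get_storer('train').attrs.my_note = 'demo'   # attach arbitrary metadata to the node

# Read: open read-only and pull the frame (or a row range) back out.
with pd.HDFStore('example_store.h5', mode='r') as store:
    print(store.keys())                                # ['/train']
    full = store.get('train')
    chunk = store.select('train', start=0, stop=2)     # row-sliced read, requires format='table'
    note = store.get_storer('train').attrs.my_note

The project examples that follow use the same handful of calls: the HDFStore constructor, put/get, select, get_storer(...).attrs, keys(), and close().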
def getStationMetadata():
    '''
    Retrieve metadata on groundwater wells

    @return pandas dataframe with groundwater well information
    '''
    data_file = DataFetcher.getDataLocation('groundwater')
    if data_file is None:
        print('Dataset not available')
        return None

    store = pd.HDFStore(data_file, 'r')
    meta_data = store['meta_data']
    store.close()

    return meta_data
def getAntennaLogs():
    '''
    Retrieve information about antenna changes

    @return dictionary of antenna changes
    '''
    store_location = data_util.getDataLocation('ngl_gps')
    store = pd.HDFStore(store_location, 'r')
    logs_df = store['ngl_steps']
    store.close()

    metadata = DataFetcher.getStationMetadata()

    logs_dict = OrderedDict()
    for station in metadata.index:
        offset_dates = logs_df[logs_df['Station'] == station].index.unique()
        offset_dates = pd.Series(offset_dates)
        logs_dict[station] = offset_dates

    return logs_dict
def __init__(self, db_h5_file, future_time=20, lookback=100,
             read_first_k_table=-1, normalize=True, two_class=False):
    super(DBGeneticReader, self).__init__()
    self._db = pd.HDFStore(db_h5_file)
    self._future_time = future_time
    self._lookback = lookback
    self._db_len = 0
    self._two_class = two_class

    self._tables = []
    for k in self._db:
        self._db_len += len(self._db[k]) - future_time - lookback
        t = self._db[k].iloc[:, 4:].astype('float32')
        t['AveragePrice'] = (t['AskPrice1'] + t['BidPrice1']) / 2
        if normalize:
            t = (t - t.mean()) / (t.std() + 1e-10)
        self._tables.append(t)
        if read_first_k_table != -1 and len(self._tables) == read_first_k_table:
            break
def _read_hdf_from_buffer(self, buffer):
    with pandas.HDFStore(
        "data.h5",
        mode="r",
        driver="H5FD_CORE",
        driver_core_backing_store=0,
        driver_core_image=buffer.read()
    ) as store:

        if len(store.keys()) > 1:
            raise Exception('Ambiguous matrix store. More than one dataframe in the hdf file.')

        try:
            return store["matrix"]
        except KeyError:
            print("The hdf file should contain one and only key, matrix.")
            return store[store.keys()[0]]
def fake_metta(matrix_dict, metadata):
    """Stores matrix and metadata in a metta-data-like form

    Args:
        matrix_dict (dict) of form { columns: values }.
            Expects an entity_id to be present which it will use as the index
        metadata (dict). Any metadata that should be set

    Yields:
        tuple of filenames for matrix and metadata
    """
    matrix = pandas.DataFrame.from_dict(matrix_dict).set_index('entity_id')
    with tempfile.NamedTemporaryFile() as matrix_file:
        with tempfile.NamedTemporaryFile('w') as metadata_file:
            hdf = pandas.HDFStore(matrix_file.name)
            hdf.put('title', matrix, data_columns=True)
            matrix_file.seek(0)

            yaml.dump(metadata, metadata_file)
            metadata_file.seek(0)
            yield (matrix_file.name, metadata_file.name)
def __init__(self):
    with pd.HDFStore("../input/train.h5", "r") as hfdata:
        self.timestamp = 0
        fullset = hfdata.get("train")
        self.unique_timestamp = fullset["timestamp"].unique()
        # Get a list of unique timestamps
        # use the first half for training and
        # the second half for the test set
        n = len(self.unique_timestamp)
        i = int(n / 2)
        timesplit = self.unique_timestamp[i]
        self.n = n
        self.unique_idx = i
        self.train = fullset[fullset.timestamp < timesplit]
        self.test = fullset[fullset.timestamp >= timesplit]

        # Needed to compute final score
        self.full = self.test.loc[:, ['timestamp', 'y']]
        self.full['y_hat'] = 0.0
        self.temp_test_y = None
def hdfone_filenames(folder: str, path_to: str) -> List[str]:
    filenames = []
    try:
        assert isinstance(folder, str), "folder isn't string: %s" % folder
        assert isinstance(path_to, str), "path_to isn't string: %s" % path_to

        if settings.DATA_TYPE == "hdfone":
            f = join(settings.DATA_PATH, "hdfone.hdfone")
            if isfile(f):
                with HDFStore(f) as hdf:
                    filenames = [f for f in hdf.keys() if folder in f]
                    hdf.close()
        else:
            filenames = multi_filenames(path_to_history=path_to)
    except Exception as err:
        print(colored.red("hdfone_filenames: {}".format(err)))

    return filenames
def write(self, frames):
    """
    Write the frames to the target HDF5 file, using the format used by
    ``pd.Panel.to_hdf``

    Parameters
    ----------
    frames : iter[(int, DataFrame)] or dict[int -> DataFrame]
        An iterable or other mapping of sid to the corresponding OHLCV
        pricing data.
    """
    with HDFStore(self._path, 'w',
                  complevel=self._complevel,
                  complib=self._complib) as store:
        panel = pd.Panel.from_dict(dict(frames))
        panel.to_hdf(store, 'updates')

    with tables.open_file(self._path, mode='r+') as h5file:
        h5file.set_node_attr('/', 'version', 0)
def add_data(self, packet: DataPacket):
    """
    Import a DataFrame into the project

    :param packet: DataPacket custom class containing file path, dataframe,
        data type and flight association
    :return: Void
    """
    self.log.debug("Ingesting data and exporting to hdf5 store")
    file_uid = 'f' + uuid.uuid4().hex[1:]  # Fixes NaturalNameWarning by ensuring first char is letter ('f').

    with HDFStore(str(self.hdf_path)) as store:
        # Separate data into groups by data type (GPS & Gravity Data)
        # format: 'table' pytables format enables searching/appending, fixed is more performant.
        store.put('{}/{}'.format(packet.data_type, file_uid), packet.data,
                  format='fixed', data_columns=True)

    # Store a reference to the original file path
    self.data_map[file_uid] = packet.path

    try:
        flight = self.flights[packet.flight.uid]
        if packet.data_type == 'gravity':
            flight.gravity = file_uid
        elif packet.data_type == 'gps':
            flight.gps = file_uid
    except KeyError:
        return False
def save_cache_file(data, cache_file, **kwargs):
    """Save minitree dataframe + cut history to a cache file

    Any kwargs will be passed to pandas HDFStore. Defaults are:
        complib='blosc'
        complevel=9
    """
    kwargs.setdefault('complib', 'blosc')
    kwargs.setdefault('complevel', 9)

    dirname = os.path.dirname(cache_file)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    store = pd.HDFStore(cache_file, **kwargs)
    store.put('data', data)

    # Store the cuts history for the data
    store.get_storer('data').attrs.cut_history = cuts._get_history(data)
    store.close()
def __init__(self):
    with pd.HDFStore("../input/train.h5", "r") as hfdata:
        self.timestamp = 0
        fullset = hfdata.get("train")
        self.unique_timestamp = fullset["timestamp"].unique()
        # Get a list of unique timestamps
        # use the first half for training and
        # the second half for the test set
        n = len(self.unique_timestamp)
        i = int(n / 2)
        timesplit = self.unique_timestamp[i]
        self.n = n
        self.unique_idx = i
        self.train = fullset[fullset.timestamp < timesplit]
        self.test = fullset[fullset.timestamp >= timesplit]

        self.y_test_full = self.test['y']  # Just in case the full labels are needed later
        self.y_pred_full = []
        self.temp_test_y = None

        self.ID_COL_NAME = 'id'
        self.SAMPLE_COL_NAME = 'sample'
        self.TARGET_COL_NAME = 'y'
        self.TIME_COL_NAME = 'timestamp'
def _init_frame_node_parm_with_data(self):
    """
    init parms that need to be calculated from the data
    :return:
    """
    try:
        store = pd.HDFStore(self.input_paths[0])
        chunk = store.select('table1', start=0, stop=100)

        for col_name in self.encode_col:
            if (self.encode_len.get(col_name) == None):
                if (chunk[col_name].dtype in ['int', 'float']):
                    self.encode_len[col_name] = 1
                    self.input_size = self.input_size + 1
                else:
                    self.encode_len[col_name] = self.word_vector_size
                    self.input_size = self.input_size + self.word_vector_size
                    self.encode_onehot[col_name] = OneHotEncoder(self.word_vector_size)
                self.encode_dtype[col_name] = str(chunk[col_name].dtype)
    except Exception as e:
        raise Exception("error on wcnn feed parm prepare : {0}".format(e))
def _frame_parser(self, file_path, index):
    """
    parse nlp data
    :return:
    """
    try:
        store = pd.HDFStore(file_path)
        chunk = store.select('table1', start=index.start, stop=index.stop)
        input_vector = []
        count = index.stop - index.start
        for col_name in self.encode_col:
            if (chunk[col_name].dtype == 'O'):
                input_vector.append(list(map(lambda x: self.encode_onehot[col_name].get_vector(x),
                                             chunk[col_name][0:count].tolist())))
            else:
                input_vector.append(np.array(list(map(lambda x: [self._filter_nan(x)],
                                                      chunk[col_name][0:count].tolist()))))
        return self._flat_data(input_vector, len(chunk[col_name][0:count].tolist()))
    except Exception as e:
        raise Exception(e)
    finally:
        store.close()
def _nlp_parser(self, file_path, index):
    """
    parse nlp data
    :return:
    """
    try:
        store = pd.HDFStore(file_path)
        chunk = store.select('table1', start=index.start, stop=index.stop)
        count = index.stop - index.start
        if (self.encode_col in chunk):
            encode = self.encode_pad(self._preprocess(chunk[self.encode_col].values)[0:count],
                                     max_len=self.encode_len)
            return self._word_embed_data(self.embed_type, encode)
        else:
            warnings.warn("not exists column names requested !!")
            return [['#'] * self.encode_len]
    except Exception as e:
        raise Exception(e)
    finally:
        store.close()
def _convert_data_format(self, file_path, index):
    """
    :param obj:
    :param index:
    :return:
    """
    try:
        return_data = []
        store = pd.HDFStore(file_path)
        chunk = store.select('table1', start=index.start, stop=index.stop)
        for column in self.column_list:
            for line in self._preprocess(chunk[column].values)[index.start:index.stop]:
                return_data = return_data + line
        return [return_data]
    except Exception as e:
        raise Exception(e)
    finally:
        store.close()
def test_grbm_reload():
    vis_layer = layers.BernoulliLayer(num_vis)
    hid_layer = layers.GaussianLayer(num_hid)

    # create some extrinsics
    grbm = model.Model([vis_layer, hid_layer])

    with tempfile.NamedTemporaryFile() as file:
        # save the model
        store = pandas.HDFStore(file.name, mode='w')
        grbm.save(store)
        store.close()

        # reload
        store = pandas.HDFStore(file.name, mode='r')
        grbm_reload = model.Model.from_saved(store)
        store.close()

    # check the two models are consistent
    vis_data = vis_layer.random((num_samples, num_vis))
    data_state = model_utils.State.from_visible(vis_data, grbm)
    dropout_scale = model_utils.State.dropout_rescale(grbm)
    vis_orig = grbm.deterministic_iteration(1, data_state, dropout_scale).units[0]
    vis_reload = grbm_reload.deterministic_iteration(1, data_state, dropout_scale).units[0]

    assert be.allclose(vis_orig, vis_reload)
def save(self, store: pandas.HDFStore) -> None:
    """
    Save a model to an open HDFStore.

    Notes:
        Performs an IO operation.

    Args:
        store (pandas.HDFStore)

    Returns:
        None
    """
    # save the config as an attribute
    config = self.get_config()
    store.put('model', pandas.DataFrame())
    store.get_storer('model').attrs.config = config
    # save the parameters
    for i in range(self.num_weights):
        key = os.path.join('weights', 'weights' + str(i))
        self.weights[i].save_params(store, key)
    for i in range(self.num_layers):
        key = os.path.join('layers', 'layers' + str(i))
        self.layers[i].save_params(store, key)
def save_params(self, store, key):
    """
    Save the parameters to a HDFStore.

    Notes:
        Performs an IO operation.

    Args:
        store (pandas.HDFStore): the writeable stream for the params.
        key (str): the path for the layer params.

    Returns:
        None
    """
    for i, ip in enumerate(self.params):
        df_params = pandas.DataFrame(be.to_numpy_array(ip))
        store.put(os.path.join(key, 'parameters', 'key' + str(i)), df_params)
def load_params(self, store, key):
    """
    Load the parameters from an HDFStore.

    Notes:
        Performs an IO operation.

    Args:
        store (pandas.HDFStore): the readable stream for the params.
        key (str): the path for the layer params.

    Returns:
        None
    """
    params = []
    for i, ip in enumerate(self.params):
        params.append(be.float_tensor(
            store.get(os.path.join(key, 'parameters', 'key' + str(i))).as_matrix()
        ).squeeze())  # collapse trivial dimensions to a vector
    self.params = self.params.__class__(*params)
def get_trajs_mat(self, cols, traj):
    if traj == 'avg_probe_pp':
        with pd.HDFStore(self.pp_traj_df_path, mode='r') as store:
            df_traj = store.select('pp_traj_df', columns=cols)
            trajs_mat = df_traj.values.transpose()
    elif traj == 'avg_probe_ba':
        with pd.HDFStore(self.ba_traj_df_path, mode='r') as store:
            df_traj = store.select('ba_traj_df', columns=cols)
            trajs_mat = df_traj.values.transpose()
    elif 'cat_task' in traj:
        with pd.HDFStore(self.cat_task_traj_df_path, mode='r') as store:
            columns = [traj.replace('cat_task_', '') + '_fold{}'.format(i) for i in cols]
            df_traj = store.select('cat_task_traj_df', columns=columns)
            trajs_mat = df_traj.values.transpose()
    elif 'syn_task' in traj:
        with pd.HDFStore(self.syn_task_traj_df_path, mode='r') as store:
            columns = [traj.replace('syn_task_', '') + '_fold{}'.format(i) for i in cols]
            df_traj = store.select('syn_task_traj_df', columns=columns)
            trajs_mat = df_traj.values.transpose()
    else:
        raise AttributeError('rnnlab: Invalid argument passed to "traj".')
    return trajs_mat
def __iter__(self, gen_type='train', batch_size=None, shuffle_block=False,
             random_sample=False, split_fields=False, on_disk=True,
             squeeze_output=False, **kwargs):
    gen_type = gen_type.lower()

    if on_disk:
        print('on disk...')

        for hdf_X, hdf_y in self._files_iter_(gen_type=gen_type, shuffle_block=shuffle_block):
            # num_of_lines = pd.HDFStore(hdf_y, mode='r').get_storer('fixed').shape[0]
            X_all = pd.read_hdf(hdf_X, mode='r').as_matrix()
            y_all = pd.read_hdf(hdf_y, mode='r').as_matrix()
            gen = self.generator(X_all, y_all, batch_size, shuffle=random_sample)
            for X, y in gen:
                if split_fields:
                    X = np.split(X, self.max_length, axis=1)
                    for i in range(self.max_length):
                        X[i] -= self.feat_min[i]
                if squeeze_output:
                    y = y.squeeze()
                yield X, y
    else:
        print('not implemented')
def read_offsets(offsets_file):
    offsets = np.zeros(
        shape=(8, 2, 4096, 40),
        dtype='f4')

    def name_to_channel_gain_id(name):
        _, channel, gain = name.split('_')
        channel = int(channel)
        gain_id = {'high': 0, 'low': 1}[gain]
        return channel, gain_id

    with pd.HDFStore(offsets_file) as st:
        for name in st.keys():
            channel, gain_id = name_to_channel_gain_id(name)
            df = st[name]
            df.sort_values(["cell", "sample"], inplace=True)
            offsets[channel, gain_id] = df["median"].values.reshape(-1, 40)

    return offsets
def cacheData(self, data_specification):
    '''
    Cache Kepler data locally

    @param data_specification: List of kepler IDs
    '''
    kid_list = data_specification

    data_location = DataFetcher.getDataLocation('kepler')
    if data_location == None:
        data_location = os.path.join(os.path.expanduser('~'), '.skdaccess', 'kepler')
        os.makedirs(data_location, exist_ok=True)
        data_location = os.path.join(data_location, 'kepler_data.h5')
        DataFetcher.setDataLocation('kepler', data_location)

    store = pd.HDFStore(data_location)

    missing_kid_list = []
    for kid in kid_list:
        if 'kid_' + kid not in store:
            missing_kid_list.append(kid)

    if len(missing_kid_list) > 0:
        print("Downloading data for " + str(len(missing_kid_list)) + " star(s)")
        missing_kid_data = self.downloadKeplerData(missing_kid_list)
        for kid, data in missing_kid_data.items():
            store.put('kid_' + kid, data)

    store.close()
def output(self):
    '''
    Output kepler data wrapper

    @return DataWrapper
    '''
    kid_list = self.ap_paramList[0]()
    kid_list = [str(kid).zfill(9) for kid in kid_list]

    self.cacheData(kid_list)

    data_location = DataFetcher.getDataLocation('kepler')

    kid_data = dict()

    store = pd.HDFStore(data_location)

    for kid in kid_list:
        kid_data[kid] = store['kid_' + kid]
        # If downloaded using old skdaccess version
        # switch index
        if kid_data[kid].index.name == 'TIME':
            kid_data[kid]['TIME'] = kid_data[kid].index
            kid_data[kid].set_index('CADENCENO', inplace=True)

    store.close()

    kid_data = OrderedDict(sorted(kid_data.items(), key=lambda t: t[0]))

    # If a list of quarters is specified, only select data in those quarters
    if self.quarter_list != None:
        for kid in kid_list:
            kid_data[kid] = kid_data[kid][kid_data[kid]['QUARTER'].isin(self.quarter_list)]

    return TableWrapper(kid_data, default_columns=['PDCSAP_FLUX'],
                        default_error_columns=['PDCSAP_FLUX_ERR'])
def _rawData(self):
    '''
    Select data from sites within site radius to be returned without stabilization.
    '''
    storeName = self.meta_data
    keyList = self._validStations(storeName)

    if len(keyList) == 0:
        self._validInit = 0
    else:
        storeData_fn = DataFetcher.getDataLocation('pbo')
        if storeData_fn is None:
            print('Dataset not available')
            return None

        storeData = pd.HDFStore(storeData_fn)
        mdyratio = self._mdyratio

        smSet_all, smHdr_all = pbo_util.nostab_sys(storeName, storeData,
                                                   [self._start_time, self._end_time],
                                                   indx=keyList, mdyratio=mdyratio,
                                                   use_progress_bar=self.use_progress_bar)

        self._smSet_all = smSet_all
        self._smHdr_all = smHdr_all
        storeData.close()

        if len(self._smSet_all) == 0:
            self._validInit = 0
        else:
            self._validInit = 1
def getAntennaLogs():
    '''
    Get antenna logs.

    @return dictionary of data frames containing antenna logs
    '''
    meta_data = DataFetcher.getStationMetadata()

    storeData_fn = DataFetcher.getDataLocation('pbo')
    if storeData_fn is None:
        print('Dataset not available')
        return None

    store = pd.HDFStore(storeData_fn, 'r')
    logs = store['/antenna_logs']

    antenna_dict = dict()

    # for label in meta_data.keys():
    #     try:
    #         antenna_dict[label] = store['/antenna_log_' + label]
    #     except:
    #         pass

    for label in meta_data.keys():
        if len(logs[logs['Station'] == label]) > 0:
            antenna_dict[label] = logs[logs['Station'] == label]['Date']

    store.close()

    return antenna_dict
def getStationMetadata():
    '''
    Get station metadata

    @return data frame of station metadata
    '''
    store_location = data_util.getDataLocation('ngl_gps')
    store = pd.HDFStore(store_location, 'r')
    metadata = store['metadata']
    store.close()

    metadata.loc[:, 'Lon'] = (metadata.loc[:, 'Lon'] + 180) % 360 - 180

    return metadata
def save_dataframe(self, df):
    with pd.HDFStore(''.join([TwoSigmaFinModTools._save_path, 'train_debug',
                              self.timestamp, '.h5']), "w") as train:
        train.put("train_debug", df)
def load_dataframe():
    dataframe_name = 'train_debug'

    # one-hot encoded
    # not one-hot
    # date_time = '20170613_19h09m40s'
    # date_time = '20170613_19h34m31s'
    # date_time = '20170614_00h07m32s'
    date_time = '20170619_11h47m22s'

    with pd.HDFStore(''.join([TwoSigmaFinModTools._save_path, dataframe_name,
                              date_time, '.h5']), 'r') as train:
        return train.get(dataframe_name)
def write_hdf(hdf_fname, df, key, header):
    """
    Output the contents of *df* and *header* to the HDF file
    *hdf_fname* under identifier *key*.
    """
    with PD.HDFStore(hdf_fname) as store:
        store.put(key, df)
        store.get_storer(key).attrs.header = header
    return hdf_fname
def read_hdf(hdf_fname, key):
    """
    Read contents of HDF file *hdf_fname* associated with *key* and
    return a :class:`DataFrame`, header tuple.
    """
    if not os.path.isfile(hdf_fname):
        raise ValueError('file {} does not exist'.format(hdf_fname))
    with PD.HDFStore(hdf_fname) as store:
        df = store.get(key)
        header = store.get_storer(key).attrs.header
    return df, header
def setUp(self):
    self.test_output_dir = mkdtemp(prefix="feagen_test_output_")
    pandas_hdf_path = join(self.test_output_dir, "pandas.h5")
    self.hdf_store = pd.HDFStore(pandas_hdf_path)
def __init__(self, hdf_path):
    hdf_dir = os.path.dirname(hdf_path)
    if hdf_dir != '':
        mkdir_p(hdf_dir)
    self.hdf_store = pd.HDFStore(hdf_path)
def _get_head_of_matrix(self):
    try:
        hdf = pandas.HDFStore(self.matrix_path)
        key = hdf.keys()[0]
        head_of_matrix = hdf.select(key, start=0, stop=1)
        head_of_matrix.set_index(self.metadata['indices'], inplace=True)
        self._head_of_matrix = head_of_matrix
    except pandas.errors.EmptyDataError:
        self._head_of_matrix = None
def _write_hdf_to_buffer(self, df):
    with pandas.HDFStore(
        "data.h5",
        mode="w",
        driver="H5FD_CORE",
        driver_core_backing_store=0
    ) as out:
        out["matrix"] = df
        return out._handle.get_file_image()
def export_data_table(self, table, end_date, label, feature_names):
    """ Save a data set as an HDF table for later reuse.

    :param table: the DataFrame to save
    :type table: pandas DataFrame
    :param end_date: end of labeling period
    :type end_date: a date format of some kind
    :param label: name of the column containing labels
    :type label: str
    :param feature_names: names of the columns containing features
    :type feature_names: list
    :return: the prefix of the HDF filename
    :rtype: str
    """
    if type(end_date) == np.datetime64:
        end_date = np.datetime_as_string(end_date, timezone='local')[:10]
    else:
        end_date = end_date.to_datetime().date().isoformat()

    file_name = self.export_metadata(end_date, label, feature_names)
    file_path = '{0}/{1}.h5'.format(self.results_directory, file_name)

    if not os.path.exists(file_path):
        store = pd.HDFStore(file_path)
        store['df'] = table
        store.close()
        self.upload_file_to_s3('{0}.h5'.format(file_name), 'hdf_bucket_name', file_path)
        print("uploaded hdf to s3")

    return file_name
def __init__(self, hdfstore, tablename):
    if isinstance(hdfstore, pd.HDFStore):
        self.store = hdfstore
    else:
        self.store = pd.HDFStore(hdfstore, "r")
    self.tablename = tablename
def __init__(self, store, tablename, vecnamecol, lengthcol):
    if isinstance(store, pd.HDFStore):
        self.store = store
    else:
        self.store = pd.HDFStore(store, "r")
    self.tablename = tablename
    self.vecnamecol = vecnamecol
    self.lengthcol = lengthcol
def __init__(self, storesdict):
    self.stores = {}
    for key in storesdict:
        if isinstance(storesdict[key], pd.HDFStore):
            self.stores[key] = storesdict[key]
        else:
            self.stores[key] = pd.HDFStore(storesdict[key], "r")
    self.indices = {}
def load_data(self, uid: str, prefix: str):
    """
    Load data from a specified group (prefix) - gps or gravity - from the
    project's HDF5 store.

    :param str uid: Datafile Unique Identifier
    :param str prefix: Data type prefix [gps or gravity]
    :return:
    """
    with HDFStore(str(self.hdf_path)) as store:
        try:
            data = store.get('{}/{}'.format(prefix, uid))
        except KeyError:
            return None
        else:
            return data
def load(args, filename):
    return pandas.HDFStore(filename)
def query_from_to(symbol, Start, today):
    store = pd.HDFStore(Constants.StockHDF)
    if not (symbol in store.keys()):
        return None
    # NOTE: the original snippet is truncated here ("store[]"); returning the
    # stored frame sliced to the requested date range is an assumption.
    return store[symbol].loc[Start:today]
def get_taylor_table():
    store = pd.HDFStore(Constants.DatabaseTaylorCP)
    StoredDF = pd.DataFrame()
    for key in store.keys():
        DF = store[key].tail(1)
        DF['SymbolID'] = key
        StoredDF = pd.concat([StoredDF, DF[['SymbolID', 'MO', 'MLo', 'MHi', 'TaylorDay']]], axis=0)
    store.close()
    return StoredDF.to_html()
def calc_oz_series_pandas(symbol, numWeeksBack=20, averageTf='W'):
    timeFrameMap = {'W': (1 * numWeeksBack),
                    '3M': (numWeeksBack * 15),
                    'Q': (numWeeksBack * 15),
                    'M': (numWeeksBack * 4)}
    print(Constants.StockHDF)
    store = pd.HDFStore(Constants.StockHDF)
    symbolKey = symbol + '_' + averageTf
    today = datetime.datetime.now()  # - datetime.timedelta(days=1)
    day_of_week = today.weekday()
    weekStart = today - datetime.timedelta(days=day_of_week + 1)

    if not (symbolKey in store.keys()):
        print('Symbol:' + symbol)
        weekly_DF = getWeeklyDF(timeFrameMap[averageTf], symbol)
        #print(weekly_DF)
        newDF = calc_OZ_pandas(weekly_DF, averageTf=averageTf)
        store[symbolKey] = newDF
        store.flush()

    #print('READ')
    lenStore = len(store[symbolKey]) - 1
    if not (store[symbolKey].index[lenStore].date() == weekStart.date()):
        weekly_DF = getWeeklyDF(timeFrameMap[averageTf], symbol)
        newDF = calc_OZ_pandas(weekly_DF, averageTf=averageTf)
        store[symbolKey] = newDF
        store.flush()

    return store[symbolKey]