Python pandas module: DataFrame() example source code
The following 50 code examples, extracted from open-source Python projects, illustrate how pandas.DataFrame() is used.
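Before the extracted examples, here is a minimal, self-contained construction call for orientation (a sketch with made-up column names, not code from any of the projects below):

import pandas as pd

# Build a small frame from a dict of equal-length columns; dtypes are inferred per column.
df = pd.DataFrame({'name': ['alice', 'bob'], 'score': [1.5, 2.0]})
print(df.dtypes)
print(df.head())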
def aggregate_ohlcv_panel(self,
fields,
ohlcv_panel,
items=None,
minor_axis=None):
"""
Convert an OHLCV Panel into a DataFrame by aggregating each field's
frame into a Series.
"""
vals = ohlcv_panel
if isinstance(ohlcv_panel, pd.Panel):
vals = ohlcv_panel.values
items = ohlcv_panel.items
minor_axis = ohlcv_panel.minor_axis
data = [
self.frame_to_series(
field,
vals[items.get_loc(field)],
minor_axis
)
for field in fields
]
return np.array(data)
def y_sum_by_time(x_arr, y_arr, top=None):
df = pd.DataFrame({'Timestamp': pd.to_datetime(x_arr, unit='s'), 'Status': y_arr})
df['Date'] = df['Timestamp'].apply(lambda x: "%d/%d/%d" % (x.day, x.month, x.year))
df['Hour'] = df['Timestamp'].apply(lambda x: "%d" % (x.hour))
df['Weekday'] = df['Timestamp'].apply(lambda x: "%s" % (x.weekday_name))
times = ['Hour', 'Weekday', 'Date']
result = {}
for groupby in times:
df_group = df.groupby(groupby, as_index=False).agg({'Status': np.sum})
        if top is not None and top > 0:
            # keep only the `top` groups with the largest Status sums
            idx = df_group.index.isin(df_group.nlargest(top, 'Status').index)
        else:
            idx = df_group['Status'].max() == df_group['Status']
result[groupby] = {k: g['Status'].replace(np.nan, 'None').tolist() for k,g in df_group[idx].groupby(groupby)}
return result
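A hypothetical call to y_sum_by_time above, with made-up timestamps and status values (it assumes a pandas version where Timestamp.weekday_name is still available):

import numpy as np

# Epoch-second timestamps and a numeric status per event (illustrative values only).
x_arr = np.array([1490000000, 1490003600, 1490007200, 1490093600])
y_arr = np.array([3, 5, 2, 7])

summary = y_sum_by_time(x_arr, y_arr, top=2)
print(summary['Hour'])     # status sums for the top hours of day
print(summary['Weekday'])  # status sums for the top weekday names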
def get_citation_df(args, text):
"""
Generate citation_df and save it to 'citations.tsv'.
"""
citation_df = pandas.DataFrame(
{'string': get_citation_strings(text)}
)
if args.citation_tags_path.is_file():
tag_df = pandas.read_table(args.citation_tags_path)
tag_df['string'] = '@tag:' + tag_df.tag
for citation in tag_df.citation:
is_valid_citation_string('@' + citation)
citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
else:
citation_df['citation'] = None
logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
citation_df.citation.fillna(citation_df.string.astype(str).str.lstrip('@'), inplace=True)
citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
citation_df = citation_df.sort_values(['standard_citation', 'citation'])
citation_df.to_csv(args.citations_path, sep='\t', index=False)
check_collisions(citation_df)
check_multiple_citation_strings(citation_df)
return citation_df
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
# print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].genes)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
return (1 / np.sum(f1))
# Main
def rhoA(self):
# rhoA
rhoA = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
for i in range(self.lenlatent):
weights = pd.DataFrame(self.outer_weights[self.latent[i]])
weights = weights[(weights.T != 0).any()]
result = pd.DataFrame.dot(weights.T, weights)
result_ = pd.DataFrame.dot(weights, weights.T)
S = self.data_[self.Variables['measurement'][
self.Variables['latent'] == self.latent[i]]]
S = pd.DataFrame.dot(S.T, S) / S.shape[0]
numerador = (
np.dot(np.dot(weights.T, (S - np.diag(np.diag(S)))), weights))
denominador = (
(np.dot(np.dot(weights.T, (result_ - np.diag(np.diag(result_)))), weights)))
rhoA_ = ((result)**2) * (numerador / denominador)
if(np.isnan(rhoA_.values)):
rhoA[self.latent[i]] = 1
else:
rhoA[self.latent[i]] = rhoA_.values
return rhoA.T
def xloads(self):
# Xloadings
A = self.data_.transpose().values
B = self.fscores.transpose().values
A_mA = A - A.mean(1)[:, None]
B_mB = B - B.mean(1)[:, None]
ssA = (A_mA**2).sum(1)
ssB = (B_mB**2).sum(1)
xloads_ = (np.dot(A_mA, B_mB.T) /
np.sqrt(np.dot(ssA[:, None], ssB[None])))
xloads = pd.DataFrame(
xloads_, index=self.manifests, columns=self.latent)
return xloads
def alpha(self):
# Cronbach Alpha
alpha = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
for i in range(self.lenlatent):
block = self.data_[self.Variables['measurement']
[self.Variables['latent'] == self.latent[i]]]
p = len(block.columns)
if(p != 1):
p_ = len(block)
correction = np.sqrt((p_ - 1) / p_)
soma = np.var(np.sum(block, axis=1))
cor_ = pd.DataFrame.corr(block)
denominador = soma * correction**2
numerador = 2 * np.sum(np.tril(cor_) - np.diag(np.diag(cor_)))
alpha_ = (numerador / denominador) * (p / (p - 1))
alpha[self.latent[i]] = alpha_
else:
alpha[self.latent[i]] = 1
return alpha.T
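For comparison, a minimal sketch of the textbook covariance-form Cronbach's alpha on a synthetic block of indicators; the alpha() method above computes a correlation-based variant with a population-variance correction, so the numbers will not match exactly:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
block = pd.DataFrame(rng.normal(size=(50, 4)), columns=list('abcd'))  # synthetic indicator block

k = block.shape[1]
item_vars = block.var(axis=0, ddof=1).sum()    # sum of per-item variances
total_var = block.sum(axis=1).var(ddof=1)      # variance of the summed score
alpha = (k / (k - 1)) * (1 - item_vars / total_var)
print(alpha)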
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_ga(self, item):
output = pd.DataFrame(self.population[item].genes)
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_pso(self, item):
output = pd.DataFrame(self.population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_tabu(self, item):
output = pd.DataFrame(self.population[item])
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
cost = (np.sum(f1))
print(1 / cost)
return [self.population[item], cost]
def apply(path):
data = metadata.load(path)
for service in data["services"]:
filename = os.path.join(path, service["filename"])
df = load_timeseries(filename, service)
print(service)
df2 = interpolate_missing(df[service["fields"]])
classes = classify_series(df2)
preprocessed_series = {}
for k in classes["other_fields"]:
            # trim the first value so these series match the length of the diff()'d monotonic fields below
preprocessed_series[k] = df2[k][1:]
for k in classes["monotonic_fields"]:
preprocessed_series[k + "-diff"] = df2[k].diff()[1:]
newname = service["name"] + "-preprocessed.tsv.gz"
df3 = pd.DataFrame(preprocessed_series)
df3.to_csv(os.path.join(path, newname), sep="\t", compression='gzip')
service["preprocessed_filename"] = newname
service["preprocessed_fields"] = list(df3.columns)
service.update(classes)
metadata.save(path, data)
def centroids(path):
metadata = load_metadata(path)
d = {}
for srv in metadata["services"]:
name = "%s/%s-cluster-1_1.tsv" % (path, srv["name"])
df = pd.read_csv(name, sep="\t", index_col='time', parse_dates=True)
d[srv["name"]] = df.centroid
df2 = pd.DataFrame(d)
    df2 = df2.fillna(method="bfill", limit=int(1e9))
    df2 = df2.fillna(method="ffill", limit=int(1e9))
fig = df2.plot()
handles, labels = fig.get_legend_handles_labels()
fig.grid('on')
lgd = fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5,-0.1))
plt.savefig("graph.png", bbox_extra_artists=(lgd,), bbox_inches='tight')
plt.close("all")
def test_pd_outer_join():
dfs = [
pd.DataFrame({
'id': [0, 1, 2, 3],
'a': ['foo', 'bar', 'baz', np.nan],
'b': ['panda', 'zebra', np.nan, np.nan],
}),
pd.DataFrame({
'id': [1, 2, 3, 4],
'b': ['mouse', np.nan, 'tiger', 'egret'],
'c': ['toe', 'finger', 'nose', np.nan],
}),
]
expected = pd.DataFrame({
'id': [0, 1, 2, 3, 4],
'a': ['foo', 'bar', 'baz', np.nan, np.nan],
'b': ['panda', 'zebra', np.nan, 'tiger', 'egret'],
'c': [np.nan, 'toe', 'finger', 'nose', np.nan],
}).set_index('id')
actual = pd_outer_join(dfs, on='id')
print(expected)
print(actual)
assert expected.equals(actual)
def read_image(imagery_path):
# Read image
dataset = gdal.Open(imagery_path)
dsmatrix = dataset.ReadAsArray(xoff=0, yoff=0, xsize=dataset.RasterXSize, ysize=dataset.RasterYSize)
# Get Geographic meta data
geo_trans_list = dataset.GetGeoTransform()
proj_str = dataset.GetProjection()
num_bands = dataset.RasterCount
    # Handle single-band and multi-band imagery
if num_bands > 1:
# Unfold array into pandas DataFrame
rows = dsmatrix.shape[1]
cols = dsmatrix.shape[2]
data_array = dsmatrix[:,0,:]
for irow in range(1,rows):
tempmatirx = dsmatrix[:,irow,:]
data_array = np.hstack((data_array,tempmatirx))
else:
# Unfold array into pandas DataFrame
rows = dsmatrix.shape[0]
cols = dsmatrix.shape[1]
data_array = dsmatrix[0,:]
for irow in range(1,rows):
tempmatirx = dsmatrix[irow,:]
data_array = np.hstack((data_array,tempmatirx))
data_frame = pd.DataFrame(data_array.T)
return data_frame, rows, cols, geo_trans_list, proj_str, num_bands
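The unfolding step above can be illustrated without GDAL on a synthetic array (band count, rows, and cols below are made-up values):

import numpy as np
import pandas as pd

num_bands, rows, cols = 3, 4, 5
dsmatrix = np.random.rand(num_bands, rows, cols)   # stands in for dataset.ReadAsArray(...)

# Stack the rows side by side, then transpose so the frame has
# one row per pixel and one column per band.
data_array = np.hstack([dsmatrix[:, irow, :] for irow in range(rows)])
data_frame = pd.DataFrame(data_array.T)
print(data_frame.shape)  # (rows * cols, num_bands)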
def __init__(self, *args, **kwargs):
    '''
    Accepts the same arguments as pandas.DataFrame:
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
    The data argument should be a list of XSeries objects or a dict of XSeries objects.
    If a dict is passed, each key must be a string naming the corresponding column.
    For example, to create an XDataFrame, data should look like
    data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n} where each s_i is an XSeries.
    '''
data = kwargs.get('data')
if data is None:
data = args[0]
data_to_check = []
if isinstance(data, list):
data_to_check = data
elif isinstance(data, dict):
data_to_check = data.values()
for d in data_to_check:
if not isinstance(d, XSeries):
raise ValueError('All data must be XSeries instances')
super(XDataFrame, self).__init__(*args, **kwargs)
def to_pandas_dataframe(self):
'''
Convert self to pandas.DataFrame if all columns are primitive types.
See more at XSeries.to_pandas_series
:return:
'''
data_types = self.get_data_types()
is_all_columns_are_primitive = all(
_is_class_a_primitive(dt)
for dt in data_types
)
if is_all_columns_are_primitive:
self.__class__ = pd.DataFrame
else:
raise ValueError('Unable to cast to pd.DataFrame. {} is not all primitives.'.format(self.data_types))
return self
def _create_daily_stats(self, perfs):
# create daily and cumulative stats dataframe
daily_perfs = []
# TODO: the loop here could overwrite expected properties
# of daily_perf. Could potentially raise or log a
# warning.
for perf in perfs:
if 'daily_perf' in perf:
perf['daily_perf'].update(
perf['daily_perf'].pop('recorded_vars')
)
perf['daily_perf'].update(perf['cumulative_risk_metrics'])
daily_perfs.append(perf['daily_perf'])
else:
self.risk_report = perf
daily_dts = [np.datetime64(perf['period_close'], utc=True)
for perf in daily_perfs]
daily_stats = pd.DataFrame(daily_perfs, index=daily_dts)
return daily_stats
def _pipeline_output(self, pipeline, chunks):
"""
Internal implementation of `pipeline_output`.
"""
today = normalize_date(self.get_datetime())
try:
data = self._pipeline_cache.unwrap(today)
except Expired:
data, valid_until = self._run_pipeline(
pipeline, today, next(chunks),
)
self._pipeline_cache = CachedObject(data, valid_until)
# Now that we have a cached result, try to return the data for today.
try:
return data.loc[today]
except KeyError:
# This happens if no assets passed the pipeline screen on a given
# day.
return pd.DataFrame(index=[], columns=data.columns)
def frame_from_bardata(self, data, algo_dt):
"""
Create a DataFrame from the given BarData and algo dt.
"""
data = data._data
frame_data = np.empty((len(self.fields), len(self.sids))) * np.nan
for j, sid in enumerate(self.sids):
sid_data = data.get(sid)
if not sid_data:
continue
if algo_dt != sid_data['dt']:
continue
for i, field in enumerate(self.fields):
frame_data[i, j] = sid_data.get(field, np.nan)
return pd.DataFrame(
frame_data,
index=self.fields.copy(),
columns=self.sids.copy(),
)
def update_dividends(self, new_dividends):
"""
Update our dividend frame with new dividends. @new_dividends should be
a DataFrame with columns containing at least the entries in
zipline.protocol.DIVIDEND_FIELDS.
"""
# Mark each new dividend with a unique integer id. This ensures that
# we can differentiate dividends whose date/sid fields are otherwise
# identical.
new_dividends['id'] = np.arange(
self._dividend_count,
self._dividend_count + len(new_dividends),
)
self._dividend_count += len(new_dividends)
self.dividend_frame = sort_values(pd.concat(
[self.dividend_frame, new_dividends]
), ['pay_date', 'ex_date']).set_index('id', drop=False)
def create_test_panel_ohlc_source(sim_params, env):
start = sim_params.first_open \
if sim_params else pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc)
end = sim_params.last_close \
if sim_params else pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc)
index = env.days_in_range(start, end)
price = np.arange(0, len(index)) + 100
high = price * 1.05
low = price * 0.95
open_ = price + .1 * (price % 2 - .5)
volume = np.ones(len(index)) * 1000
arbitrary = np.ones(len(index))
df = pd.DataFrame({'price': price,
'high': high,
'low': low,
'open': open_,
'volume': volume,
'arbitrary': arbitrary},
index=index)
panel = pd.Panel.from_dict({0: df})
return DataPanelSource(panel), panel
def add_frame(self, tick, frame, minor_axis=None, items=None):
"""
"""
if self._pos == self.cap:
self._roll_data()
if isinstance(frame, pd.DataFrame):
minor_axis = frame.columns
items = frame.index
if set(minor_axis).difference(set(self.minor_axis)) or \
set(items).difference(set(self.items)):
self._update_buffer(frame)
vals = frame.T.astype(self.dtype)
self.buffer.loc[:, self._pos, :] = vals
self.date_buf[self._pos] = tick
self._pos += 1
def getindexdaily(self,code,start,end):
total=[]
startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
enddate=datetime.datetime.strptime(end, "%Y-%m-%d")
series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[]}
for stockdaily in self.index[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date"):
series["date"].append(stockdaily["date"])
series["open"].append(stockdaily["open"])
series["close"].append(stockdaily["close"])
series["high"].append(stockdaily["high"])
series["low"].append(stockdaily["low"])
series["volume"].append(stockdaily["volume"])
totaldata=zip(series['date'],series['open'],series['close'],series['high'],series['low'],series['volume'])
    df = pd.DataFrame(list(totaldata),
                      columns=['date', 'open', 'close', 'high', 'low', 'volume'])
    df.index = df.date
return df
def read_treasure_from_mongodb(self,start,end):
startdate=start
enddate=end
series={"Time Period":[],"1month":[],"3month":[],"6month":[],"1year":[],"2year":[],"3year":[],"5year":[],"7year":[],"10year":[],"20year":[],"30year":[]}
if type(start) is types.StringType:
startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
if type(end) is types.StringType:
enddate=datetime.datetime.strptime(end, "%Y-%m-%d")
for treasuredaily in self.treasure['treasure'].find({"Time Period": {"$gte": startdate,"$lt":enddate}}).sort("date"):
series["Time Period"].append(treasuredaily["Time Period"])
series["1month"].append(treasuredaily["1month"])
series["3month"].append(treasuredaily["3month"])
series["6month"].append(treasuredaily["6month"])
series["1year"].append(treasuredaily["1year"])
series["2year"].append(treasuredaily["2year"])
series["3year"].append(treasuredaily["3year"])
series["5year"].append(treasuredaily["5year"])
series["7year"].append(treasuredaily["7year"])
series["10year"].append(treasuredaily["10year"])
series["20year"].append(treasuredaily["20year"])
series["30year"].append(treasuredaily["30year"])
totaldata=zip(series["1month"],series["3month"],series["6month"],series["1year"],series["2year"],series["3year"],series["5year"],series["7year"],series["10year"],series["20year"],series["30year"])
df = pd.DataFrame(data=list(totaldata),index=series["Time Period"],columns = ['1month', '3month','6month', '1year', '2year', '3year', '5year', '7year', '10year', '20year', '30year'])
return df.sort_index().tz_localize('UTC')
def __init__(self, constants, dates, sids):
loaders = {}
for column, const in iteritems(constants):
frame = DataFrame(
const,
index=dates,
columns=sids,
dtype=column.dtype,
)
loaders[column] = DataFrameLoader(
column=column,
baseline=frame,
adjustments=None,
)
self._loaders = loaders
def test_consume_metadata(self):
# Test dict consumption
dict_to_consume = {0: {'symbol': 'PLAY'},
1: {'symbol': 'MSFT'}}
self.env.write_data(equities_data=dict_to_consume)
finder = self.asset_finder_type(self.env.engine)
equity = finder.retrieve_asset(0)
self.assertIsInstance(equity, Equity)
self.assertEqual('PLAY', equity.symbol)
# Test dataframe consumption
df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
df['asset_name'][0] = "Dave'N'Busters"
df['exchange'][0] = "NASDAQ"
df['asset_name'][1] = "Microsoft"
df['exchange'][1] = "NYSE"
self.env = TradingEnvironment(load=noop_load)
self.env.write_data(equities_df=df)
finder = self.asset_finder_type(self.env.engine)
self.assertEqual('NASDAQ', finder.retrieve_asset(0).exchange)
self.assertEqual('Microsoft', finder.retrieve_asset(1).asset_name)
def setUp(self):
self.env = TradingEnvironment()
self.days = self.env.trading_days[:5]
self.panel = pd.Panel({1: pd.DataFrame({
'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 0],
'type': [DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.CLOSE_POSITION]},
index=self.days)
})
self.no_close_panel = pd.Panel({1: pd.DataFrame({
'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 1e9],
'type': [DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE]},
index=self.days)
})
def test_bfill(self):
# test ndim=1
N = 100
s = pd.Series(np.random.randn(N))
mask = random.sample(range(N), 10)
s.iloc[mask] = np.nan
correct = s.bfill().values
test = bfill(s.values)
assert_almost_equal(correct, test)
# test ndim=2
df = pd.DataFrame(np.random.randn(N, N))
df.iloc[mask] = np.nan
correct = df.bfill().values
test = bfill(df.values)
assert_almost_equal(correct, test)
def test_ffill(self):
# test ndim=1
N = 100
s = pd.Series(np.random.randn(N))
mask = random.sample(range(N), 10)
s.iloc[mask] = np.nan
correct = s.ffill().values
test = ffill(s.values)
assert_almost_equal(correct, test)
# test ndim=2
df = pd.DataFrame(np.random.randn(N, N))
df.iloc[mask] = np.nan
correct = df.ffill().values
test = ffill(df.values)
assert_almost_equal(correct, test)
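For context, a common NumPy-only forward fill that the ffill helper under test could plausibly resemble (a sketch under that assumption, not the project's implementation):

import numpy as np

def ffill_1d(values):
    # Propagate the last valid (non-NaN) observation forward; leading NaNs stay NaN.
    values = np.asarray(values, dtype=float)
    idx = np.where(~np.isnan(values), np.arange(len(values)), 0)
    np.maximum.accumulate(idx, out=idx)
    return values[idx]

print(ffill_1d([np.nan, 1.0, np.nan, 3.0, np.nan]))  # [nan  1.  1.  3.  3.]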
def get_expected_next_event_dates(self, dates):
return pd.DataFrame({
0: get_values_for_date_ranges(zip_with_dates,
next_dates[0],
next_date_intervals[0],
dates),
1: get_values_for_date_ranges(zip_with_dates,
next_dates[1],
next_date_intervals[1],
dates),
2: get_values_for_date_ranges(zip_with_dates,
next_dates[2],
next_date_intervals[2],
dates),
3: get_values_for_date_ranges(zip_with_dates,
next_dates[3],
next_date_intervals[3],
dates),
4: zip_with_dates(dates, ['NaT'] * len(dates)),
}, index=dates)
def get_expected_previous_event_dates(self, dates):
return pd.DataFrame({
0: get_values_for_date_ranges(zip_with_dates,
prev_dates[0],
prev_date_intervals[0],
dates),
1: get_values_for_date_ranges(zip_with_dates,
prev_dates[1],
prev_date_intervals[1],
dates),
2: get_values_for_date_ranges(zip_with_dates,
prev_dates[2],
prev_date_intervals[2],
dates),
3: get_values_for_date_ranges(zip_with_dates,
prev_dates[3],
prev_date_intervals[3],
dates),
4: zip_with_dates(dates, ['NaT'] * len(dates)),
}, index=dates)
def get_vals_for_dates(zip_date_index_with_vals,
vals,
date_invervals,
dates):
return pd.DataFrame({
0: get_values_for_date_ranges(zip_date_index_with_vals,
vals[0],
date_invervals[0],
dates),
1: get_values_for_date_ranges(zip_date_index_with_vals,
vals[1],
date_invervals[1],
dates),
2: get_values_for_date_ranges(zip_date_index_with_vals,
vals[2],
date_invervals[2],
dates),
# Assume the latest of 2 cash values is used if we find out about 2
# announcements that happened on the same day for the same sid.
3: get_values_for_date_ranges(zip_date_index_with_vals,
vals[3],
date_invervals[3],
dates),
4: zip_date_index_with_vals(dates, ['NaN'] * len(dates)),
}, index=dates)
def test_auto_deltas(self):
expr = bz.data(
{'ds': self.df,
'ds_deltas': pd.DataFrame(columns=self.df.columns)},
dshape=var * Record((
('ds', self.dshape.measure),
('ds_deltas', self.dshape.measure),
)),
)
loader = BlazeLoader()
ds = from_blaze(
expr.ds,
loader=loader,
missing_values=self.missing_values,
)
self.assertEqual(len(loader), 1)
exprdata = loader[ds]
self.assertTrue(exprdata.expr.isidentical(expr.ds))
self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
def pipeline_event_loader_args(self, dates):
_, mapping = super(
BlazeCashBuybackAuthLoaderTestCase,
self,
).pipeline_event_loader_args(dates)
return (bz.data(pd.concat(
pd.DataFrame({
BUYBACK_ANNOUNCEMENT_FIELD_NAME:
frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
CASH_FIELD_NAME:
frame[CASH_FIELD_NAME],
TS_FIELD_NAME:
frame[TS_FIELD_NAME],
SID_FIELD_NAME: sid,
})
for sid, frame in iteritems(mapping)
).reset_index(drop=True)),)
def file_get_iem_data_frame(path):
"""
Return the IEM samplesheet data as a Pandas DataFrame,
to perform better slicing operations.
"""
rows = read_csv_rows(path)
if not rows_are_iem_samplesheet(rows):
raise ValueError("Invalid IEM samplesheet format: %s" % path)
section_gen = rows_iem_section_generator(rows)
for section in section_gen:
if section_is_valid_data(section):
# TODO this appears to be a problem if you have data columns
# with trailing all-blank entries (see CSI-215 fix)
df = pd.DataFrame(data=section.rows[1:], columns=section.rows[0])
            # skip trailing rows
return df[df['Sample_ID'].notnull()]
raise ValueError("Invalid IEM samplesheet format, no data found: %s" % path)
def gen_csv_paths(data_dir, pref):
"""
Generate CSV file from image, contour, and segment file paths.
Args:
data_dir: BBBC006 data directory path.
pref: Prefix (either 'train' or 'test')
"""
filenames = get_png_files(os.path.join(data_dir, 'BBBC006_v1_' + pref))
contours = get_png_files(os.path.join(data_dir, 'BBBC006_v1_contours_'
+ pref))
segments = get_png_files(os.path.join(data_dir, 'BBBC006_v1_segments_'
+ pref))
all_files = [filenames, contours, segments]
pd_arr = pd.DataFrame(all_files).transpose()
pd_arr.to_csv(pref + '.csv', index=False, header=False)
def kraken_order_book(book_type: str, currency_code: str = 'EUR', coin_code: str = 'XBT'):
"""Kraken specific orderbook retrieval
"""
import krakenex
kraken_api = krakenex.API(key=KRAKEN_API_KEY, secret=KRAKEN_PRIVATE_KEY, conn=krakenex.Connection())
pair = f'X{coin_code}Z{currency_code}'
orders = kraken_api.query_public('Depth', {'pair': pair})
df = pd.DataFrame(
orders['result'][pair][book_type],
columns=['price', 'volume', 'timestamp'])
return df
def get_irrelevant_cited_papers(bad_papers, db_cursor, papers_table='papers'):
"""Retrieves the papers cited by the irrelevant papers given in input, from a SQL database.
Args:
bad_papers (list of dicts): the list of irrelevant papers, formatted as the output of :func:`data_retrieval.list2paper`
db_cursor (:class:`MySQLdb.cursors.Cursor`): cursor of a SQL database in which there is a papers table
papers_table (string): name of the papers table in the SQL database
Returns:
tuple of tuples: the results of the SQL query
"""
citations = []
for p in bad_papers:
for c in p['citations']:
citations.append([p['index'], c])
citations_df = pd.DataFrame(citations, columns=['citing', 'cited'])
cited = citations_df['cited'].unique()
db_cursor.execute("SELECT id, title, abstract FROM papers p WHERE p.abstract != '' AND p.id IN (" + ','.join(["%s"] * len(cited)) + ")", tuple(cited))
return db_cursor.fetchall()
def parse_fasta(self):
self.ref_id=dict()
self.ref_inf=dict()
i=1
N = 0
ref_inf=np.empty(shape=[0,3])
for seqs in SeqIO.parse(self.ref,'fasta'):
seq_id = seqs.id
self.ref_id[i] = seq_id
seq = str(seqs.seq.upper())
seq_len = len(seq)
self.ref_inf[seq_id]=seq_len
N+=seq.count('N')
ref_inf = np.append(ref_inf,[[i,seq_id,seq_len]],axis=0)
i+=1
self.ref_detail = pd.DataFrame(ref_inf,columns=['Index','Contig','Length(bp)'])
self.N = N
def qualification_filter(self):
"""
    Provide information on the unqualified and qualified contigs from the original
    fasta file, using the criterion: >20 kb and >= 5 restriction sites inside.
"""
unqualified = np.empty(shape=[0,3])
qualified = np.empty(shape=[0,4])
rm_dup = self.RcmapTable[['CMapId','ContigLength','NumSites']].drop_duplicates()
for i in self.ref_id.keys():
index = i
name = self.ref_id[i]
length = self.ref_inf[name]
if i not in self.RcmapTable['CMapId'].unique():
unqualified = np.append(unqualified,[[index,name, length]],axis=0)
else:
Id = rm_dup[rm_dup['CMapId']==i].index[0]
sites = rm_dup['NumSites'][Id]
qualified = np.append(qualified,[[index,name,length,sites]],axis=0)
self.unqualified = pd.DataFrame(unqualified, columns=['index','contig','length(bp)'])
self.qualified = pd.DataFrame(qualified, columns=['index','contig','length(bp)','numSites'])
def test_append():
np.random.seed(0)
n = 1000
df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
'y': np.random.normal(size=n)})
gdf = gd.DataFrame.from_pandas(df)
frags = _fragmented_gdf(gdf, nsplit=13)
# Combine with .append
head = frags[0]
tail = frags[1:]
appended = dgd.from_pygdf(head, npartitions=1)
for each in tail:
appended = appended.append(each)
assert_frame_equal(df, appended.compute().to_pandas())
def test_series_append():
np.random.seed(0)
n = 1000
df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
'y': np.random.normal(size=n)})
gdf = gd.DataFrame.from_pandas(df)
frags = _fragmented_gdf(gdf, nsplit=13)
frags = [df.x for df in frags]
appending = dgd.from_pygdf(frags[0], npartitions=1)
for frag in frags[1:]:
appending = appending.append(frag)
appended = appending.compute().to_pandas()
assert isinstance(appended, pd.Series)
np.testing.assert_array_equal(appended, df.x)
def test_set_index(nelem):
np.random.seed(0)
# Use unique index range as the sort may not be stable-ordering
x = np.arange(nelem)
np.random.shuffle(x)
df = pd.DataFrame({'x': x,
'y': np.random.randint(0, nelem, size=nelem)})
ddf = dd.from_pandas(df, npartitions=2)
dgdf = dgd.from_dask_dataframe(ddf)
expect = ddf.set_index('x').compute()
got = dgdf.set_index('x').compute().to_pandas()
np.testing.assert_array_equal(got.index.values, expect.index.values)
np.testing.assert_array_equal(got.y.values, expect.y.values)
assert got.columns == expect.columns
def test_groupby_single_key(keygen):
np.random.seed(0)
nelem = 500
npartitions = 10
# Generate the keys
xs = keygen(nelem)
assert xs.size == nelem
df = pd.DataFrame({'x': xs,
'z': np.random.normal(size=nelem) + 1})
gdf = gd.DataFrame.from_pandas(df)
dgf = dgd.from_pygdf(gdf, npartitions=npartitions)
groups = dgf.groupby(by=['x']).count()
got = groups.compute().to_pandas()
# Check against expectation
expect = df.groupby(by=['x'], as_index=False).count()
# Check keys
np.testing.assert_array_equal(got.x, expect.x)
# Check values
np.testing.assert_array_equal(got.z, expect.z)
def store_test_predictions(self, prediction_id='_final'):
"""
Stores the test predictions in a CSV file
:param prediction_id: A simple id appended to the name of the summary for uniqueness
:return: None
"""
# prediction id is usually the step count
print 'Storing predictions on Test Data...'
review = []
true_summary = []
generated_summary = []
for i in range(self.test_size):
if not self.checkpointer.is_output_file_present():
review.append(self._index2sentence(self.test_review[i]))
true_summary.append(self._index2sentence(self.true_summary[i]))
if i < (self.test_batch_size * (self.test_size // self.test_batch_size)):
generated_summary.append(self._index2sentence(self.predicted_test_summary[i]))
else:
generated_summary.append('')
prediction_nm = 'generated_summary' + prediction_id
if self.checkpointer.is_output_file_present():
df = pd.read_csv(self.checkpointer.get_result_location(), header=0)
df[prediction_nm] = np.array(generated_summary)
else:
df = pd.DataFrame()
df['review'] = np.array(review)
df['true_summary'] = np.array(true_summary)
df[prediction_nm] = np.array(generated_summary)
df.to_csv(self.checkpointer.get_result_location(), index=False)
print 'Stored the predictions. Moving Forward'
if prediction_id == '_final':
print 'All done. Exiting..'
print 'Exited'
def crawl_for_reviews_and_summary(self, input_file):
"""
Crawl the input dataset
:param input_file: The location of the file containing the txt file dataset
:return: None
"""
self.raw_data_file = input_file
self.df = pd.DataFrame()
self.df['Review'] = self.__crawl_review()
self.df['Summary'] = self.__crawl_summary()
def pearson(X, y):
r = []
p = []
for c in X.columns:
r_, p_ = pearsonr(X[c], y)
r.append(r_)
p.append(p_)
dfr = pd.DataFrame(index=range(1, 1+len(X.columns)))
dfr['pearson'] = r
dfr['pearson_p'] = p
return dfr
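A hypothetical call to pearson above with synthetic data (it assumes pearsonr has been imported from scipy.stats in the original module):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=['f1', 'f2', 'f3'])
y = 0.5 * X['f1'] + rng.normal(scale=0.1, size=100)

print(pearson(X, y))  # one row per feature: correlation coefficient and p-value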