Python pandas module: concat() code examples
The following code examples, extracted from open-source Python projects, illustrate how to use pandas.concat().
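Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) showing the two basic pd.concat patterns, stacking row-wise and aligning column-wise:
import pandas as pd

a = pd.DataFrame({'x': [1, 2], 'y': [3, 4]})
b = pd.DataFrame({'x': [5, 6], 'y': [7, 8]})

# Row-wise: stack the frames and renumber the index.
rows = pd.concat([a, b], ignore_index=True)

# Column-wise: align on the index and place the frames side by side.
cols = pd.concat([a, b.add_prefix('b_')], axis=1)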
def data_preprocess(train,test):
outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
train.drop(train.index[outlier_idx],inplace=True)
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
test.loc[:,'MSSubClass':'SaleCondition']))
to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
all_data = all_data.drop(to_delete,axis=1)
train["SalePrice"] = np.log1p(train["SalePrice"])
#log transform skewed numeric features
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
return X_train,X_test,y
def data_preprocess(train, test):
outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
train.drop(train.index[outlier_idx], inplace=True)
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
test.loc[:, 'MSSubClass':'SaleCondition']))
to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
all_data = all_data.drop(to_delete, axis=1)
train["SalePrice"] = np.log1p(train["SalePrice"])
# log transform skewed numeric features
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(method='ffill')
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
return X_train, X_test, y
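The data_preprocess variants above rely on the same idiom: concatenate train and test row-wise so that pd.get_dummies produces one consistent column layout, then slice the result back apart by the number of training rows. A stripped-down sketch of that idiom with toy columns (not the original housing data):
import pandas as pd

train = pd.DataFrame({'size': [1, 2, 3], 'color': ['red', 'blue', 'red']})
test = pd.DataFrame({'size': [4, 5], 'color': ['blue', 'green']})

# Encode both frames together so they share one dummy-column layout.
all_data = pd.get_dummies(pd.concat((train, test)))

# Positional slices split the encoded block back into train and test parts.
X_train = all_data.iloc[:len(train)]
X_test = all_data.iloc[len(train):]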
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
# print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].genes)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
return (1 / np.sum(f1))
# Main
def do_work_pso(data, LVcsv, Mcsv, scheme, reg, h, maximo):
# Note: this variant takes no `item`, `population` or `nclusters` arguments;
# as written it relies on those names being defined at module level.
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_ga(self, item):
output = pd.DataFrame(self.population[item].genes)
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_pso(self, item):
output = pd.DataFrame(self.population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
except:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def merged(self, s, t):
chars = []
for c1, c2 in zip_longest(s.sequence, t.sequence):
if c1 is None:
c = c2
elif c2 is None:
c = c1
elif c1 == 'N':
c = c2
elif c2 == 'N':
c = c1
elif c1 != c2:
return None
else:
assert c1 == c2
c = c1
chars.append(c)
seq = ''.join(chars)
requested = s.requested or t.requested
name = s.name + ';' + t.name
# take union of groups
group = pd.concat([s.group, t.group]).groupby(level=0).last()
return SiblingInfo(seq, requested, name, group)
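The group-union step at the end of merged concatenates the two group Series and keeps one value per index label. A small demonstration of the pd.concat followed by groupby(level=0).last() trick, with made-up labels:
import pandas as pd

g1 = pd.Series(['A', 'B'], index=['read1', 'read2'])
g2 = pd.Series(['B', 'C'], index=['read2', 'read3'])

# Union of the two series; where a label appears in both, the later value wins.
union = pd.concat([g1, g2]).groupby(level=0).last()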
def update_dividends(self, new_dividends):
"""
Update our dividend frame with new dividends. @new_dividends should be
a DataFrame with columns containing at least the entries in
zipline.protocol.DIVIDEND_FIELDS.
"""
# Mark each new dividend with a unique integer id. This ensures that
# we can differentiate dividends whose date/sid fields are otherwise
# identical.
new_dividends['id'] = np.arange(
self._dividend_count,
self._dividend_count + len(new_dividends),
)
self._dividend_count += len(new_dividends)
self.dividend_frame = sort_values(pd.concat(
[self.dividend_frame, new_dividends]
), ['pay_date', 'ex_date']).set_index('id', drop=False)
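update_dividends grows self.dividend_frame by concatenating the existing frame with the newly arrived rows. A minimal sketch of that append-by-concat pattern, using hypothetical column names:
import pandas as pd

ledger = pd.DataFrame({'sid': [1], 'pay_date': ['2016-01-04'], 'amount': [0.20]})
new_rows = pd.DataFrame({'sid': [2, 3],
                         'pay_date': ['2016-01-15', '2016-01-20'],
                         'amount': [0.50, 0.30]})

# Append rows by concatenation and renumber the index.
ledger = pd.concat([ledger, new_rows], ignore_index=True)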
def pipeline_event_loader_args(self, dates):
_, mapping = super(
BlazeCashBuybackAuthLoaderTestCase,
self,
).pipeline_event_loader_args(dates)
return (bz.data(pd.concat(
pd.DataFrame({
BUYBACK_ANNOUNCEMENT_FIELD_NAME:
frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
CASH_FIELD_NAME:
frame[CASH_FIELD_NAME],
TS_FIELD_NAME:
frame[TS_FIELD_NAME],
SID_FIELD_NAME: sid,
})
for sid, frame in iteritems(mapping)
).reset_index(drop=True)),)
def pipeline_event_loader_args(self, dates):
_, mapping = super(
BlazeShareBuybackAuthLoaderTestCase,
self,
).pipeline_event_loader_args(dates)
return (bz.data(pd.concat(
pd.DataFrame({
BUYBACK_ANNOUNCEMENT_FIELD_NAME:
frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
SHARE_COUNT_FIELD_NAME:
frame[SHARE_COUNT_FIELD_NAME],
TS_FIELD_NAME:
frame[TS_FIELD_NAME],
SID_FIELD_NAME: sid,
})
for sid, frame in iteritems(mapping)
).reset_index(drop=True)),)
def load_names_data():
fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
if not os.path.exists(fp):
r = requests.get(URL_NAMES)
with open(fp, 'wb') as f:
f.write(r.content)
post = collections.OrderedDict()
with zipfile.ZipFile(fp) as zf:
# get ZipInfo instances
for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
fn = zi.filename
if fn.startswith('yob'):
year = int(fn[3:7])
df = pd.read_csv(
zf.open(zi),
header=None,
names=('name', 'gender', 'count'))
df['year'] = year
post[year] = df
df = pd.concat(post.values())
df.set_index('name', inplace=True, drop=True)
return df
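load_names_data collects one DataFrame per year in an OrderedDict and concatenates the dict values at the end. Passing the dict itself to pd.concat is an alternative that records the year of each row as an index level; a small sketch with toy data (not the SSA name files):
import pandas as pd

per_year = {
    2015: pd.DataFrame({'name': ['Ann', 'Bob'], 'count': [10, 7]}),
    2016: pd.DataFrame({'name': ['Ann', 'Cal'], 'count': [12, 5]}),
}

# A dict passed to pd.concat uses its (sorted) keys as an extra index level.
df = pd.concat(per_year, names=['year', 'row'])
df = df.reset_index('year')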
def read_data(fname):
""" Read football-data.co.uk csv """
data = (
pd.read_csv(fname)
.rename(columns={
'HomeTeam': 'home_team',
'AwayTeam': 'away_team',
'FTHG': 'home_goals',
'FTAG': 'away_goals'
})
.loc[lambda df: ~pd.isnull(df['home_goals'])] # Remove future games
)
team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
data['home_team_id'] = data['home_team'].replace(team_map)
data['away_team_id'] = data['away_team'].replace(team_map)
for col in ('home_goals', 'away_goals'):
data[col] = [int(c) for c in data[col]]
return data, team_map
def QA_fetch_get_security_bars(code, _type, lens, ip=best_ip['stock'], port=7709):
api = TdxHq_API()
with api.connect(ip, port):
data = pd.concat([api.to_df(api.get_security_bars(_select_type(_type), _select_market_code(
code), code, (i - 1) * 800, 800)) for i in range(1, int(lens / 800) + 2)], axis=0)
data = data\
.assign(datetime=pd.to_datetime(data['datetime']), code=str(code))\
.drop(['year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=False)\
.assign(date=data['datetime'].apply(lambda x: str(x)[0:10]))\
.assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(x)))\
.assign(time_stamp=data['datetime'].apply(lambda x: QA_util_time_stamp(x)))\
.assign(type=_type).set_index('datetime', drop=False, inplace=False).tail(lens)
if data is not None:
return data
else:
return None
def QA_fetch_get_stock_block(ip=best_ip['stock'], port=7709):
'Fetch sector/block data'
api = TdxHq_API()
with api.connect(ip, port):
data = pd.concat([api.to_df(api.get_and_parse_block_info("block_gn.dat")).assign(type='gn'),
api.to_df(api.get_and_parse_block_info(
"block.dat")).assign(type='yb'),
api.to_df(api.get_and_parse_block_info(
"block_zs.dat")).assign(type='zs'),
api.to_df(api.get_and_parse_block_info("block_fg.dat")).assign(type='fg')])
if len(data) > 10:
return data.assign(source='tdx').drop(['block_type', 'code_index'], axis=1).set_index('code', drop=False, inplace=False).drop_duplicates()
else:
QA_util_log_info('Wrong with fetch block ')
def QA_fetch_get_future_day(code, start_date, end_date, level='day', ip=best_ip['future'], port=7727):
'Fetch daily futures data'
apix = TdxExHq_API()
start_date = str(start_date)[0:10]
today_ = datetime.date.today()
lens = QA_util_get_trade_gap(start_date, today_)
global extension_market_info
extension_market_info=QA_fetch_get_future_list() if extension_market_info is None else extension_market_info
with apix.connect(ip, port):
code_market = extension_market_info.query('code=="{}"'.format(code))
data = pd.concat([apix.to_df(apix.get_instrument_bars(_select_type(
level), int(code_market.market), str(code), (int(lens / 700) - i) * 700, 700)) for i in range(int(lens / 700) + 1)], axis=0)
data = data.assign(date=data['datetime'].apply(lambda x: str(x[0:10]))).assign(code=str(code))\
.assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(str(x)[0:10]))).set_index('date', drop=False, inplace=False)
return data.drop(['year', 'month', 'day', 'hour', 'minute', 'datetime'], axis=1)[start_date:end_date].assign(date=data['date'].apply(lambda x: str(x)[0:10]))
def QA_data_make_qfq(bfq_data, xdxr_data):
'Build forward-adjusted (qfq) prices from ex-dividend (xdxr) data'
info = xdxr_data[xdxr_data['category'] == 1]
bfq_data['if_trade'] = 1
data = pd.concat([bfq_data, info[['category']]
[bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
data['if_trade'].fillna(value=0, inplace=True)
data = data.fillna(method='ffill')
data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
data = data.fillna(0)
data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
* data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
data['adj'] = (data['preclose'].shift(-1) /
data['close']).fillna(1)[::-1].cumprod()
data['open'] = data['open'] * data['adj']
data['high'] = data['high'] * data['adj']
data['low'] = data['low'] * data['adj']
data['close'] = data['close'] * data['adj']
data['preclose'] = data['preclose'] * data['adj']
return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu',
'if_trade', 'category'], axis=1).query("open != 0")
def QA_data_make_hfq(bfq_data, xdxr_data):
'Build backward-adjusted (hfq) prices from ex-dividend (xdxr) data'
info = xdxr_data[xdxr_data['category'] == 1]
bfq_data['if_trade'] = 1
data = pd.concat([bfq_data, info[['category']]
[bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
data['if_trade'].fillna(value=0, inplace=True)
data = data.fillna(method='ffill')
data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
data = data.fillna(0)
data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
* data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
data['adj'] = (data['preclose'].shift(-1) /
data['close']).fillna(1).cumprod()
data['open'] = data['open'] / data['adj']
data['high'] = data['high'] / data['adj']
data['low'] = data['low'] / data['adj']
data['close'] = data['close'] / data['adj']
data['preclose'] = data['preclose'] / data['adj']
return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu'], axis=1).query("open != 0")
def get_text_len(DB, tr, te):
if tr is None:
if te=='stage1':
Data = [DB.data['training_text'],DB.data['test_text_filter']]
else:
Data = [pd.concat([DB.data['training_text'],DB.data['test_text_filter']],axis=0),DB.data['stage2_test_text']]
else:
Data = [DB.data['training_text']]
for data in Data:
data['tl'] = data['Text'].apply(lambda x:len(x))
data['tl2'] = data['Text'].apply(lambda x:len(x.split()))
if tr is None:
X,Xt = Data
return X[['tl','tl2']].values, Xt[['tl','tl2']].values
else:
X = Data[0][['tl','tl2']].values
return X[tr],X[te]
def get_pattern(DB,tr,te,patterns):
cols = ['p%d'%c for c,p in enumerate(patterns)]
if tr is None:
test = DB.data['test_variants_filter'] if te=='stage1' else DB.data['stage2_test_variants']
if te=='stage1':
train = DB.data['training_variants']
else:
train = pd.concat([DB.data['training_variants'],DB.data["test_variants_filter"]],axis=0)
Data =[train,test]
else:
Data = [DB.data['training_variants']]
for data in Data:
for c,p in enumerate(patterns):
data['p%d'%c] = data['Variation'].apply(lambda x: len(re.findall(p,str(x).lower())))
if tr is None:
return train[cols].values,test[cols].values
else:
X = data[cols].values
return X[tr],X[te]
def onehot_gene(DB, tr, te):
from utils.np_utils.encoder import onehot_encode
if tr is None:
train = DB.data['training_variants']
if te=="stage1":
test = DB.data['test_variants_filter']
else:
train = pd.concat([train,DB.data['test_variants_filter']],axis=0)
test = DB.data['stage2_test_variants']
lbl_encode(train,test)
n = max(train['Gene'].max(),test['Gene'].max())
gtr = onehot_encode(train['Gene'].values,n=n+1)
gte = onehot_encode(test['Gene'].values)
return gtr,gte
else:
data = DB.data['training_variants']
lbl_encode(data,cols=['Gene'])
gene = data['Gene'].values
gene = onehot_encode(gene)
return gene[tr],gene[te]
def post_cv(flags):
import re
import os
path = flags.data_path
files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv',i))]
s = []
for name in files:
s.append(pd.read_csv("%s/%s"%(path,name)))
s = pd.concat(s,axis=0)
print(s.head())
classes = len([i for i in s.columns.values if 'class' in i])
from utils.np_utils.utils import cross_entropy
yp = s[['class%d'%i for i in range(1,classes+1)]].values
y=s['real'].values
print(cross_entropy(y,yp))
s.to_csv("%s/cv.csv"%path,index=False)
def create_agents(self, generator):
"""
Given information on a set of countries and a generator function,
generate the agents and assign the results to ``self.agents``.
:type generator: DataFrame, str, int
:param generator: A function which generates the agents.
"""
self.generator = generator
country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
country_array.index = range(len(country_array))
# Garbage collect before creating new processes.
gc.collect()
self.agents = pd.concat(
self.pool.imap(self._gen_agents,
np.array_split(country_array, self.processes * self.splits))
)
self.agents.index = range(len(self.agents))
def OHETr(self, tr):
""""""
OHEDict = {}
for col in tr.columns:
ValueCounts = [str(int(v)) for v in tr[col].value_counts().index.values]
ValueCounts.append('missing')
SelectedValues = dict((k, v) for (v, k) in enumerate(ValueCounts, start=0))
OHTr = self.__ApplyOH(tr[col].values, SelectedValues)
headers = dict((('%s_%s' % (col, k)), SelectedValues[k]) for k in SelectedValues)
tmp = [v[0] for v in sorted(headers.items(), key=lambda x: x[1])]
OHDFTr = pd.DataFrame(OHTr, index=tr.index, columns=tmp)
tr = pd.concat([tr, OHDFTr], axis=1)
tr.drop(col, axis=1, inplace=True)
OHEDict[col] = SelectedValues
#print('Column %s was encoded.' % col)
return tr, OHEDict
def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None):
df = None
for filename in filenames:
fdf = pd.DataFrame.from_csv(file_path + filename)
if reset_index:
fdf = fdf.reset_index()
if df is None:
df = fdf.copy(deep=True)
else:
df = pd.concat([df,fdf])
if drop_cols is not None:
for feat in drop_cols:
df = df.drop(feat, 1)
return df
def tsne_cluster_cuisine(df,sublist):
lenlist=[0]
df_sub = df[df['cuisine']==sublist[0]]
lenlist.append(df_sub.shape[0])
for cuisine in sublist[1:]:
temp = df[df['cuisine']==cuisine]
df_sub = pd.concat([df_sub, temp],axis=0,ignore_index=True)
lenlist.append(df_sub.shape[0])
df_X = df_sub.drop(['cuisine','recipeName'],axis=1)
print df_X.shape, lenlist
dist = squareform(pdist(df_X, metric='cosine'))
tsne = TSNE(metric='precomputed').fit_transform(dist)
palette = sns.color_palette("hls", len(sublist))
plt.figure(figsize=(10,10))
for i,cuisine in enumerate(sublist):
plt.scatter(tsne[lenlist[i]:lenlist[i+1],0],\
tsne[lenlist[i]:lenlist[i+1],1],c=palette[i],label=sublist[i])
plt.legend()
# interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
def create_future(fold, features_old, cfg_parameters):
"""
Just for testing purposes.
Sets up a replicate of the last day(s) data to create new data for testing. But in reality,
we should be able to create features for the upcoming days from past data, so this would not be needed???
"""
last_day = fold['window_end']
next_days = [last_day + timedelta(days=i) for i in xrange(1,(cfg_parameters['prediction_horizon'] +1 ))]
old_features_unique = features_old.drop_duplicates(subset='ToiletID')
l_future_features = []
for day in next_days:
next_day_features = old_features_unique.copy()
next_day_features["Collection_Date"] = day
l_future_features.append(next_day_features)
future_features = pd.concat(l_future_features, ignore_index=True)
return(future_features)
def __init__(self, **kwargs):
"""
Store the configuration of link DfConcatenator
:param str name: name of link
:param str storeKey: key of data to store in data store
:param list readKeys: keys of pandas dataframes in the data store
:param bool ignore_missing_input: Skip missing input datasets. If all missing, store empty dataset. Default is false.
:param kwargs: all other key word arguments are passed on to pandas concat function.
"""
Link.__init__(self, kwargs.pop('name', 'DfConcatenator'))
# process and register all relevant kwargs. kwargs are added as attributes of the link.
# second arg is default value for an attribute. key is popped from kwargs.
self._process_kwargs(kwargs, readKeys=[])
self._process_kwargs(kwargs, storeKey=None)
self._process_kwargs(kwargs, ignore_missing_input=False)
# pass on remaining kwargs to pandas reader
self.kwargs = copy.deepcopy(kwargs)
return
def collect_history_data(history_dir, days):
today = dt.datetime.now()
dfs = []
for d in glob.glob('{}/*'.format(history_dir)):
if os.path.isdir(d):
dirname = os.path.basename(d)
dirdate = None
try:
dirdate = dt.datetime.strptime(dirname, '%Y%m%d')
except Exception as ex:
logger.error(ex)
if dirdate and (days == -1 or (today - dirdate).days < days):
for fname in glob.glob('{}/{}/*.csv'.format(history_dir, dirname)):
try:
dfs.append(pd.read_csv(fname))
except Exception as ex:
logger.warn("Reading {} error: {}".format(fname, ex))
if not dfs:
return None
df = pd.concat(dfs, ignore_index=True)
df = df[df.Datetime != 'Datetime'].sort(
['User', 'Datetime']).drop_duplicates()
return df
def _aligned_series(*many_series):
"""
Return a new list of series containing the data in the input series, but
with their indices aligned. NaNs will be filled in for missing values.
Parameters
----------
many_series : list[pd.Series]
Returns
-------
aligned_series : list[pd.Series]
A new list of series containing the data in the input series, but
with their indices aligned. NaNs will be filled in for missing values.
"""
return [series
for col, series in iteritems(pd.concat(many_series, axis=1))]
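_aligned_series relies on pd.concat(..., axis=1) aligning the inputs on the union of their indices and filling the gaps with NaN. A tiny demonstration of that alignment behaviour:
import pandas as pd

s1 = pd.Series([1.0, 2.0], index=['a', 'b'], name='s1')
s2 = pd.Series([3.0, 4.0], index=['b', 'c'], name='s2')

# axis=1 aligns on the union of the indices; missing positions become NaN.
combined = pd.concat([s1, s2], axis=1)
aligned = [series for _, series in combined.items()]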
def compute_panel(cls, data, scales, **params):
func = make_summary_fun(params['fun_data'], params['fun_y'],
params['fun_ymin'], params['fun_ymax'],
params['fun_args'])
# break a dataframe into pieces, summarise each piece,
# and join the pieces back together, retaining original
# columns unaffected by the summary.
summaries = []
for (group, x), df in data.groupby(['group', 'x']):
summary = func(df)
summary['x'] = x
summary['group'] = group
unique = uniquecols(df)
if 'y' in unique:
unique = unique.drop('y', axis=1)
merged = summary.merge(unique, on=['group', 'x'])
summaries.append(merged)
new_data = pd.concat(summaries, axis=0, ignore_index=True)
return new_data
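compute_panel above is the usual split-apply-combine shape: break the data into groups, summarise each piece, then reassemble with pd.concat(..., ignore_index=True). A compact sketch of the same shape with a made-up summary:
import pandas as pd

data = pd.DataFrame({'group': ['a', 'a', 'b', 'b'],
                     'y': [10, 20, 30, 40]})

summaries = []
for key, df in data.groupby('group'):
    summaries.append(pd.DataFrame({'group': [key], 'y_mean': [df['y'].mean()]}))

# Join the per-group summaries back into a single frame.
result = pd.concat(summaries, axis=0, ignore_index=True)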
def compute_panel(cls, data, scales, params):
if not params['var']:
return data
negative = data['ymax'] < 0
neg = data.loc[negative]
pos = data.loc[~negative]
neg.is_copy = None
pos.is_copy = None
if len(neg):
neg = cls.collide(neg, params=params)
if len(pos):
pos = cls.collide(pos, params=params)
data = pd.concat([neg, pos], axis=0, ignore_index=True)
return data
def add_missing_facets(data, layout, vars, facet_vals):
# When in a dataframe some layer does not have all
# the facet variables, add the missing facet variables
# and create new data where the points(duplicates) are
# present in all the facets
missing_facets = set(vars) - set(facet_vals)
if missing_facets:
to_add = layout.loc[:, missing_facets].drop_duplicates()
to_add.reset_index(drop=True, inplace=True)
# a point for each facet, [0, 1, ..., n-1, 0, 1, ..., n-1, ...]
data_rep = np.tile(np.arange(len(data)), len(to_add))
# a facet for each point, [0, 0, 0, 1, 1, 1, ... n-1, n-1, n-1]
facet_rep = np.repeat(np.arange(len(to_add)), len(data))
data = data.iloc[data_rep, :].reset_index(drop=True)
facet_vals = facet_vals.iloc[data_rep, :].reset_index(drop=True)
to_add = to_add.iloc[facet_rep, :].reset_index(drop=True)
facet_vals = pd.concat([facet_vals, to_add],
axis=1, ignore_index=False)
return data, facet_vals
def disp_gap_byweather(self):
df = self.gapdf
data_dir = g_singletonDataFilePath.getTrainDir()
dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevweather.df.pickle'
dumpload = DumpLoad(dumpfile_path)
if dumpload.isExisiting():
temp_df = dumpload.load()
else:
weather_dict = self.get_weather_dict(data_dir)
temp_df = self.X_y_Df['time_slotid'].apply(self.find_prev_weather_mode, weather_dict=weather_dict)
dumpload.dump(temp_df)
df = pd.concat([df, temp_df], axis=1)
gaps_mean = df.groupby('preweather')['gap'].mean()
gaps_mean.plot(kind='bar')
plt.ylabel('Mean of gap')
plt.xlabel('Weather')
plt.title('Weather/Gap Correlation')
return
def disp_gap_bytraffic(self):
df = self.gapdf
data_dir = g_singletonDataFilePath.getTrainDir()
dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevtraffic.df.pickle'
dumpload = DumpLoad(dumpfile_path)
if dumpload.isExisiting():
temp_df = dumpload.load()
else:
traffic_dict = self.get_traffic_dict(data_dir)
temp_df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(self.find_prev_traffic,axis = 1, traffic_dict=traffic_dict, pre_num = 3)
dumpload.dump(temp_df)
df = pd.concat([df, temp_df], axis=1)
by_traffic = df.groupby('traffic1')
x=[]
y=[]
for name, group in by_traffic:
x.append(name)
y.append(group['gap'].mean())
plt.scatter(x,y)
return
def __do_one_hot_encodings(self):
df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
enc = OneHotEncoder(sparse=False)
cross_feature_dict = self.__get_label_encode_dict()
to_be_encoded = []
for _, new_feature_name in cross_feature_dict.iteritems():
to_be_encoded.append(new_feature_name)
#fix all data source
to_be_stacked_df = pd.concat([df_train[to_be_encoded], df_testset1[to_be_encoded], df_testset2[to_be_encoded]], axis = 0)
enc.fit(to_be_stacked_df)
enc, to_be_encoded = self.__filter_too_big_onehot_encoding(enc, to_be_encoded, df_train, df_testset1, df_testset2)
# transform on seprate data source
self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = self.__do_one_hot_encoding(df_train, enc, to_be_encoded),cv
self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = self.__do_one_hot_encoding(df_testset1,enc, to_be_encoded)
self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
return
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
n_vars = 1 if type(data) is list else data.shape[1]
df = DataFrame(data)
cols, names = list(), list()
for i in range(n_in, 0, -1):
cols.append(df.shift(i))
names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
for i in range(0, n_out):
cols.append(df.shift(-i))
if i == 0:
names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
else:
names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
agg = concat(cols, axis=1)
agg.columns = names
if dropnan:
agg.dropna(inplace=True)
return agg
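A short usage sketch for series_to_supervised above, assuming DataFrame and concat were imported from pandas at module level as the snippet expects:
import numpy as np
from pandas import DataFrame, concat

values = np.arange(10).reshape(5, 2)   # 5 time steps, 2 variables
framed = series_to_supervised(values, n_in=1, n_out=1)
# framed.columns: ['var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)']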
def predict_tf_all(path = None):
result_list = []
p = m_Pool(31)
result_list = p.map(predict_tf_once,range(1,32))
p.close()
p.join()
print 'writing...'
result_df = pd.DataFrame(index = range(1))
for day,result in result_list:
day_s = str(day)
if len(day_s)<=1:
day_s = '0'+day_s
result_df['201610'+day_s] = result
result_df = result_df.T
result_df.columns = ['predict_power_consumption']
if path == None:
date = str(pd.Timestamp(time.ctime())).replace(' ','_').replace(':','_')
path = './result/'+date+'.csv'
result_df.to_csv(path,index_label='predict_date')
l = map(lambda day:pd.DataFrame.from_csv('./result/predict_part/%d.csv'%day),range(1,32))
t = pd.concat(l)
t.to_csv('./result/predict_part/'+date+'.csv')
def rolling_mean(self, window=10):
means = self.df.rolling(window=window).mean()
ewm_means = self.df.ewm(halflife=window).mean()
means.columns = ['mean-%s' % col for col in means.columns]
ewm_means.columns = ['ewm-%s' % col for col in ewm_means.columns]
ts = pd.concat([means, ewm_means], axis=1)
return ts
def filter(self, lamb=1e5):
cycle, trend = sm.tsa.filters.hpfilter(self.df, lamb=lamb)
trend.columns = ['%s-trend' % col for col in trend.columns]
# cycle.columns = ['%s-cycle' % col for col in cycle.columns]
# ts = pd.concat([cycle, trend], axis=1)
# return ts
return trend
def HOCcat(data_, mvmodel, seed):
response = data_.ix[:, 10:25]
preditors = []
preditors.append(data_.ix[:, 10:15])
preditors.append(data_.ix[:, 15:20])
preditors.append(data_.ix[:, 20:25])
plsr_ = None
for i in range(3):
res_ = plsr2(preditors[i], response, seed=seed)[0]
plsr_ = res_ if plsr_ is None else np.hstack((plsr_, res_))
plsr_ = pd.DataFrame(plsr_)
plsr_.index = range(len(plsr_))
cols = list(plsr_.columns)
for s in range(len(cols)):
cols[cols.index(s)] = 'T' + str(s)
plsr_.columns = cols
data_ = pd.concat([data_, plsr_], axis=1)
Variables = pd.read_csv(mvmodel)
Variables = Variables[
Variables.latent.str.contains("Humanização") == False]
for i in range(len(cols)):
df_ = pd.DataFrame([['Humanização', cols[i], 'A']],
columns=Variables.columns)
Variables = Variables.append(df_)
Variables.index = range(len(Variables))
mvmodel = Variables
return[data_, mvmodel]
def _compare_services(srv_a, srv_b, path, prev_cluster_metadata):
df_a = read_service(srv_a, path, prev_cluster_metadata)
df_b = read_service(srv_b, path, prev_cluster_metadata)
p_values = defaultdict(list)
df = pd.concat([df_a, df_b]).resample("500ms").mean()
df.interpolate(method="time", limit_direction="both", inplace=True)
df.fillna(method="bfill", inplace=True)
for c1, c2 in combine(df_a.columns, df_b.columns):
if c1 == c2:
continue
grangercausality(df[[c1, c2]], p_values, 5)
grangercausality(df[[c2, c1]], p_values, 5)
return pd.DataFrame(p_values)
def apply(self, *args, **kwargs):
'''
Overwrite the standard pandas.Series method.
Apply the transform function to all elements in self.
*If the transform function returns a dict-like object,
the XSeries is transformed to an XDataFrame (see the XDataFrame constructor).*
:param func: function to apply
:param prefix: prefix for the columns if an XDataFrame object is returned
:return: XSeries or XDataFrame, depending on the transformation
'''
func = kwargs.get('func')
if func is None:
func = args[0]
# TODO
# Possibly change to handle NaN
mapped_series = self.dropna()
mapped_series = mapped_series.map(func, na_action='ignore')
mapped_data_type = mapped_series.data_type
custom_prefix = kwargs.get('prefix')
if custom_prefix is None:
custom_prefix = self.name
else:
custom_prefix = '{}_{}'.format(self.name, custom_prefix)
if mapped_series.__is_data_type_dict_like():
custom_df = XDataFrame.from_records(mapped_series.values)
if custom_prefix is not None:
custom_df.columns = custom_df.columns.map(lambda x: '{}_{}'.format(custom_prefix, x))
return custom_df
elif mapped_data_type == pd.DataFrame:
return pd.concat(mapped_series.values, ignore_index=True)
else:
mapped_series.name = custom_prefix
return mapped_series
def concat_dataframes(cls, data_frames):
'''
Concatenate XDataFrame using pandas.concat method
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html
over columns
:param data_frames: list of XDataFrame instances
:return: XDataFrame — concatenated list of data_frames
'''
return pd.concat(data_frames, axis=1)
def earn_dividends(self, dividend_frame):
"""
Given a frame of dividends whose ex_dates are all the next trading day,
calculate and store the cash and/or stock payments to be paid on each
dividend's pay date.
"""
earned = dividend_frame.apply(self._maybe_earn_dividend, axis=1)\
.dropna(how='all')
if len(earned) > 0:
# Store the earned dividends so that they can be paid on the
# dividends' pay_dates.
self._unpaid_dividends = pd.concat(
[self._unpaid_dividends, earned],
)
def load_prices_from_csv_folder(folderpath, identifier_col, tz='UTC'):
data = None
for file in os.listdir(folderpath):
if '.csv' not in file:
continue
raw = load_prices_from_csv(os.path.join(folderpath, file),
identifier_col, tz)
if data is None:
data = raw
else:
data = pd.concat([data, raw], axis=1)
return data
def get_data(self):
in_package_data = range(2002, 2017)
cur_year = datetime.datetime.now().year
last_in_package_data = max(in_package_data)
# download new data
to_downloads = range(last_in_package_data + 1, cur_year + 1)
# first, get ycDefIds params
response = requests.get(self.YIELD_MAIN_URL)
matchs = re.search(r'\?ycDefIds=(.*?)\&', response.text)
ycdefids = matchs.group(1)
assert (ycdefids is not None)
fetched_data = []
for year in to_downloads:
print('Downloading from ' + self.DONWLOAD_URL % (year, ycdefids))
response = requests.get(self.DONWLOAD_URL % (year, ycdefids))
fetched_data.append(BytesIO(response.content))
# combine all data
dfs = []
basedir = os.path.join(os.path.dirname(__file__), "xlsx")
for i in in_package_data:
dfs.append(pd.read_excel(os.path.join(basedir, "%d.xlsx" % i)))
for memfile in fetched_data:
dfs.append(pd.read_excel(memfile))
df = pd.concat(dfs)
return df