The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.concat().
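As a quick orientation before the project code, here is a minimal, self-contained sketch (not taken from any of the projects below) of the two most common pandas.concat() call patterns: stacking frames row-wise and aligning them column-wise on the index.

import pandas as pd

df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df2 = pd.DataFrame({'a': [5, 6], 'b': [7, 8]})

# Row-wise concatenation (axis=0 is the default); ignore_index renumbers the rows.
stacked = pd.concat([df1, df2], ignore_index=True)

# Column-wise concatenation (axis=1) aligns on the index and fills gaps with NaN.
side_by_side = pd.concat([df1, df2.rename(columns={'a': 'c', 'b': 'd'})], axis=1)

print(stacked)
print(side_by_side)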
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    # print((1 / np.sum(f1)))
    return (1 / np.sum(f1))

def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    return (1 / np.sum(f1))

# Main

def do_work_pso(data, LVcsv, Mcsv, scheme, reg, h, maximo):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))

def do_work_ga(self, item):
    output = pd.DataFrame(self.population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)
    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))

def do_work_pso(self, item):
    output = pd.DataFrame(self.population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)
    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def merged(self, s, t):
    chars = []
    for c1, c2 in zip_longest(s.sequence, t.sequence):
        if c1 is None:
            c = c2
        elif c2 is None:
            c = c1
        elif c1 == 'N':
            c = c2
        elif c2 == 'N':
            c = c1
        elif c1 != c2:
            return None
        else:
            assert c1 == c2
            c = c1
        chars.append(c)
    seq = ''.join(chars)
    requested = s.requested or t.requested
    name = s.name + ';' + t.name
    # take union of groups
    group = pd.concat([s.group, t.group]).groupby(level=0).last()
    return SiblingInfo(seq, requested, name, group)
def update_dividends(self, new_dividends):
    """
    Update our dividend frame with new dividends.

    @new_dividends should be a DataFrame with columns containing at least
    the entries in zipline.protocol.DIVIDEND_FIELDS.
    """
    # Mark each new dividend with a unique integer id. This ensures that
    # we can differentiate dividends whose date/sid fields are otherwise
    # identical.
    new_dividends['id'] = np.arange(
        self._dividend_count,
        self._dividend_count + len(new_dividends),
    )
    self._dividend_count += len(new_dividends)

    self.dividend_frame = sort_values(pd.concat(
        [self.dividend_frame, new_dividends]
    ), ['pay_date', 'ex_date']).set_index('id', drop=False)
def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeCashBuybackAuthLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            CASH_FIELD_NAME: frame[CASH_FIELD_NAME],
            TS_FIELD_NAME: frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    ).reset_index(drop=True)),)

def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeShareBuybackAuthLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            SHARE_COUNT_FIELD_NAME: frame[SHARE_COUNT_FIELD_NAME],
            TS_FIELD_NAME: frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    ).reset_index(drop=True)),)
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)
    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df
    df = pd.concat(post.values())
    df.set_index('name', inplace=True, drop=True)
    return df
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
            'HomeTeam': 'home_team',
            'AwayTeam': 'away_team',
            'FTHG': 'home_goals',
            'FTAG': 'away_goals'
        })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )
    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)

    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]

    return data, team_map
def QA_fetch_get_security_bars(code, _type, lens, ip=best_ip['stock'], port=7709):
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_security_bars(_select_type(_type), _select_market_code(
            code), code, (i - 1) * 800, 800)) for i in range(1, int(lens / 800) + 2)], axis=0)
        data = data\
            .assign(datetime=pd.to_datetime(data['datetime']), code=str(code))\
            .drop(['year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=False)\
            .assign(date=data['datetime'].apply(lambda x: str(x)[0:10]))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(x)))\
            .assign(time_stamp=data['datetime'].apply(lambda x: QA_util_time_stamp(x)))\
            .assign(type=_type).set_index('datetime', drop=False, inplace=False).tail(lens)
        if data is not None:
            return data
        else:
            return None
def QA_fetch_get_stock_block(ip=best_ip['stock'], port=7709):
    '''Fetch stock block/sector classifications from TDX'''
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_and_parse_block_info("block_gn.dat")).assign(type='gn'),
                          api.to_df(api.get_and_parse_block_info(
                              "block.dat")).assign(type='yb'),
                          api.to_df(api.get_and_parse_block_info(
                              "block_zs.dat")).assign(type='zs'),
                          api.to_df(api.get_and_parse_block_info("block_fg.dat")).assign(type='fg')])
        if len(data) > 10:
            return data.assign(source='tdx').drop(['block_type', 'code_index'], axis=1).set_index('code', drop=False, inplace=False).drop_duplicates()
        else:
            QA_util_log_info('Wrong with fetch block ')

def QA_fetch_get_future_day(code, start_date, end_date, level='day', ip=best_ip['future'], port=7727):
    '''Fetch futures daily bars'''
    apix = TdxExHq_API()
    start_date = str(start_date)[0:10]
    today_ = datetime.date.today()
    lens = QA_util_get_trade_gap(start_date, today_)

    global extension_market_info
    extension_market_info = QA_fetch_get_future_list() if extension_market_info is None else extension_market_info

    with apix.connect(ip, port):
        code_market = extension_market_info.query('code=="{}"'.format(code))

        data = pd.concat([apix.to_df(apix.get_instrument_bars(_select_type(
            level), int(code_market.market), str(code), (int(lens / 700) - i) * 700, 700)) for i in range(int(lens / 700) + 1)], axis=0)
        data = data.assign(date=data['datetime'].apply(lambda x: str(x[0:10]))).assign(code=str(code))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(str(x)[0:10]))).set_index('date', drop=False, inplace=False)
        return data.drop(['year', 'month', 'day', 'hour', 'minute', 'datetime'], axis=1)[start_date:end_date].assign(date=data['date'].apply(lambda x: str(x)[0:10]))
def QA_data_make_qfq(bfq_data, xdxr_data):
    '''Compute forward-adjusted (qfq) prices from unadjusted bars and xdxr (dividend/split) data'''
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)

    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] +
                        data['peigu'] * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) / data['close']).fillna(1)[::-1].cumprod()
    data['open'] = data['open'] * data['adj']
    data['high'] = data['high'] * data['adj']
    data['low'] = data['low'] * data['adj']
    data['close'] = data['close'] * data['adj']
    data['preclose'] = data['preclose'] * data['adj']

    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu',
                                           'if_trade', 'category'], axis=1).query("open != 0")

def QA_data_make_hfq(bfq_data, xdxr_data):
    '''Compute backward-adjusted (hfq) prices from unadjusted bars and xdxr (dividend/split) data'''
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)

    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] +
                        data['peigu'] * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) / data['close']).fillna(1).cumprod()
    data['open'] = data['open'] / data['adj']
    data['high'] = data['high'] / data['adj']
    data['low'] = data['low'] / data['adj']
    data['close'] = data['close'] / data['adj']
    data['preclose'] = data['preclose'] / data['adj']

    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia',
                                           'songzhuangu'], axis=1).query("open != 0")
def get_text_len(DB, tr, te):
    if tr is None:
        if te == 'stage1':
            Data = [DB.data['training_text'], DB.data['test_text_filter']]
        else:
            Data = [pd.concat([DB.data['training_text'], DB.data['test_text_filter']], axis=0),
                    DB.data['stage2_test_text']]
    else:
        Data = [DB.data['training_text']]
    for data in Data:
        data['tl'] = data['Text'].apply(lambda x: len(x))
        data['tl2'] = data['Text'].apply(lambda x: len(x.split()))

    if tr is None:
        X, Xt = Data
        return X[['tl', 'tl2']].values, Xt[['tl', 'tl2']].values
    else:
        X = Data[0][['tl', 'tl2']].values
        return X[tr], X[te]

def get_pattern(DB, tr, te, patterns):
    cols = ['p%d' % c for c, p in enumerate(patterns)]
    if tr is None:
        test = DB.data['test_variants_filter'] if te == 'stage1' else DB.data['stage2_test_variants']
        if te == 'stage1':
            train = DB.data['training_variants']
        else:
            train = pd.concat([DB.data['training_variants'], DB.data["test_variants_filter"]], axis=0)
        Data = [train, test]
    else:
        Data = [DB.data['training_variants']]
    for data in Data:
        for c, p in enumerate(patterns):
            data['p%d' % c] = data['Variation'].apply(lambda x: len(re.findall(p, str(x).lower())))
    if tr is None:
        return train[cols].values, test[cols].values
    else:
        X = data[cols].values
        return X[tr], X[te]

def onehot_gene(DB, tr, te):
    from utils.np_utils.encoder import onehot_encode
    if tr is None:
        train = DB.data['training_variants']
        if te == "stage1":
            test = DB.data['test_variants_filter']
        else:
            train = pd.concat([train, DB.data['test_variants_filter']], axis=0)
            test = DB.data['stage2_test_variants']
        lbl_encode(train, test)
        n = max(train['Gene'].max(), test['Gene'].max())
        gtr = onehot_encode(train['Gene'].values, n=n + 1)
        gte = onehot_encode(test['Gene'].values)
        return gtr, gte
    else:
        data = DB.data['training_variants']
        lbl_encode(data, cols=['Gene'])
        gene = data['Gene'].values
        gene = onehot_encode(gene)
        return gene[tr], gene[te]

def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv', i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s" % (path, name)))
    s = pd.concat(s, axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d' % i for i in range(1, classes + 1)]].values
    y = s['real'].values
    print(cross_entropy(y, yp))
    s.to_csv("%s/cv.csv" % path, index=False)
def create_agents(self, generator):
    """
    Given information on a set of countries and a generator function,
    generate the agents and assign the results to ``self.agents``.

    :type generator: DataFrame, str, int
    :param generator: A function which generates the agents.
    """
    self.generator = generator
    country_array = pd.concat([pd.Series([c] * k["Population"])
                               for c, k in self.df.iterrows()])
    country_array.index = range(len(country_array))

    # Garbage collect before creating new processes.
    gc.collect()
    self.agents = pd.concat(
        self.pool.imap(self._gen_agents,
                       np.array_split(country_array, self.processes * self.splits))
    )
    self.agents.index = range(len(self.agents))
def OHETr(self, tr):
    """"""
    OHEDict = {}
    for col in tr.columns:
        ValueCounts = [str(int(v)) for v in tr[col].value_counts().index.values]
        ValueCounts.append('missing')
        SelectedValues = dict((k, v) for (v, k) in enumerate(ValueCounts, start=0))
        OHTr = self.__ApplyOH(tr[col].values, SelectedValues)

        headers = dict((('%s_%s' % (col, k)), SelectedValues[k]) for k in SelectedValues)
        tmp = [v[0] for v in sorted(headers.items(), key=lambda x: x[1])]
        OHDFTr = pd.DataFrame(OHTr, index=tr.index, columns=tmp)

        tr = pd.concat([tr, OHDFTr], axis=1)
        tr.drop(col, axis=1, inplace=True)
        OHEDict[col] = SelectedValues
        # print('Column %s was encoded.' % col)

    return tr, OHEDict
def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None):
    df = None
    for filename in filenames:
        fdf = pd.DataFrame.from_csv(file_path + filename)
        if reset_index:
            fdf = fdf.reset_index()
        if df is None:
            df = fdf.copy(deep=True)
        else:
            df = pd.concat([df, fdf])
    if drop_cols is not None:
        for feat in drop_cols:
            df = df.drop(feat, 1)
    return df
def tsne_cluster_cuisine(df, sublist):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10, 10))
    for i, cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i+1], 0],
                    tsne[lenlist[i]:lenlist[i+1], 1],
                    c=palette[i], label=sublist[i])
    plt.legend()

# interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
def create_future(fold, features_old, cfg_parameters):
    """
    Just for testing purposes. Sets up a replicate of the last day(s) data to
    create new data for testing. But in reality, we should be able to create
    features for the upcoming days from past data, so this would not be needed???
    """
    last_day = fold['window_end']
    next_days = [last_day + timedelta(days=i)
                 for i in xrange(1, (cfg_parameters['prediction_horizon'] + 1))]
    old_features_unique = features_old.drop_duplicates(subset='ToiletID')

    l_future_features = []
    for day in next_days:
        next_day_features = old_features_unique.copy()
        next_day_features["Collection_Date"] = day
        l_future_features.append(next_day_features)

    future_features = pd.concat(l_future_features, ignore_index=True)
    return(future_features)
def __init__(self, **kwargs):
    """
    Store the configuration of link DfConcatenator

    :param str name: name of link
    :param str storeKey: key of data to store in data store
    :param list readKeys: keys of pandas dataframes in the data store
    :param bool ignore_missing_input: Skip missing input datasets. If all missing, store empty dataset. Default is false.
    :param kwargs: all other key word arguments are passed on to the pandas concat function.
    """
    Link.__init__(self, kwargs.pop('name', 'DfConcatenator'))

    # process and register all relevant kwargs. kwargs are added as attributes of the link.
    # second arg is default value for an attribute. key is popped from kwargs.
    self._process_kwargs(kwargs, readKeys=[])
    self._process_kwargs(kwargs, storeKey=None)
    self._process_kwargs(kwargs, ignore_missing_input=False)

    # pass on remaining kwargs to pandas reader
    self.kwargs = copy.deepcopy(kwargs)

    return
def collect_history_data(history_dir, days):
    today = dt.datetime.now()
    dfs = []
    for d in glob.glob('{}/*'.format(history_dir)):
        if os.path.isdir(d):
            dirname = os.path.basename(d)
            dirdate = None
            try:
                dirdate = dt.datetime.strptime(dirname, '%Y%m%d')
            except Exception as ex:
                logger.error(ex)
            if dirdate and (days == -1 or (today - dirdate).days < days):
                for fname in glob.glob('{}/{}/*.csv'.format(history_dir, dirname)):
                    try:
                        dfs.append(pd.read_csv(fname))
                    except Exception as ex:
                        logger.warn("Reading {} error: {}".format(fname, ex))
    if not dfs:
        return None
    df = pd.concat(dfs, ignore_index=True)
    df = df[df.Datetime != 'Datetime'].sort(
        ['User', 'Datetime']).drop_duplicates()
    return df
def _aligned_series(*many_series):
    """
    Return a new list of series containing the data in the input series, but
    with their indices aligned. NaNs will be filled in for missing values.

    Parameters
    ----------
    many_series : list[pd.Series]

    Returns
    -------
    aligned_series : list[pd.Series]
        A new list of series containing the data in the input series, but
        with their indices aligned. NaNs will be filled in for missing
        values.
    """
    return [series
            for col, series in iteritems(pd.concat(many_series, axis=1))]
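The alignment trick above relies on pd.concat(axis=1) building the union of the indices. A minimal standalone sketch of that behavior (toy data, not from the project above):

import pandas as pd

a = pd.Series([1.0, 2.0], index=['x', 'y'])
b = pd.Series([10.0, 20.0], index=['y', 'z'])

# axis=1 aligns on the union of the two indices; gaps become NaN.
aligned = [series for _, series in pd.concat([a, b], axis=1).items()]
print(aligned[0])  # x=1.0, y=2.0, z=NaN
print(aligned[1])  # x=NaN, y=10.0, z=20.0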
def compute_panel(cls, data, scales, **params):
    func = make_summary_fun(params['fun_data'], params['fun_y'],
                            params['fun_ymin'], params['fun_ymax'],
                            params['fun_args'])

    # break a dataframe into pieces, summarise each piece,
    # and join the pieces back together, retaining original
    # columns unaffected by the summary.
    summaries = []
    for (group, x), df in data.groupby(['group', 'x']):
        summary = func(df)
        summary['x'] = x
        summary['group'] = group
        unique = uniquecols(df)
        if 'y' in unique:
            unique = unique.drop('y', axis=1)
        merged = summary.merge(unique, on=['group', 'x'])
        summaries.append(merged)

    new_data = pd.concat(summaries, axis=0, ignore_index=True)
    return new_data

def compute_panel(cls, data, scales, params):
    if not params['var']:
        return data

    negative = data['ymax'] < 0
    neg = data.loc[negative]
    pos = data.loc[~negative]
    neg.is_copy = None
    pos.is_copy = None

    if len(neg):
        neg = cls.collide(neg, params=params)

    if len(pos):
        pos = cls.collide(pos, params=params)

    data = pd.concat([neg, pos], axis=0, ignore_index=True)
    return data

def add_missing_facets(data, layout, vars, facet_vals):
    # When in a dataframe some layer does not have all
    # the facet variables, add the missing facet variables
    # and create new data where the points(duplicates) are
    # present in all the facets
    missing_facets = set(vars) - set(facet_vals)
    if missing_facets:
        to_add = layout.loc[:, missing_facets].drop_duplicates()
        to_add.reset_index(drop=True, inplace=True)

        # a point for each facet, [0, 1, ..., n-1, 0, 1, ..., n-1, ...]
        data_rep = np.tile(np.arange(len(data)), len(to_add))
        # a facet for each point, [0, 0, 0, 1, 1, 1, ... n-1, n-1, n-1]
        facet_rep = np.repeat(np.arange(len(to_add)), len(data))

        data = data.iloc[data_rep, :].reset_index(drop=True)
        facet_vals = facet_vals.iloc[data_rep, :].reset_index(drop=True)
        to_add = to_add.iloc[facet_rep, :].reset_index(drop=True)
        facet_vals = pd.concat([facet_vals, to_add],
                               axis=1, ignore_index=False)

    return data, facet_vals
def disp_gap_byweather(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevweather.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        weather_dict = self.get_weather_dict(data_dir)
        temp_df = self.X_y_Df['time_slotid'].apply(self.find_prev_weather_mode, weather_dict=weather_dict)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)

    gaps_mean = df.groupby('preweather')['gap'].mean()
    gaps_mean.plot(kind='bar')
    plt.ylabel('Mean of gap')
    plt.xlabel('Weather')
    plt.title('Weather/Gap Correlation')
    return

def disp_gap_bytraffic(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevtraffic.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        traffic_dict = self.get_traffic_dict(data_dir)
        temp_df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(self.find_prev_traffic,
                                                                          axis=1, traffic_dict=traffic_dict, pre_num=3)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)

    by_traffic = df.groupby('traffic1')
    x = []
    y = []
    for name, group in by_traffic:
        x.append(name)
        y.append(group['gap'].mean())
    plt.scatter(x, y)
    return
def __do_one_hot_encodings(self):
    df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    enc = OneHotEncoder(sparse=False)
    cross_feature_dict = self.__get_label_encode_dict()
    to_be_encoded = []
    for _, new_feature_name in cross_feature_dict.iteritems():
        to_be_encoded.append(new_feature_name)
    # fit on all data sources stacked together
    to_be_stacked_df = pd.concat([df_train[to_be_encoded], df_testset1[to_be_encoded],
                                  df_testset2[to_be_encoded]], axis=0)
    enc.fit(to_be_stacked_df)

    enc, to_be_encoded = self.__filter_too_big_onehot_encoding(enc, to_be_encoded,
                                                               df_train, df_testset1, df_testset2)
    # transform each data source separately
    self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = self.__do_one_hot_encoding(df_train, enc, to_be_encoded), cv
    self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = self.__do_one_hot_encoding(df_testset1, enc, to_be_encoded)
    self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
    return
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # lagged input columns (t-n_in, ..., t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast columns (t, t+1, ..., t+n_out-1)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # join the shifted frames side by side and name the columns
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values introduced by the shifting
    if dropnan:
        agg.dropna(inplace=True)
    return agg
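A short usage sketch on a hypothetical toy series, assuming the series_to_supervised definition above (and its DataFrame/concat imports from pandas) is in scope; it shows how the shifted columns line up as input/target pairs.

values = [10, 20, 30, 40, 50]
framed = series_to_supervised(values, n_in=1, n_out=1)
print(framed)
# Columns are var1(t-1) and var1(t); after dropping the NaN row the pairs are
# (10, 20), (20, 30), (30, 40), (40, 50).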
def predict_tf_all(path=None):
    result_list = []
    p = m_Pool(31)
    result_list = p.map(predict_tf_once, range(1, 32))
    p.close()
    p.join()
    print('writing...')
    result_df = pd.DataFrame(index=range(1))
    for day, result in result_list:
        day_s = str(day)
        if len(day_s) <= 1:
            day_s = '0' + day_s
        result_df['201610' + day_s] = result
    result_df = result_df.T
    result_df.columns = ['predict_power_consumption']
    if path == None:
        date = str(pd.Timestamp(time.ctime())).replace(' ', '_').replace(':', '_')
        path = './result/' + date + '.csv'
    result_df.to_csv(path, index_label='predict_date')

    l = map(lambda day: pd.DataFrame.from_csv('./result/predict_part/%d.csv' % day), range(1, 32))
    t = pd.concat(l)
    t.to_csv('./result/predict_part/' + date + '.csv')
def rolling_mean(self, window=10):
    means = self.df.rolling(window=window).mean()
    ewm_means = self.df.ewm(halflife=window).mean()
    means.columns = ['mean-%s' % col for col in means.columns]
    ewm_means.columns = ['ewm-%s' % col for col in ewm_means.columns]
    ts = pd.concat([means, ewm_means], axis=1)
    return ts

def filter(self, lamb=1e5):
    cycle, trend = sm.tsa.filters.hpfilter(self.df, lamb=lamb)
    trend.columns = ['%s-trend' % col for col in trend.columns]
    # cycle.columns = ['%s-cycle' % col for col in cycle.columns]
    # ts = pd.concat([cycle, trend], axis=1)
    # return ts
    return trend
def HOCcat(data_, mvmodel, seed):
    response = data_.ix[:, 10:25]

    preditors = []
    preditors.append(data_.ix[:, 10:15])
    preditors.append(data_.ix[:, 15:20])
    preditors.append(data_.ix[:, 20:25])

    plsr_ = None
    for i in range(3):
        res_ = plsr2(preditors[i], response, seed=seed)[0]
        plsr_ = res_ if plsr_ is None else np.hstack((plsr_, res_))

    plsr_ = pd.DataFrame(plsr_)
    plsr_.index = range(len(plsr_))
    cols = list(plsr_.columns)
    for s in range(len(cols)):
        cols[cols.index(s)] = 'T' + str(s)
    plsr_.columns = cols

    data_ = pd.concat([data_, plsr_], axis=1)

    Variables = pd.read_csv(mvmodel)
    Variables = Variables[
        Variables.latent.str.contains("Humanização") == False]
    for i in range(len(cols)):
        df_ = pd.DataFrame([['Humanização', cols[i], 'A']],
                           columns=Variables.columns)
        Variables = Variables.append(df_)
    Variables.index = range(len(Variables))
    mvmodel = Variables

    return [data_, mvmodel]
def _compare_services(srv_a, srv_b, path, prev_cluster_metadata):
    df_a = read_service(srv_a, path, prev_cluster_metadata)
    df_b = read_service(srv_b, path, prev_cluster_metadata)
    p_values = defaultdict(list)
    df = pd.concat([df_a, df_b]).resample("500ms").mean()
    df.interpolate(method="time", limit_direction="both", inplace=True)
    df.fillna(method="bfill", inplace=True)

    for c1, c2 in combine(df_a.columns, df_b.columns):
        if c1 == c2:
            continue
        grangercausality(df[[c1, c2]], p_values, 5)
        grangercausality(df[[c2, c1]], p_values, 5)
    return pd.DataFrame(p_values)
def apply(self, *args, **kwargs):
    '''
    Overwrite the standard pandas.Series method. Apply the transform function to all elements in self.

    *If the transform function returns a dict-like object, transform the XSeries to an XDataFrame;
    see the XDataFrame constructor.*

    :param func: function to apply
    :param prefix: prefix for columns if it needs to return an XDataFrame object
    :return: XSeries or XDataFrame depending on the transformation
    '''
    func = kwargs.get('func')
    if func is None:
        func = args[0]

    # TODO
    # Possibly change to handle NaN
    mapped_series = self.dropna()
    mapped_series = mapped_series.map(func, na_action='ignore')
    mapped_data_type = mapped_series.data_type

    custom_prefix = kwargs.get('prefix')
    if custom_prefix is None:
        custom_prefix = self.name
    else:
        custom_prefix = '{}_{}'.format(self.name, custom_prefix)

    if mapped_series.__is_data_type_dict_like():
        custom_df = XDataFrame.from_records(mapped_series.values)
        if custom_prefix is not None:
            custom_df.columns = custom_df.columns.map(lambda x: '{}_{}'.format(custom_prefix, x))
        return custom_df
    elif mapped_data_type == pd.DataFrame:
        return pd.concat(mapped_series.values, ignore_index=True)
    else:
        mapped_series.name = custom_prefix
        return mapped_series

def concat_dataframes(cls, data_frames):
    '''
    Concatenate XDataFrames column-wise using the pandas.concat method
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html

    :param data_frames: list of XDataFrame instances
    :return: XDataFrame, the concatenation of data_frames
    '''
    return pd.concat(data_frames, axis=1)
def earn_dividends(self, dividend_frame):
    """
    Given a frame of dividends whose ex_dates are all the next trading day,
    calculate and store the cash and/or stock payments to be paid on each
    dividend's pay date.
    """
    earned = dividend_frame.apply(self._maybe_earn_dividend, axis=1)\
                           .dropna(how='all')
    if len(earned) > 0:
        # Store the earned dividends so that they can be paid on the
        # dividends' pay_dates.
        self._unpaid_dividends = pd.concat(
            [self._unpaid_dividends, earned],
        )
def load_prices_from_csv_folder(folderpath, identifier_col, tz='UTC'):
    data = None
    for file in os.listdir(folderpath):
        if '.csv' not in file:
            continue
        raw = load_prices_from_csv(os.path.join(folderpath, file),
                                   identifier_col, tz)
        if data is None:
            data = raw
        else:
            data = pd.concat([data, raw], axis=1)
    return data
def get_data(self):
    in_package_data = range(2002, 2017)
    cur_year = datetime.datetime.now().year
    last_in_package_data = max(in_package_data)

    # download new data
    to_downloads = range(last_in_package_data + 1, cur_year + 1)

    # first, get ycDefIds params
    response = requests.get(self.YIELD_MAIN_URL)
    matchs = re.search(r'\?ycDefIds=(.*?)\&', response.text)
    ycdefids = matchs.group(1)
    assert (ycdefids is not None)

    fetched_data = []
    for year in to_downloads:
        print('Downloading from ' + self.DONWLOAD_URL % (year, ycdefids))
        response = requests.get(self.DONWLOAD_URL % (year, ycdefids))
        fetched_data.append(BytesIO(response.content))

    # combine all data
    dfs = []
    basedir = os.path.join(os.path.dirname(__file__), "xlsx")
    for i in in_package_data:
        dfs.append(pd.read_excel(os.path.join(basedir, "%d.xlsx" % i)))
    for memfile in fetched_data:
        dfs.append(pd.read_excel(memfile))
    df = pd.concat(dfs)
    return df