The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.get_dummies().
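Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the column names are illustrative) of what `pandas.get_dummies()` produces:

```python
import pandas as pd

df = pd.DataFrame({"color": ["red", "green", "red"], "size": [1, 2, 3]})

# One-hot encode only the 'color' column; 'size' is passed through unchanged.
# The result gains one indicator column per category: color_green, color_red.
encoded = pd.get_dummies(df, columns=["color"], prefix="color")
print(encoded.columns.tolist())  # ['size', 'color_green', 'color_red']
```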
```python
def transform(self, x):
    """
    Parameters:
        x (Sequence): raw values to discretise

    Returns:
        np.array: the bin label of each value, as a numpy array of strings
    """
    s = pd.cut(x, bins=self.bins)
    d = pd.get_dummies(s)
    z = d.T.to_dict()
    re = []
    for i, v in z.items():
        for j, u in v.items():
            if u == 1:
                re.append(str(j))
    return np.array(re)
```
```python
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
```
```python
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
```
```python
def generator_input(input_file, chunk_size):
    """Generator function to produce features and labels
    needed by keras fit_generator.
    """
    input_reader = pd.read_csv(tf.gfile.Open(input_file[0]),
                               names=CSV_COLUMNS,
                               chunksize=chunk_size,
                               na_values=" ?")

    for input_data in input_reader:
        input_data = input_data.dropna()
        label = pd.get_dummies(input_data.pop(LABEL_COLUMN))

        input_data = to_numeric_features(input_data)
        n_rows = input_data.shape[0]
        return ((input_data.iloc[[index % n_rows]], label.iloc[[index % n_rows]])
                for index in itertools.count())
```
```python
def next_batch(df, i=None):
    """
    :param df: pandas dataframe
    :param i: batch index
    :return: (numpy array x, numpy array y)
    """
    if i is None:
        start = 0
        end = df.shape[0]
    else:
        start = BATCH_SIZE * i
        end = BATCH_SIZE * (i + 1)

    result = df[start:end]

    if "Survived" in result:
        batch_ys = pd.get_dummies(result.pop('Survived').values).as_matrix()
        batch_xs = result.as_matrix()
        return batch_xs, batch_ys
    else:
        return result.as_matrix()
```
```python
def transform(self, X, y=None):
    """Dummy encode the categorical columns in X

    Parameters
    ----------
    X : pd.DataFrame or dd.DataFrame
    y : ignored

    Returns
    -------
    transformed : pd.DataFrame or dd.DataFrame
        Same type as the input
    """
    if not X.columns.equals(self.columns_):
        raise ValueError("Columns of 'X' do not match the training "
                         "columns. Got {!r}, expected {!r}".format(
                             X.columns, self.columns_))
    if isinstance(X, pd.DataFrame):
        return pd.get_dummies(X, drop_first=self.drop_first)
    elif isinstance(X, dd.DataFrame):
        return dd.get_dummies(X, drop_first=self.drop_first)
    else:
        raise TypeError("Unexpected type {}".format(type(X)))
```
```python
def input_fn(df):
    """Format the downloaded data."""
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = [df[k].values for k in CONTINUOUS_COLUMNS]
    X_con = np.stack(continuous_cols).astype(np.float32).T

    # Standardise
    X_con -= X_con.mean(axis=0)
    X_con /= X_con.std(axis=0)

    # Creates a dictionary mapping from each categorical feature column name
    categ_cols = [np.where(pd.get_dummies(df[k]).values)[1][:, np.newaxis]
                  for k in CATEGORICAL_COLUMNS]
    n_values = [np.amax(c) + 1 for c in categ_cols]
    X_cat = np.concatenate(categ_cols, axis=1).astype(np.int32)

    # Converts the label column into a constant Tensor.
    label = df[LABEL_COLUMN].values[:, np.newaxis]

    # Returns the feature columns and the label.
    return X_con, X_cat, n_values, label
```
```python
def replay(self):
    """Memory Management and training of the agent
    """
    if len(self.memory) < self.batch_size:
        return

    state, action, reward, next_state, done = self._get_batches()
    reward += (self.gamma
               * np.logical_not(done)
               * np.amax(self.model.predict(next_state), axis=1))
    q_target = self.target_model.predict(state)

    _ = pd.Series(action)
    one_hot = pd.get_dummies(_).as_matrix()
    action_batch = np.where(one_hot == 1)
    q_target[action_batch] = reward
    return self.model.fit(state, q_target,
                          batch_size=self.batch_size,
                          epochs=1,
                          verbose=False)
```
```python
def make_date_columns_categorical_binary(book_attributes):
    """Turn all date columns in book_attributes into binary categorical columns."""
    # bucket publish dates & insert categorical data columns into data frame
    orig_pub_year_cat = transform_pub_dates(book_attributes['original_pub_year'])
    book_attributes.insert(loc=5, column='orig_pub_year_cat', value=orig_pub_year_cat)

    pub_year_cat = transform_pub_dates(book_attributes['pub_year'])
    book_attributes.insert(loc=5, column='pub_year_cat', value=pub_year_cat)

    # turn date categories into binary dataframes; merge back into book_attributes
    pub_year_dummies = pd.get_dummies(book_attributes['pub_year_cat'])
    orig_year_dummies = pd.get_dummies(book_attributes['orig_pub_year_cat'])
    book_full_attr = book_attributes.merge(pub_year_dummies, left_index=True, right_index=True)
    book_full_attr = book_full_attr.merge(orig_year_dummies, left_index=True, right_index=True)
    return book_full_attr
```
```python
def load_user_action_cnt(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00'):
    '''
    Per-user counts of each action type.
    '''
    dump_path = './cache/user_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['user_id', 'time', 'type'])
        prefix = 'Action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        drop_cols = ['time', 'type']
        df.drop(drop_cols, axis=1, inplace=True)
        df = df.groupby(['user_id'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df
```
```python
def load_base_user_feat(end_date='2016-04-16'):
    '''
    Basic user features.
    '''
    dump_path = './cache/base_user_feat_{0}.pkl'.format(end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = pd.read_csv(USER_FILE, encoding='gbk')
        # sex_dummies = pd.get_dummies(df.sex, prefix='sex')
        df.user_reg_tm.fillna('2016-02-01', inplace=True)
        df.user_reg_tm = pd.to_datetime(df.user_reg_tm).apply(
            lambda t: pd.to_datetime('2016-02-01') if t > pd.to_datetime('2016-04-15') else t)
        df['reg_tm_dist'] = df.user_reg_tm.apply(lambda t: (pd.to_datetime(end_date) - t).days)
        df = df[['user_id', 'user_lv_cd', 'reg_tm_dist']]
        # df = pd.concat([df, sex_dummies], axis=1)
        # age_dummies = pd.get_dummies(df.age, prefix='age')
        # N = age_dummies.shape[1]
        # age_dummies.columns = ['age_{0}'.format(i) for i in range(N)]
        # df = pd.concat([df, age_dummies], axis=1)
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df
```
```python
def load_UIPair_action_cnt(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00',
                           actions=[1, 2, 3, 4, 5, 6]):
    '''
    Action counts per user-item (UI) pair.
    '''
    dump_path = './cache/UIPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['user_id', 'sku_id', 'cate', 'type'])
        prefix = 'UIPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df.drop(['type'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'sku_id', 'cate'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    actions.sort()
    rt_cols = ['user_id', 'sku_id', 'cate']
    rt_cols.extend(['UIPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i)
                    for i in actions])
    df = df[rt_cols]
    return df
```
```python
def load_UCPair_action_cnt(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00',
                           actions=[1, 2, 3, 4, 5, 6]):
    '''
    Action counts per user-category (UC) pair.
    '''
    dump_path = './cache/UCPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['user_id', 'type', 'cate'])
        prefix = 'UCPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df = df.groupby(['user_id', 'cate'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    actions.sort()
    rt_cols = ['user_id', 'cate']
    rt_cols.extend(['UCPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i)
                    for i in actions])
    df = df[rt_cols]
    return df
```
```python
def load_base_item_feat(end_date='2016/4/16'):
    '''
    Basic item features from the comment data.
    '''
    JComment = pd.read_csv(COMMENT_FILE, encoding='gbk')
    end_date = pd.to_datetime(end_date)
    JComment.dt = pd.to_datetime(JComment.dt)
    dts = JComment.dt.drop_duplicates()
    dts.sort_index(inplace=True, ascending=False)
    for dt in dts.iteritems():
        if dt[-1] < end_date:
            break
    JComment = JComment[JComment.dt == dt[-1]].drop(['dt'], axis=1)
    Comment_num_dummies = pd.get_dummies(JComment.comment_num, prefix='Comment_num')
    JComment = pd.concat([JComment, Comment_num_dummies], axis=1)
    return JComment.drop(['comment_num'], axis=1)
```
```python
def load_item_action_cnt(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00',
                         actions=[1, 2, 3, 4, 5, 6]):
    '''
    Per-item action counts.
    '''
    dump_path = './cache/item_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['sku_id', 'type'])
        prefix = 'item_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df.drop(['type'], axis=1, inplace=True)
        df = df.groupby(['sku_id'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    rt_cols = ['sku_id']
    rt_cols.extend(['item_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i)
                    for i in actions])
    df = df[rt_cols]
    return df
```
```python
def load_UBPair_action_cnt(start_date='2016-02-01 00:00:00', end_date='2016-04-01 00:00:00',
                           actions=[1, 2, 3, 4, 5, 6]):
    '''
    Action counts per user-brand (UB) pair.
    '''
    dump_path = './cache/UBPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['user_id', 'brand', 'type'])
        prefix = 'UBPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df.type, prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df.drop(['type'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'brand'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    rt_cols = ['user_id', 'brand']
    rt_cols.extend(['UBPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i)
                    for i in actions])
    df = df[rt_cols]
    return df
```
```python
def load_BCPair_action_cnt(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00',
                           actions=[1, 2, 3, 4, 5, 6]):
    '''
    Action counts per brand-category (BC) pair.
    '''
    dump_path = './cache/BCPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['brand', 'cate', 'type'])
        prefix = 'BCPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df.type, prefix=prefix)
        df = pd.concat([df.drop(['type'], axis=1), type_dummies], axis=1)
        df = df.groupby(['brand', 'cate'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    rt_cols = ['brand', 'cate']
    rt_cols.extend(['BCPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i)
                    for i in actions])
    df = df[rt_cols]
    return df
```
```python
def load_user_act_cnt_with_timeZone(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00'):
    '''
    Per-user action counts split by time_zone.
    '''
    dump_path = './cache/user_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['user_id', 'time_zone'])
        timeZone_dummies = pd.get_dummies(df.time_zone, prefix='time_zone_cnt')
        df = pd.concat([df, timeZone_dummies], axis=1)
        df.drop(['time_zone'], axis=1, inplace=True)
        df = df.groupby(['user_id'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df
```
```python
def load_UCPair_act_cnt_with_timeZone(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00',
                                      cate=[8]):
    '''
    Action counts per user-category pair, split by time_zone.
    '''
    dump_path = './cache/UCPair_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['user_id', 'time_zone', 'cate'])
        timeZone_dummies = pd.get_dummies(df.time_zone, prefix='uc_time_zone_cnt')
        df = pd.concat([df, timeZone_dummies], axis=1)
        df.drop(['time_zone'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'cate'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    df = df[df.cate.isin(cate)]
    return df
```
```python
def load_UIPair_act_cnt_with_timeZone(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00',
                                      cate=[8]):
    '''
    Action counts per user-item pair, split by time_zone.
    '''
    dump_path = './cache/UIPair_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date,
                             field=['user_id', 'time_zone', 'sku_id'])
        timeZone_dummies = pd.get_dummies(df.time_zone, prefix='time_zone_cnt')
        df = pd.concat([df, timeZone_dummies], axis=1)
        df.drop(['time_zone'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df
```
```python
def get_table(train_table):
    x_cols = []
    for col in train_table.columns:
        # print(data[col].value_counts())
        if col not in ['result', 'team_name', 'competition', 'season_x', 'surname']:
            train_table[col] = train_table[col].astype(str)
            x_cols.append(col)
    # print(x_cols)
    X = pd.get_dummies(train_table[x_cols])
    y = train_table['result']
    print(train_table.shape)
    print(X.shape)
    print(y.shape)
    return X, y
```
```python
def main():
    df = pd.read_csv("dataset.csv")
    df = df.dropna()
    # print df
    x1 = df.copy()
    del x1['Customer']
    del x1['Effective To Date']
    x4 = pd.get_dummies(x1)
    # print x4
    n = 10
    clf = k_means(x4, n_clusters=n)
    centroids = clf[0]  # 10 clusters
    labels = clf[1]
    # print x4[1]
    index_db_val = compute_DB_index(x4, labels, centroids, n)
    print "The value of Davies Bouldin index for a K-Means cluster of size " + str(n) + " is: " + str(index_db_val)
```
```python
def dummify(df):
    '''
    Given a dataframe, create dummies for all the columns which are not
    numerically typed already.

    This will NOT remove one of the dummies, which is required for linear regression.

    returns DataFrame -- a dataframe with all non-numeric columns swapped into dummy columns
    '''
    obj_cols = []
    for cname in df.columns:
        if df[cname].dtype == object:
            obj_cols.append(cname)
    df = pd.get_dummies(df, columns=obj_cols)
    # for cname in obj_cols:
    #     del df[cname]
    return df
```
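The docstring above notes that no reference level is dropped. When one level should be removed (for example, to avoid perfect collinearity in a linear model), `get_dummies` supports that directly via `drop_first`; a minimal sketch with an illustrative column name, not taken from the project above:

```python
import pandas as pd

df = pd.DataFrame({"city": ["NY", "LA", "SF", "NY"]})

# drop_first=True omits the first level (here 'LA'), leaving k-1 indicator columns.
print(pd.get_dummies(df, columns=["city"], drop_first=True).columns.tolist())
# ['city_NY', 'city_SF']
```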
```python
def apriori_alg(trans, support=0.01, minlen=2):
    print('appr_1')
    dna = trans.unstack().dropna()
    print('appr_2')
    ts = pandas.get_dummies(dna).groupby(level=1).sum()
    print('appr_3')
    collen, rowlen = ts.shape
    pattern = []
    for cnum in range(minlen, rowlen + 1):
        for cols in combinations(ts, cnum):
            print('cnum', cnum)
            patsup = ts[list(cols)].all(axis=1).sum()
            patsup = float(patsup) / collen
            pattern.append([",".join(cols), patsup])
    print('appr_4')
    sdf = pandas.DataFrame(pattern, columns=["Pattern", "Support"])
    print('appr_5')
    results = sdf[sdf.Support >= support]
    print('appr_6')
    return results
```
```python
def doOneHot(X_train, X_test):
    res = X_test[['instanceID']]
    X_test.drop('instanceID', axis=1, inplace=True)
    data = X_train.append(X_test, ignore_index=True)
    del X_train, X_test
    gc.collect()

    features_trans = ['gender', 'appCategory_main', 'connectionType']
    data = pd.get_dummies(data, columns=features_trans)

    X_train = data.loc[data['label'] != -1, :]
    X_test = data.loc[data['label'] == -1, :]
    X_test.loc[:, 'instanceID'] = res.values
    del data
    gc.collect()
    return X_train, X_test
```
```python
def prepare_gss(onehot=True):
    data = pd.read_csv('../data/GSShappiness.csv')
    del data['year']
    del data['id']
    data = data.dropna()
    target = "Happiness level"
    X = data[list(set(data.columns) - set([target]))]
    y = data[target]
    if onehot:
        X = pd.get_dummies(X)
    return X, y
```
```python
def thunder():
    if os.path.exists('../dataset/thunder.pkl'):
        return pd.read_pickle('../dataset/thunder.pkl')

    thunder_df = pd.read_csv('../input/thunder.csv', names=[
        'datetime',  # observation time
        'lat',       # latitude (decimal degrees)
        'lon',       # longitude (decimal degrees)
        'type'       # lightning type: CG (cloud-to-ground) or IC (intra-cloud)
    ])
    # parse the datetime column
    thunder_df.datetime = pd.to_datetime(thunder_df.datetime)
    # observation_point_df.to_pickle('../dataset/observation_point.pkl')
    thunder_df = pd.concat([thunder_df, pd.get_dummies(thunder_df.type)], axis=1)
    thunder_df.to_pickle('../dataset/thunder_df.pkl')
    return thunder_df
```
```python
def load_data():
    data = pd.read_csv('data/train.csv')

    # drop rows with empty features / gaps in columns
    data = data.dropna()

    # Categorical values into numerical (one hot encoding)
    one_hot_embarked = pd.get_dummies(data['Embarked'], prefix='embarked')
    data = data.join(one_hot_embarked)

    one_hot_pclass = pd.get_dummies(data['Pclass'], prefix='pclass')
    data = data.join(one_hot_pclass)

    # The sex column has only two values (M/F), so only one column is required
    # for encoding (0/1), instead of one hot encoding with two columns
    data['sex'] = data.apply(lambda x: 1 if (x['Sex'] == 'female') else 0, axis=1)

    # Drop features not used for training the model
    data = data.drop(['Cabin', 'Name', 'PassengerId', 'Pclass', 'Sex', 'Ticket', 'Embarked'], axis=1)

    return data.drop(['Survived'], axis=1), data[['Survived']]
```
```python
def parse_context_dmop(path):
    df = read(path, "dmop")

    # ATTT-A and ATTT-B are different
    attt = df[df['subsystem'].str.startswith("ATTT")]
    attt['subsystem'] = attt['subsystem'].str[:3] + attt['subsystem'].str[-1]
    df = pd.concat([attt, df])

    # take the first 4 chars
    df['subsystem'] = df['subsystem'].str[:4]

    # convert to 1 / 0
    df = pd.get_dummies(df.subsystem)
    df = df.resample("1h").sum().fillna(0.0)
    df['sum_dmop'] = df.sum(axis=1)
    return df
```
```python
def parse_context_ftl(path):
    raw = read(path, "ftl")

    df = raw.copy()
    df['ut_ms'] = pd.to_datetime(raw['utb_ms'], unit='ms')
    df.sort_values("ut_ms", inplace=True)

    # dummies
    df = df.set_index('ut_ms')
    dummies = pd.get_dummies(df.type).join(df['flagcomms'], how="outer")
    dummies = dummies.resample("1h").sum().fillna(0.0)

    df = raw.copy()
    df['event'] = df.type + df.flagcomms.astype("str")
    del df['type'], df['flagcomms']
    df['ute_ms'] = pd.to_datetime(df['ute_ms'], unit='ms')
    df['utb_ms'] = pd.to_datetime(df['utb_ms'], unit='ms')

    durations = [event_to_min_per_hour(df, event) for event in df.event.unique()]
    durations = pd.concat(durations, axis=1).fillna(0)

    return dummies.join(durations, how="outer")
```
```python
def predict():
    if clf:
        try:
            json_ = request.json
            query = pd.get_dummies(pd.DataFrame(json_))

            # https://github.com/amirziai/sklearnflask/issues/3
            # Thanks to @lorenzori
            query = query.reindex(columns=model_columns, fill_value=0)

            prediction = list(clf.predict(query))

            return jsonify({'prediction': prediction})

        except Exception, e:
            return jsonify({'error': str(e), 'trace': traceback.format_exc()})
    else:
        print 'train first'
        return 'no model here'
```
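The `reindex` call above is what keeps serving-time dummy columns aligned with the columns the model was trained on. A standalone sketch of that idea (the variable names here are illustrative, not taken from the project):

```python
import pandas as pd

train = pd.DataFrame({"city": ["NY", "LA"], "x": [1, 2]})
model_columns = pd.get_dummies(train).columns  # columns seen at training time

# At prediction time a request may contain unseen categories and miss known ones.
query = pd.DataFrame({"city": ["SF"], "x": [3]})
query = pd.get_dummies(query).reindex(columns=model_columns, fill_value=0)
# 'city_SF' (unseen) is dropped; 'city_LA' and 'city_NY' become all-zero columns.
```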
```python
def preprocess(file, istrian):
    df = pd.read_csv(file, parse_dates=['Date'], dayfirst=True)
    end_missing = ['Average_Atmospheric_Pressure', 'Max_Atmospheric_Pressure',
                   'Min_Atmospheric_Pressure', 'Min_Ambient_Pollution', 'Max_Ambient_Pollution']
    df = df.fillna(-1)
    if istrian:
        outcome = df.Footfall
        df = df.drop(['Footfall'], axis=1)
    else:
        outcome = np.nan
    df['month'] = df['Date'].apply(lambda x: x.month)
    df['date'] = df['Date'].apply(lambda x: x.day)
    df['weekday'] = df['Date'].apply(lambda x: x.weekday())
    df['sardiya'] = df['month'].apply(lambda x: 1 if x in [1, 2, 11, 12, 3] else 0)
    df.date = df.date.apply(get_normal_date)
    dummies = pd.get_dummies(df.Park_ID, prefix='park')
    dummies = pd.get_dummies(df.Location_Type, prefix='location')
    df['Direction_Of_Wind2'] = df.Direction_Of_Wind.apply(get_wind_dir)
    return df, outcome


# load training set
```
```python
def get_comment_product_fea(endtime):
    enddt = pd.to_datetime(endtime, format='%Y-%m-%d')
    if enddt == pd.to_datetime('2016-04-15', format='%Y-%m-%d'):
        commentdata = pd.read_csv(FilePath + CommentFile)
        commentdata = commentdata[(commentdata["dt"] == "2016-04-15")]
        commentdata = commentdata.sort_values(by="sku_id").reset_index()[
            ["sku_id", "comment_num", "has_bad_comment", "bad_comment_rate"]]
        return commentdata
    else:
        startdt = enddt - pd.Timedelta(days=7)
        commentpath = FilePath + CommentFile
        commentdata_ALL = pd.read_csv(commentpath)  # read the comment data
        commentdata_ALL.dt = pd.to_datetime(commentdata_ALL.dt, format='%Y-%m-%d')  # parse dt as a date
        comment = commentdata_ALL[(commentdata_ALL.dt <= enddt) & (commentdata_ALL.dt > startdt)]
        df = pd.get_dummies(comment['comment_num'], prefix='comment_num')
        comment = pd.concat([comment, df], axis=1)
        comment = comment[['sku_id', 'has_bad_comment', 'bad_comment_rate',
                           'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]
        sorted_comment = comment.sort_values(by=['sku_id']).reset_index().drop('index', 1)
        # sorted_comment.to_csv(FilePath + 'skuFeaInComment_before' + str(enddt), index=False)
        return sorted_comment
```
```python
def get_action_feat(start_time, end_time, action_data):
    actions = action_data[(action_data['time'] >= start_time) & (action_data['time'] <= end_time)]
    # actions = get_actions(start_time, end_time)
    # actions = actions[actions['cate'] == 8]
    actions = actions[['user_id', 'sku_id', 'type']]
    df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_time, end_time))
    actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
    actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
    actions.fillna(0, inplace=True)
    name = '%s-%s-action' % (start_time, end_time)
    actions[name + '_1256'] = (actions[name + '_1'] + actions[name + '_2']
                               + actions[name + '_5'] + actions[name + '_6'])
    actions[name + '_1256_d_4'] = actions[name + '_4'] / actions[name + '_1256']
    del actions['type']
    # action_fea_file = 'action_fea_' + STARTdt_str + 'to' + ENDdt_str + '.csv'
    # action_fea.to_csv(FilePath + action_fea_file, index=False)
    return actions
```
```python
def get_basic_user_fea():
    user = pd.read_csv(FilePath + UserFile, encoding='gbk')
    # user['age'] = user['age'].map(convert_age)
    user['age'] = user['age'].replace(
        [u'16-25岁', u'26-35岁', u'36-45岁', u'46-55岁', u'56岁以上'], [1, 2, 3, 4, 5])
    user = user[((user['age'] == 1) | (user['age'] == 2) | (user['age'] == 3) |
                 (user['age'] == 4) | (user['age'] == 5) | (user['age'] == -1))]
    age_df = pd.get_dummies(user["age"], prefix="age")
    sex_df = pd.get_dummies(user["sex"], prefix="sex")
    user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
    user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
    user.to_csv(FilePath + 'user_basic_fea.csv', index=False)
    return user
```
```python
def next_batch(self):
    df = self.batch_df[self.pointer]
    x = np.array([d[0] for d in df])
    xl = np.array([d[1] for d in df])
    xr = np.array([d[2] for d in df])
    tar = np.array([d[3] for d in df])
    y = np.array([d[-1] for d in df])
    y = pd.get_dummies(y).values.astype(np.int32)
    seq_len = [len(seq) for seq in x]
    seq_len_l = [len(seq) for seq in xl]
    seq_len_r = [len(seq) for seq in xr]
    if self.dynamic_padding:
        x = np.array(self.pad_minibatches(x, 'RIGHT'))
        xl = np.array(self.pad_minibatches(xl, 'RIGHT'))
        xr = np.array(self.pad_minibatches(xr, 'RIGHT'))
    self.pointer += 1
    return x, y, seq_len, xl, seq_len_l, xr, seq_len_r, tar
```
```python
def next_batch(self):
    df = self.batch_df[self.pointer]
    x = np.array([d[0] for d in df])
    xl = np.array([d[1] for d in df])
    xr = np.array([d[2] for d in df])
    tar = np.array([d[3] for d in df])
    y = np.array([d[-1] for d in df])
    # y = pd.get_dummies(y).values.astype(np.int32)
    seq_len = [len(seq) for seq in x]
    seq_len_l = [len(seq) for seq in xl]
    seq_len_r = [len(seq) for seq in xr]
    if self.dynamic_padding:
        x = np.array(self.pad_minibatches(x, 'RIGHT'))
        xl = np.array(self.pad_minibatches(xl, 'RIGHT'))
        xr = np.array(self.pad_minibatches(xr, 'RIGHT'))
    self.pointer += 1
    return x, y, seq_len, xl, seq_len_l, xr, seq_len_r, tar
```
```python
def load_data(in_file):
    # read csv file prepared by freddie_data_analysis module
    df = pd.read_csv(in_file)

    # drop unneeded columns
    columns = df.columns.tolist()
    for col in columns:
        if 'Unnamed' in col:
            df.drop(col, axis=1, inplace=True)
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
    df.drop(['published_date'], axis=1, inplace=True)

    # replace nan values with 0
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)

    # apply get_dummies to particular columns
    df = pd.get_dummies(df, prefix=['state'], columns=['property_state'])
    df = pd.get_dummies(df, prefix=['ss'], columns=['special_servicer'])

    # return prepared dataframe
    return df
```
```python
def gen_fer2013_csv(csv_path, reshape_width=48, reshape_height=48):
    data = pd.read_csv(csv_path)
    pixels = data['pixels'].tolist()
    width, height = 48, 48
    faces = []
    for pixel_sequence in pixels:
        face = [int(pixel) for pixel in pixel_sequence.split(' ')]
        face = np.asarray(face).reshape(width, height)
        face = cv2.resize(face.astype('uint8'), (reshape_width, reshape_height))
        faces.append(face.astype('float32'))
    faces = np.asarray(faces)
    faces = np.expand_dims(faces, -1)
    emotions = pd.get_dummies(data['emotion']).as_matrix()
    return faces, emotions
```
```python
def make_x(self, df):
    x_spec = self.get_individualised_x_spec()
    X = df[XY.reduce_tuples(
        [a for a, b in x_spec if b != 'linear_by_categorical']
    )].copy()
    cats = XY.reduce_tuples(
        [a for a, b in x_spec if b == 'categorical' or b == 'ordinal']
    )
    X = self.prep_work(X, x_spec)
    X = pd.get_dummies(
        X, prefix=cats, prefix_sep='_',
        columns=cats, drop_first=False, dummy_na=False
    )
    return X
```
```python
def get_comments_product_feat(start_date, end_date):
    dump_path = './cache/comments_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        comments = pickle.load(open(dump_path))
    else:
        comments = pd.read_csv(comment_path)
        comment_date_end = end_date
        comment_date_begin = comment_date[0]
        for date in reversed(comment_date):
            if date < comment_date_end:
                comment_date_begin = date
                break
        comments = comments[(comments.dt >= comment_date_begin) & (comments.dt < comment_date_end)]
        df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
        comments = pd.concat([comments, df], axis=1)  # type: pd.DataFrame
        # del comments['dt']
        # del comments['comment_num']
        comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate',
                             'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]
        pickle.dump(comments, open(dump_path, 'w'))
    return comments
```
```python
def get_accumulate_product_feat(start_date, end_date):
    feature = ['sku_id', 'product_action_1_ratio', 'product_action_2_ratio',
               'product_action_3_ratio', 'product_action_5_ratio', 'product_action_6_ratio']
    dump_path = './cache/product_feat_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['sku_id'], df], axis=1)
        actions = actions.groupby(['sku_id'], as_index=False).sum()
        actions['product_action_1_ratio'] = actions['action_4'] / actions['action_1']
        actions['product_action_2_ratio'] = actions['action_4'] / actions['action_2']
        actions['product_action_3_ratio'] = actions['action_4'] / actions['action_3']
        actions['product_action_5_ratio'] = actions['action_4'] / actions['action_5']
        actions['product_action_6_ratio'] = actions['action_4'] / actions['action_6']
        actions = actions[feature]
        pickle.dump(actions, open(dump_path, 'w'))
    return actions
```
```python
def get_basic_user_feat():
    dump_path = './cache/basic_user.csv'  # one-hot coding age, sex, lv-cd
    if os.path.exists(dump_path):
        # user = pickle.load(open(dump_path))
        user = pd.read_csv(dump_path)
    else:
        user = pd.read_csv(user_path, encoding='gbk')
        user['age'] = user['age'].map(convert_age)  # map age strings to codes
        user['user_reg_tm'] = user['user_reg_tm'].map(convert_reg_date)
        age_df = pd.get_dummies(user["age"], prefix="age")
        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        # user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user = pd.concat([user[['user_id', 'user_reg_tm', 'user_lv_cd']], age_df, sex_df], axis=1)
        # pickle.dump(user, open(dump_path, 'w'))
        user.to_csv(dump_path, index=False, encoding='utf-8')
    print 'finish get basic user info'
    return user
```
```python
def get_basic_product_feat():
    dump_path = './cache/basic_product.csv'  # one-hot coding a1, a2, a3
    if os.path.exists(dump_path):
        # product = pickle.load(open(dump_path))
        product = pd.read_csv(dump_path)
    else:
        product = pd.read_csv(product_path)
        attr1_df = pd.get_dummies(product["a1"], prefix="a1")
        attr2_df = pd.get_dummies(product["a2"], prefix="a2")
        attr3_df = pd.get_dummies(product["a3"], prefix="a3")
        cate_df = pd.get_dummies(product['cate'], prefix='cate')
        brand_df = pd.get_dummies(product['brand'], prefix='brand')
        # product = pd.concat([product[['sku_id','brand']], attr1_df, attr2_df, attr3_df, cate_df], axis=1)
        product = pd.concat([product[['sku_id', 'brand']], attr1_df, attr2_df, attr3_df,
                             brand_df, cate_df], axis=1)
        # pickle.dump(product, open(dump_path, 'w'))
        product.to_csv(dump_path, index=False)
    print 'finish get basic product info'
    return product
```
```python
def get_action_feat(start_date, end_date):
    '''
    Action types: 1 = browse, 2 = add to cart, 3 = remove from cart,
    4 = place order, 5 = follow, 6 = click.
    '''
    dump_path = './cache/action_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        # actions = pickle.load(open(dump_path))
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        # actions = pd.read_csv(action_1_path)
        actions = actions[['user_id', 'sku_id', 'type']]
        df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        # aggregate the one-hot action counts per (user_id, sku_id)
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del actions['type']
        # pickle.dump(actions, open(dump_path, 'w'))
        actions.to_csv(dump_path, index=False)
    print 'finish get action feat'
    return actions
```
```python
def get_accumulate_brand_feat(start_date, end_date):
    feature = ['brand', 'brand_action_1_ratio', 'brand_action_2_ratio', 'brand_action_3_ratio',
               'brand_action_5_ratio', 'brand_action_6_ratio', 'brand_action_num']
    dump_path = './cache/brand_feat_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['brand'], df], axis=1)
        actions = actions.groupby(['brand'], as_index=False).sum()
        actions['brand_action_1_ratio'] = actions['action_4'] / actions['action_1']
        actions['brand_action_2_ratio'] = actions['action_4'] / actions['action_2']
        actions['brand_action_3_ratio'] = actions['action_4'] / actions['action_3']
        actions['brand_action_5_ratio'] = actions['action_4'] / actions['action_5']
        actions['brand_action_6_ratio'] = actions['action_4'] / actions['action_6']
        actions['brand_action_num'] = (actions['action_1'] + actions['action_2'] + actions['action_3']
                                       + actions['action_4'] + actions['action_5'] + actions['action_6'])
        actions = actions[feature]
        actions = actions.replace(np.inf, 9999)  # cap infinite ratios caused by division by zero
        actions.to_csv(dump_path)
    return actions
```
```python
def load_data(shuffle=True, n_cols=None):
    train_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.train.csv')
    test_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.iloc[:, 2:].as_matrix()
    X_test = df_test.iloc[:, 2:].as_matrix()
    y_train = pd.get_dummies(df_train[['cancer_type']]).as_matrix()
    y_test = pd.get_dummies(df_test[['cancer_type']]).as_matrix()

    return (X_train, y_train), (X_test, y_test)
```
```python
def build(self):
    train, _, test, _ = data.get()
    cset = []
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns
    for sc in ['height', 'weight', 'ap_hi', 'ap_lo']:
        tc = df[sc].apply(str)
        maxc = tc.apply(len).max()
        for n in range(maxc):
            df['ft_l_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[n]) if n < len(s) else -1)
            df['ft_r_' + sc + '_' + str(n)] = tc.apply(lambda s: ord(s[-n]) if n < len(s) else -1)
            cset.append('ft_l_' + sc + '_' + str(n))
            cset.append('ft_r_' + sc + '_' + str(n))
    df = pd.get_dummies(df, columns=cset).drop(to_drop, axis=1)
    self.train_ = df[:ntrain]
    self.test_ = df[ntrain:]
    return self.train_, self.test_, None
```
```python
def build(self):
    train, y, test, _ = data.get()
    ntrain = len(train)
    df = pd.concat([train, test], axis=0)
    to_drop = df.columns

    dcn = []
    for n in [2, 5, 10, 15, 25]:
        cname = 'kmeans_' + str(n)
        dcn.append(cname)
        df[cname] = cluster.KMeans(n_clusters=n).fit_predict(df)
    df = pd.get_dummies(df, columns=dcn)
    df = df.drop(to_drop, axis=1)

    train = df[:ntrain]
    test = df[ntrain:].copy()
    return train.astype('int32'), test.astype('int32'), None
```