小能豆

如何计算同一列中出现相同值而另一列中出现另一个值的次数?

python

这是我的示例数据框。

```py
‘CUSTOMER_ID’: [31381, 7180, 132, 334, 11, 78],
‘TX_DATE’: [‘2023-06-01’, ‘2024-03-08’, ‘2023-03-07’, ‘2003-03-01’, ‘2020-03-02’, ‘2023-03-08’],
‘TX_WEEK’: [1, 11, 14, 17, 18, 19] }

df_transactions = pd.DataFrame(data)
```

这是我到目前为止的代码,但是,数字是错误的,所以我不知道逻辑有什么问题以及下一步该怎么做。

previous day df_transactions[‘CUSTOMER_TOTAL_1D’] = df_transactions.groupby(‘CUSTOMER_ID’)[‘DAILY_COUNTS’].shift() previous week df_transactions[‘CUSTOMER_TOTAL_1W’] = df_transactions.groupby(‘CUSTOMER_ID’)[‘WEEKLY_COUNTS’].shift() df_transactions.drop(columns=[‘DAILY_COUNTS’,”WEEKLY_COUNTS”], inplace=True) df_transactions

It generates mostly 0 values which is wrong. The sample data frame might generate this but my actual datagram is large so this is not possible.

阅读 137

收藏
2023-09-19

共1个答案

小能豆

示例数据没有足够的行来演示所需的输出。下面的代码使用修改后的测试数据进行说明。这是上周的事情。

import pandas as pd 
data = {
    "CUSTOMER_ID": [718, 718, 718, 1320, 718, 718, 718],
    "TX_DATE": ["2023-02-25", "2023-03-01", "2023-03-08", "2023-03-07", 
                "2023-03-01", "2023-03-02", "2023-03-10"],
    "TX_WEEK": [9, 10, 11, 10, 10, 10, 11]
}
df_transactions = pd.DataFrame(data)
print(df_transactions)
df_transactions["TX_DATE"] = pd.to_datetime(df_transactions["TX_DATE"])
df_wk = df_transactions.groupby(["CUSTOMER_ID", 
                "TX_WEEK"]).size().reset_index(name="CNT_BY_WK")
df_wk["LAST_WEEK"] = df_wk["TX_WEEK"] - 1
print(df_wk)
df = df_wk.merge(df_wk, left_on=["CUSTOMER_ID", "LAST_WEEK"], 
    right_on=["CUSTOMER_ID", "TX_WEEK"], how="left", 
    suffixes=("_CURRENT", "_LAST"))
print(df[["CUSTOMER_ID", "TX_WEEK_CURRENT", "CNT_BY_WK_CURRENT", 
            "CNT_BY_WK_LAST"]].fillna(0))

输出

   CUSTOMER_ID     TX_DATE  TX_WEEK                                                                     
0          718  2023-02-25        9                                                                     
1          718  2023-03-01       10                                                                     
2          718  2023-03-08       11                                                                     
3         1320  2023-03-07       10                                                                     
4          718  2023-03-01       10                                                                     
5          718  2023-03-02       10                                                                     
6          718  2023-03-10       11   

   CUSTOMER_ID  TX_WEEK  CNT_BY_WK  LAST_WEEK                                                           
0          718        9          1          8                                                           
1          718       10          3          9                                                           
2          718       11          2         10                                                           
3         1320       10          1          9     

   CUSTOMER_ID  TX_WEEK_CURRENT  CNT_BY_WK_CURRENT  CNT_BY_WK_LAST                                      
0          718                9                  1             0.0                                      
1          718               10                  3             1.0 <<                                    
2          718               11                  2             3.0 <<                                    
3         1320               10                  1             0.0  

获取客户前一天的交易计数# 与此类似。

df_DAY = df_transactions.groupby(["CUSTOMER_ID", 
                "TX_DATE"]).size().reset_index(name="CNT_BY_DAY")
df_DAY["LAST_DATE"] = df_DAY["TX_DATE"] - pd.DateOffset(days=1)
print(df_DAY)
df = df_DAY.merge(df_DAY, left_on=["CUSTOMER_ID", "LAST_DATE"], 
    right_on=["CUSTOMER_ID", "TX_DATE"], how="left", 
    suffixes=("_CURRENT", "_LAST"))
print(df[["CUSTOMER_ID", "TX_DATE_CURRENT", "CNT_BY_DAY_CURRENT", 
            "CNT_BY_DAY_LAST"]].fillna(0))
   CUSTOMER_ID TX_DATE_CURRENT  CNT_BY_DAY_CURRENT  CNT_BY_DAY_LAST                                     
0          718      2023-02-25                   1              0.0                                     
1          718      2023-03-01                   2              0.0                                     
2          718      2023-03-02                   1              2.0  <<                                   
3          718      2023-03-08                   1              0.0                                     
4          718      2023-03-10                   1              0.0                                     
5         1320      2023-03-07                   1              0.0   

更新

问题:如何组合和合并它,以便仅将前一天的交易 (CUSTOMER_TOTAL_1D) 和上周的交易 (CUSTOMER_TOTAL_1W) 的列保留到我上周的原始数据框中?

df_transactions = pd.DataFrame(data)
print(df_transactions)
df_transactions["TX_DATE"] = pd.to_datetime(df_transactions["TX_DATE"])
df_wk = (df_transactions.groupby(["CUSTOMER_ID", "TX_WEEK"])
            .size().reset_index(name="CNT_BY_WK"))
df_wk["LAST_WEEK"] = df_wk["TX_WEEK"] - 1
df_wk2 = (df_wk
    .merge(df_wk, left_on=["CUSTOMER_ID", "LAST_WEEK"], 
        right_on=["CUSTOMER_ID", "TX_WEEK"], how="left", 
        suffixes=("_CURRENT", "_LAST"))
    .loc[:,["CUSTOMER_ID", "TX_WEEK_CURRENT", "CNT_BY_WK_LAST"]]
    .fillna(0))
df_day = (df_transactions.groupby(["CUSTOMER_ID", "TX_DATE"])
            .size().reset_index(name="CNT_BY_DAY"))
df_day["LAST_DATE"] = df_day["TX_DATE"] - pd.DateOffset(days=1)
df_day2 = (df_day
    .merge(df_day, left_on=["CUSTOMER_ID", "LAST_DATE"], 
        right_on=["CUSTOMER_ID", "TX_DATE"], how="left", 
        suffixes=("_CURRENT", "_LAST"))
    .loc[:, ["CUSTOMER_ID", "TX_DATE_CURRENT", "CNT_BY_DAY_LAST"]].fillna(0))
output = (df_transactions
    .merge(df_day2, left_on=["CUSTOMER_ID", "TX_DATE"], 
      right_on=["CUSTOMER_ID", "TX_DATE_CURRENT"], how="left")
    .merge(df_wk2, left_on=["CUSTOMER_ID", "TX_WEEK"], 
      right_on=["CUSTOMER_ID", "TX_WEEK_CURRENT"], how="left")
    .drop(columns=["TX_DATE_CURRENT", "TX_WEEK_CURRENT"])
    .rename(columns={'CNT_BY_DAY_LAST': 'CUSTOMER_TOTAL_1D',
             'CNT_BY_WK_LAST': 'CUSTOMER_TOTAL_1W'})
    .sort_values(by=['TX_DATE', 'CUSTOMER_ID']))
print(output)

输出

   CUSTOMER_ID    TX_DATE  TX_WEEK  CUSTOMER_TOTAL_1D  CUSTOMER_TOTAL_1W                                
0          718 2023-02-25        9                0.0                0.0                                
1          718 2023-03-01       10                0.0                1.0                                
4          718 2023-03-01       10                0.0                1.0                                
5          718 2023-03-02       10                2.0                1.0                                
3         1320 2023-03-07       10                0.0                0.0                                
2          718 2023-03-08       11                0.0                3.0                                
6          718 2023-03-10       11                0.0                3.0  

更新2

问题:如何找到该消费者前一天和上周的交易中值。

import pandas as pd 
data = {
    "CUSTOMER_ID": [718, 718, 718, 1320, 718, 718, 718],
    "TX_DATE": ["2023-02-25", "2023-03-01", "2023-03-08", "2023-03-07", 
                "2023-03-01", "2023-03-02", "2023-03-10"],
    "TX_WEEK": [9, 10, 11, 10, 10, 10, 11],
    "TX_AMOUNT": [86,89,100,101,191,200,201]
}
df_transactions = pd.DataFrame(data)
# print(df_transactions)
df_transactions["TX_DATE"] = pd.to_datetime(df_transactions["TX_DATE"])

df_wk = (df_transactions.groupby(["CUSTOMER_ID", "TX_WEEK"])
        .agg({"TX_AMOUNT": "median", "TX_DATE": "count"})
        .rename(columns={"TX_AMOUNT": "SPENT_1W","TX_DATE": "CUSTOMER_TOTAL_1W"})
        .reset_index()
        )
df_wk["LAST_WEEK"] = df_wk["TX_WEEK"] - 1
mask_wk = ["CUSTOMER_ID", "TX_WEEK_SHIFT", "CUSTOMER_TOTAL_1W_LAST", "SPENT_1W_LAST"]
df_wk2 = (df_wk
        .merge(df_wk, left_on=["CUSTOMER_ID", "LAST_WEEK"], 
            right_on=["CUSTOMER_ID", "TX_WEEK"], how="left", 
            suffixes=("_SHIFT", "_LAST"))
        .loc[:, mask_wk]
        .fillna(0))

df_day = (df_transactions
        .groupby(["CUSTOMER_ID", "TX_DATE"])
        .agg({"TX_AMOUNT": "median", "TX_WEEK": "count"})
        .rename(columns={"TX_AMOUNT": "SPENT_1D", "TX_WEEK": "CUSTOMER_TOTAL_1D"})
        .reset_index()
        )
df_day["LAST_DATE"] = df_day["TX_DATE"] - pd.DateOffset(days=1)
mask_day = ["CUSTOMER_ID", "TX_DATE_SHIFT", "CUSTOMER_TOTAL_1D_LAST", "SPENT_1D_LAST"]
df_day2 = (df_day
    .merge(df_day, left_on=["CUSTOMER_ID", "LAST_DATE"], 
        right_on=["CUSTOMER_ID", "TX_DATE"], how="left", 
        suffixes=("_SHIFT", "_LAST"))
    .loc[:, mask_day]
    .fillna(0))

output = (df_transactions
    .merge(df_day2, left_on=["CUSTOMER_ID", "TX_DATE"], 
        right_on=["CUSTOMER_ID", "TX_DATE_SHIFT"], how="left")
    .merge(df_wk2, left_on=["CUSTOMER_ID", "TX_WEEK"], 
        right_on=["CUSTOMER_ID", "TX_WEEK_SHIFT"], how="left")
    .drop(columns=["TX_DATE_SHIFT", "TX_WEEK_SHIFT"])
    .rename(columns=lambda x: x.replace('_LAST', ''), inplace=False)
    .sort_values(by=["TX_DATE", "CUSTOMER_ID"]))
print(output)

输出

   CUSTOMER_ID    TX_DATE  TX_WEEK  TX_AMOUNT  CUSTOMER_TOTAL_1D  SPENT_1D  CUSTOMER_TOTAL_1W  SPENT_1W 
0          718 2023-02-25        9         86                0.0       0.0                0.0       0.0 
1          718 2023-03-01       10         89                0.0       0.0                1.0      86.0 
4          718 2023-03-01       10        191                0.0       0.0                1.0      86.0 
5          718 2023-03-02       10        200                2.0     140.0                1.0      86.0 
3         1320 2023-03-07       10        101                0.0       0.0                0.0       0.0 
2          718 2023-03-08       11        100                0.0       0.0                3.0     191.0 
6          718 2023-03-10       11        201                0.0       0.0                3.0     191.0 
2023-09-19