【Kaggle】Practice Competition "Forecasting Sticker Sales" (Part 2)
Introduction
In the previous part we used per-country GDP data to fill in the missing values of the target label. Following that line of thought, can the same principle be used to forecast future sales?
The approach, in outline: first aggregate over all countries, products, and stores; then forecast the total daily sales for the next three years; and finally disaggregate that forecast back into per-country, per-product, and per-store sales.
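In other words, the final forecast is assembled multiplicatively. As a sketch of the decomposition built step by step below:

num_sold(date, country, store, product) ≈ daily_total(date) × store_weight(store) × country_ratio(year, country) × product_ratio(date, product)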
This article covers the first Kaggle monthly Playground competition of 2025, Forecasting Sticker Sales. Like the earlier ones it is beginner-friendly, but unlike them the dataset is a time series: judging from the task, we predict future sales from past sales. This installment therefore first revisits how the missing values are filled, then moves on to the forecast itself.
Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
Load data
train_df = pd.read_csv("/kaggle/input/playground-series-s5e1/train.csv", parse_dates=["date"])
test_df = pd.read_csv("/kaggle/input/playground-series-s5e1/test.csv", parse_dates=["date"])
print("Train - Earliest date:", train_df["date"].min())
print("Train - Latest date:", train_df["date"].max())
print("Test - Earliest date:", test_df["date"].min())
print("Test - Latest date:", test_df["date"].max())
Train - Earliest date: 2010-01-01 00:00:00
Train - Latest date: 2016-12-31 00:00:00
Test - Earliest date: 2017-01-01 00:00:00
Test - Latest date: 2019-12-31 00:00:00
Get the per-country weights
gdp_per_capita_df = pd.read_csv("/kaggle/input/world-gdpgdp-gdp-per-capita-and-annual-growths/gdp_per_capita.csv")
years = ["2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020"]
gdp_per_capita_filtered_df = gdp_per_capita_df.loc[gdp_per_capita_df["Country Name"].isin(train_df["country"].unique()), ["Country Name"] + years].set_index("Country Name")
gdp_per_capita_filtered_df["2010_ratio"] = gdp_per_capita_filtered_df["2010"] / gdp_per_capita_filtered_df.sum()["2010"]
for year in years:
gdp_per_capita_filtered_df[f"{year}_ratio"] = gdp_per_capita_filtered_df[year] / gdp_per_capita_filtered_df.sum()[year]
gdp_per_capita_filtered_ratios_df = gdp_per_capita_filtered_df[[i+"_ratio" for i in years]]
gdp_per_capita_filtered_ratios_df.columns = [int(i) for i in years]
gdp_per_capita_filtered_ratios_df = gdp_per_capita_filtered_ratios_df.unstack().reset_index().rename(columns = {"level_0": "year", 0: "ratio", "Country Name": "country"})
gdp_per_capita_filtered_ratios_df['year'] = pd.to_datetime(gdp_per_capita_filtered_ratios_df['year'], format='%Y')
Unlike the previous part, we also need these ratios for the future years covered by the test set.
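A quick optional check: within each year, the six countries' ratios should sum to 1.

# Optional check: per-year ratios across the six countries should sum to 1
print(gdp_per_capita_filtered_ratios_df.groupby("year")["ratio"].sum())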
Fill the missing values
gdp_per_capita_filtered_ratios_df["year"] = gdp_per_capita_filtered_ratios_df["year"].dt.year
train_df_imputed = train_df.copy()
print(f"Missing values remaining: {train_df_imputed['num_sold'].isna().sum()}")
train_df_imputed["year"] = train_df_imputed["date"].dt.year
for year in train_df_imputed["year"].unique():
    # Impute Time Series 1 (Canada, Discount Stickers, Holographic Goose)
    target_ratio = gdp_per_capita_filtered_ratios_df.loc[(gdp_per_capita_filtered_ratios_df["year"] == year) & (gdp_per_capita_filtered_ratios_df["country"] == "Norway"), "ratio"].values[0] # Norway is the reference, as it should give the best precision
    current_ratio = gdp_per_capita_filtered_ratios_df.loc[(gdp_per_capita_filtered_ratios_df["year"] == year) & (gdp_per_capita_filtered_ratios_df["country"] == "Canada"), "ratio"].values[0]
    ratio_can = current_ratio / target_ratio
    train_df_imputed.loc[(train_df_imputed["country"] == "Canada") & (train_df_imputed["store"] == "Discount Stickers") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year), "num_sold"] = (train_df_imputed.loc[(train_df_imputed["country"] == "Norway") & (train_df_imputed["store"] == "Discount Stickers") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year), "num_sold"] * ratio_can).values
    # Impute Time Series 2 (Only Missing Values)
    current_ts = train_df_imputed.loc[(train_df_imputed["country"] == "Canada") & (train_df_imputed["store"] == "Premium Sticker Mart") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year)]
    missing_ts_dates = current_ts.loc[current_ts["num_sold"].isna(), "date"]
    train_df_imputed.loc[(train_df_imputed["country"] == "Canada") & (train_df_imputed["store"] == "Premium Sticker Mart") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] = (train_df_imputed.loc[(train_df_imputed["country"] == "Norway") & (train_df_imputed["store"] == "Premium Sticker Mart") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] * ratio_can).values
    # Impute Time Series 3 (Only Missing Values)
    current_ts = train_df_imputed.loc[(train_df_imputed["country"] == "Canada") & (train_df_imputed["store"] == "Stickers for Less") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year)]
    missing_ts_dates = current_ts.loc[current_ts["num_sold"].isna(), "date"]
    train_df_imputed.loc[(train_df_imputed["country"] == "Canada") & (train_df_imputed["store"] == "Stickers for Less") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] = (train_df_imputed.loc[(train_df_imputed["country"] == "Norway") & (train_df_imputed["store"] == "Stickers for Less") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] * ratio_can).values
    # Impute Time Series 4 (Kenya, Discount Stickers, Holographic Goose)
    current_ratio = gdp_per_capita_filtered_ratios_df.loc[(gdp_per_capita_filtered_ratios_df["year"] == year) & (gdp_per_capita_filtered_ratios_df["country"] == "Kenya"), "ratio"].values[0]
    ratio_ken = current_ratio / target_ratio
    train_df_imputed.loc[(train_df_imputed["country"] == "Kenya") & (train_df_imputed["store"] == "Discount Stickers") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year), "num_sold"] = (train_df_imputed.loc[(train_df_imputed["country"] == "Norway") & (train_df_imputed["store"] == "Discount Stickers") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year), "num_sold"] * ratio_ken).values
    # Impute Time Series 5 (Only Missing Values)
    current_ts = train_df_imputed.loc[(train_df_imputed["country"] == "Kenya") & (train_df_imputed["store"] == "Premium Sticker Mart") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year)]
    missing_ts_dates = current_ts.loc[current_ts["num_sold"].isna(), "date"]
    train_df_imputed.loc[(train_df_imputed["country"] == "Kenya") & (train_df_imputed["store"] == "Premium Sticker Mart") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] = (train_df_imputed.loc[(train_df_imputed["country"] == "Norway") & (train_df_imputed["store"] == "Premium Sticker Mart") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] * ratio_ken).values
    # Impute Time Series 6 (Only Missing Values)
    current_ts = train_df_imputed.loc[(train_df_imputed["country"] == "Kenya") & (train_df_imputed["store"] == "Stickers for Less") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year)]
    missing_ts_dates = current_ts.loc[current_ts["num_sold"].isna(), "date"]
    train_df_imputed.loc[(train_df_imputed["country"] == "Kenya") & (train_df_imputed["store"] == "Stickers for Less") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] = (train_df_imputed.loc[(train_df_imputed["country"] == "Norway") & (train_df_imputed["store"] == "Stickers for Less") & (train_df_imputed["product"] == "Holographic Goose") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] * ratio_ken).values
    # Impute Time Series 7 (Only Missing Values)
    current_ts = train_df_imputed.loc[(train_df_imputed["country"] == "Kenya") & (train_df_imputed["store"] == "Discount Stickers") & (train_df_imputed["product"] == "Kerneler") & (train_df_imputed["year"] == year)]
    missing_ts_dates = current_ts.loc[current_ts["num_sold"].isna(), "date"]
    train_df_imputed.loc[(train_df_imputed["country"] == "Kenya") & (train_df_imputed["store"] == "Discount Stickers") & (train_df_imputed["product"] == "Kerneler") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] = (train_df_imputed.loc[(train_df_imputed["country"] == "Norway") & (train_df_imputed["store"] == "Discount Stickers") & (train_df_imputed["product"] == "Kerneler") & (train_df_imputed["year"] == year) & (train_df_imputed["date"].isin(missing_ts_dates)), "num_sold"] * ratio_ken).values
# Check how many missing values remain after the loop
print(f"Missing values remaining: {train_df_imputed['num_sold'].isna().sum()}")
The imputation method itself is the same as in the previous part.
Missing values remaining: 8871  # before imputation
Missing values remaining: 2  # after imputation
Only two missing values remain in the entire training set.
Fill the last missing values directly with hand-picked constants.
missing_rows = train_df_imputed.loc[train_df_imputed["num_sold"].isna()]
train_df_imputed.loc[train_df_imputed["id"] == 23719, "num_sold"] = 5
train_df_imputed.loc[train_df_imputed["id"] == 207003, "num_sold"] = 300
print(f"Missing values remaining: {train['num_sold'].isna().sum()}")
Missing values remaining: 0
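Where do the constants 5 and 300 come from? They are presumably picked by eyeballing the days around each gap. A small optional sketch of how one might inspect those neighbourhoods (the 3-day window is an arbitrary choice):

# Inspect a few days around each remaining gap to pick a plausible fill value
for _, row in missing_rows.iterrows():
    window = (
        (train_df_imputed["country"] == row["country"])
        & (train_df_imputed["store"] == row["store"])
        & (train_df_imputed["product"] == row["product"])
        & train_df_imputed["date"].between(row["date"] - pd.Timedelta(days=3),
                                           row["date"] + pd.Timedelta(days=3))
    )
    display(train_df_imputed.loc[window, ["date", "num_sold"]])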
Compute the per-store sales weights
We aggregate by store because, as found during EDA, each store accounts for the same share of sales every day.
store_weights = train_df_imputed.groupby("store")["num_sold"].sum() / train_df_imputed["num_sold"].sum()
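If you want to verify the constant-share claim, a quick optional check is to compute each store's share of every day's sales and confirm it barely varies:

# Optional check: each store's share of daily sales should be (near) constant
daily_share = (
    train_df_imputed.groupby(["date", "store"])["num_sold"].sum()
    .groupby(level="date")
    .transform(lambda s: s / s.sum())
)
print(daily_share.groupby(level="store").std())  # values close to 0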
Product weights
Next we aggregate by product. As found during EDA, each product's share of sales follows a smooth cyclical pattern that could be fit with trigonometric functions.
product_df = train_df_imputed.groupby(["date","product"])["num_sold"].sum().reset_index()
product_ratio_df = product_df.pivot(index="date", columns="product", values="num_sold")
product_ratio_df = product_ratio_df.apply(lambda x: x/x.sum(),axis=1)
product_ratio_df = product_ratio_df.stack().rename("ratios").reset_index()
# Project the ratios into the test years by reusing recent history:
# 2015 -> 2017, 2016 -> 2018, 2015 -> 2019 (via +2/+2/+4 year offsets)
product_ratio_2017_df = product_ratio_df.loc[product_ratio_df["date"].dt.year == 2015].copy()
product_ratio_2018_df = product_ratio_df.loc[product_ratio_df["date"].dt.year == 2016].copy()
product_ratio_2019_df = product_ratio_df.loc[product_ratio_df["date"].dt.year == 2015].copy()
product_ratio_2017_df["date"] = product_ratio_2017_df["date"] + pd.DateOffset(years=2)
product_ratio_2018_df["date"] = product_ratio_2018_df["date"] + pd.DateOffset(years=2)
product_ratio_2019_df["date"] = product_ratio_2019_df["date"] + pd.DateOffset(years=4)
forecasted_ratios_df = pd.concat([product_ratio_2017_df, product_ratio_2018_df, product_ratio_2019_df])
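The code above simply reuses historical ratios shifted forward. As an alternative, and to illustrate the trigonometric claim, here is a minimal sketch that fits a one-year sinusoid to a single product's share by least squares; the yearly period and the choice of the "Kerneler" product are just illustrative:

import numpy as np

# Fit ratio(t) ~ a*sin(wt) + b*cos(wt) + c with a one-year period
kerneler = product_ratio_df.loc[product_ratio_df["product"] == "Kerneler"]
t = (kerneler["date"] - kerneler["date"].min()).dt.days.to_numpy(dtype=float)
y = kerneler["ratios"].to_numpy()
omega = 2 * np.pi / 365.25
X = np.column_stack([np.sin(omega * t), np.cos(omega * t), np.ones_like(t)])
coef, *_ = np.linalg.lstsq(X, y, rcond=None)
print(coef)  # [a, b, c]: sine/cosine amplitudes and the mean share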
Aggregate the time series
Aggregate over all countries, stores, and products to get total daily sales; this is the new time series we will forecast.
original_train_df_imputed = train_df_imputed.copy()
train_df_imputed = train_df_imputed.groupby(["date"])["num_sold"].sum().reset_index()
Plot the whole series over all dates, as in the previous part:
f,ax = plt.subplots(figsize=(20,9))
sns.lineplot(data=train_df_imputed, x="date", y="num_sold", linewidth = 1);
train_df_imputed["year"] = train_df_imputed["date"].dt.year
train_df_imputed["month"] = train_df_imputed["date"].dt.month
train_df_imputed["day"] = train_df_imputed["date"].dt.day
train_df_imputed["day_of_week"] = train_df_imputed["date"].dt.dayofweek
Next we account for the day-of-week effect by removing its influence from the daily totals. For example, Saturdays sell more than most other days, so we divide those sales by a ratio greater than 1 (equivalently, multiply by a number smaller than 1) to strip out the variation caused by the day of the week. The result is a sales series normalized for day of week; concretely, with Sunday's ratio of about 1.169 printed below, a Sunday with 1169 sales is normalized to roughly 1000.
day_of_week_ratio = (train_df_imputed.groupby("day_of_week")["num_sold"].mean() / train_df_imputed.groupby("day_of_week")["num_sold"].mean().mean()).rename("day_of_week_ratios")
display(day_of_week_ratio)
train_df_imputed = pd.merge(train_df_imputed, day_of_week_ratio, how="left", on="day_of_week")
train_df_imputed["adjusted_num_sold"] = train_df_imputed["num_sold"] / train_df_imputed["day_of_week_ratios"]
day_of_week
0 0.943221
1 0.943352
2 0.943859
3 0.943747
4 1.000503
5 1.056331
6 1.168988
Name: day_of_week_ratios, dtype: float64
sanity_check = (train_df_imputed["num_sold"].sum() - train_df_imputed["adjusted_num_sold"].sum()) / train_df_imputed["num_sold"].sum()
print(f"This number should be very small {sanity_check:.6f}")
This number should be very small 0.000022
This is just a sanity check that the normalization didn't introduce an error.
Make the forecast
We forecast by taking, for each calendar day of the year, the mean adjusted sales over the past x years. Using the full history may be a good idea, but if there is any trend over time it is better to use only the last few years. Feel free to experiment with this value.
train_last_x_years_df = train_df_imputed.loc[train_df_imputed["year"] >= 2010]  # here x = all available years
train_day_mean_df = train_last_x_years_df.groupby(["month", "day"])["adjusted_num_sold"].mean().reset_index()
# Build the frame of dates to forecast (one row per test date)
test_total_sales_df = test_df[["date"]].drop_duplicates().reset_index(drop=True)
test_total_sales_df["month"] = test_total_sales_df["date"].dt.month
test_total_sales_df["day"] = test_total_sales_df["date"].dt.day
test_total_sales_df["day_of_week"] = test_total_sales_df["date"].dt.dayofweek
test_total_sales_df = pd.merge(test_total_sales_df, train_day_mean_df, how="left", on=["month", "day"])
# Applying the day-of-week ratios back, e.g. scaling sales up again if the day is a Saturday
test_total_sales_df = pd.merge(test_total_sales_df, day_of_week_ratio.reset_index(), how="left", on="day_of_week")
test_total_sales_df["num_sold"] = test_total_sales_df["adjusted_num_sold"] * test_total_sales_df["day_of_week_ratios"]
Plot the forecast sales:
f,ax = plt.subplots(figsize=(20,10))
sns.lineplot(data = pd.concat([train_df_imputed,test_total_sales_df]).reset_index(drop=True), x="date", y="num_sold", linewidth=0.6);
ax.axvline(pd.to_datetime("2017-01-01"), color='black', linestyle='--');
The values to the right of the dashed line are the forecast.
Disaggregate the total sales forecast
We need to split the total daily sales forecast across the categorical variables to get a forecast for every day, country, product, and store.
# Merging in the forecast daily totals
store_weights_df = store_weights.reset_index()
test_sub_df = pd.merge(test_df, test_total_sales_df[["date", "num_sold"]], how="left", on="date")
test_sub_df = test_sub_df.rename(columns = {"num_sold":"day_num_sold"})
# Adding in the store ratios
test_sub_df = pd.merge(test_sub_df, store_weights_df, how="left", on="store")
test_sub_df = test_sub_df.rename(columns = {"num_sold":"store_ratio"})
# Adding in the country ratios
test_sub_df["year"] = test_sub_df["date"].dt.year
test_sub_df = pd.merge(test_sub_df, gdp_per_capita_filtered_ratios_df, how="left", on=["year", "country"])
test_sub_df = test_sub_df.rename(columns = {"ratio":"country_ratio"})
# Adding in the product ratio
test_sub_df = pd.merge(test_sub_df, forecasted_ratios_df, how="left", on=["date", "product"])
test_sub_df = test_sub_df.rename(columns = {"ratios":"product_ratio"})
# Disaggregating the forecast into per-series sales
test_sub_df["num_sold"] = test_sub_df["day_num_sold"] * test_sub_df["store_ratio"] * test_sub_df["country_ratio"] * test_sub_df["product_ratio"]
test_sub_df["num_sold"] = test_sub_df["num_sold"].round()
def plot_individual_ts(df):
    colour_map = {"Canada": "blue", "Finland": "orange", "Italy": "green", "Kenya": "red", "Norway": "purple", "Singapore": "brown"}
    for country in df["country"].unique():
        f, axes = plt.subplots(df["store"].nunique() * df["product"].nunique(), figsize=(20, 70))
        count = 0
        for store in df["store"].unique():
            for product in df["product"].unique():
                plot_df = df.loc[(df["product"] == product) & (df["country"] == country) & (df["store"] == store)]
                sns.lineplot(data=plot_df, x="date", y="num_sold", linewidth=0.5, ax=axes[count], color=colour_map[country])
                axes[count].set_title(f"{country} - {store} - {product}")
                axes[count].axvline(pd.to_datetime("2017-01-01"), color='black', linestyle='--')
                count += 1
Let's look at each of the 90 time series to check whether the forecasts look reasonable:
plot_individual_ts(pd.concat([original_train_df_imputed,test_sub_df]).reset_index(drop=True))
Submission
submission = pd.read_csv("/kaggle/input/playground-series-s5e1/sample_submission.csv")
submission["num_sold"] = test_sub_df["num_sold"]
submission.to_csv('submission.csv', index = False)
display(submission.head(2))
|   | id     | num_sold |
|---|--------|----------|
| 0 | 230130 | 158.0    |
| 1 | 230131 | 1031.0   |