当前位置：首页 > article >正文

Kaggle实战3：Rossmann Store 商店销售额预测——XGBoost 解决回归问题案例1

article 2024/10/7 1:47:16

kaggle实战2信用卡反欺诈逻辑回归模型案例1

数据集下载地址
https://download.csdn.net/download/AnalogElectronic/89844637
https://tianchi.aliyun.com/dataset/89785

加载数据

#预测销售额 回归问题
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import time

# Load the three CSV files from the working directory.
# StateHoliday is forced to text so that it is read with one consistent type
# (it is later mapped from the codes '0', 'a', 'b', 'c' to integers).
# The builtin `str` is used because the `np.string_` alias no longer exists
# in NumPy 2.0.
train = pd.read_csv('./train.csv', dtype={'StateHoliday': str})
test = pd.read_csv('./test.csv', dtype={'StateHoliday': str})
store = pd.read_csv('./store.csv')
# `display` is the IPython/Jupyter helper; this cell is meant to run in a notebook.
display(train.head(), test.head(), store.head())
print(train.shape, test.shape, store.shape)

在这里插入图片描述

缺失值处理

在这里插入图片描述

# Relationship between sales and time, shown for store 1.
cond = train['Sales'] > 0
sales_data = train[cond]  # keep only the rows that actually have sales
# Build the store mask from sales_data itself: a mask taken from the larger
# `train` frame only works through implicit index alignment.
sales_data.loc[sales_data['Store'] == 1].plot(x='Date', y='Sales', title='Store_1',
                                              figsize=(16, 4), color='red')

在这里插入图片描述

# The sales trend in June/July is similar to the one in August/September, and
# the 6 weeks we have to predict fall in August/September 2015.
# The most recent 6 weeks (June/July 2015) of the 1115 stores can therefore be
# held out as validation data for tuning and checking the model.

display(train.shape, test.shape)
# Merge the per-store attributes into both frames.
cond = train['Sales'] > 0
train = train.loc[cond]  # keep only rows with positive sales (zero-sales rows are dropped)
train = train.merge(store, on='Store', how='left')
test = test.merge(store, on='Store', how='left')
# The author notes that test has no Sales column, so it has one column fewer than train.
display(train.shape, test.shape)

在这里插入图片描述

特征工程

%%time
# Feature engineering, applied in place to both train and test.

# Month number -> month abbreviation, used to look the month up in PromoInterval.
month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug',
             9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
# Letter codes -> integers, because string columns cannot be fed to the model.
mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}

for data in [train, test]:
    # Split the 'YYYY-MM-DD' date string into numeric year / month / day columns.
    date_parts = data['Date'].str.split('-', expand=True)
    data['year'] = date_parts[0].astype(int)
    data['month'] = date_parts[1].astype(int)
    data['day'] = date_parts[2].astype(int)

    # PromoInterval is a string and cannot be modelled directly; turn it into a
    # 0/1 flag saying whether the row's month appears in the interval string.
    # This is a substring test, exactly like the original lambda. Any non-string
    # value (the 0 the original checked for, or a NaN left unfilled) yields 0
    # instead of raising a TypeError.
    data['monthstr'] = data['month'].map(month2str)
    data['isPromoMonth'] = [
        int(isinstance(interval, str) and month in interval)
        for month, interval in zip(data['monthstr'], data['PromoInterval'])
    ]

    # Convert the categorical letter codes to numbers. The result is assigned
    # back to the column: `data[col].replace(..., inplace=True)` is chained
    # assignment, which is deprecated and does nothing under pandas copy-on-write.
    # pd.to_numeric guarantees a numeric dtype and raises if an unmapped string
    # is left in the column.
    for column in ['StoreType', 'Assortment', 'StateHoliday']:
        data[column] = pd.to_numeric(data[column].replace(mappings))

在这里插入图片描述

构建训练数据和测试数据

# Build the modelling frames by removing the columns that are not used as features.
# train additionally loses 'Customers'; test additionally loses 'Id'.
df_train = train.drop(columns=['Date', 'PromoInterval', 'Customers', 'monthstr', 'Open'])
df_test = test.drop(columns=['Date', 'PromoInterval', 'Id', 'Open', 'monthstr'])
display(df_train.shape, df_test.shape)

(844338, 18)
(41088, 17)

# df_train is the historical data: a model is fitted on it and then used to
# predict df_test. It is split in two, one part for fitting and one part for
# validation, so that the quality of the model can actually be judged.

# The first 6 * 7 * 1115 rows are held out for validation.
# NOTE(review): the author describes these rows as the June/July 2015 sales;
# that relies on the row order of df_train - confirm.
holdout_rows = 6 * 7 * 1115
X_train = df_train.iloc[holdout_rows:]   # rows used to fit the model
X_test = df_train.iloc[:holdout_rows]    # rows used for validation

数据属性间的相关性

# Correlation coefficients between the columns of df_train.
# The author keeps every feature because data is limited, i.e. features that
# correlate weakly with the label column (Sales) are not removed.
plt.figure(figsize=(24, 20))
plt.rcParams['font.size'] = 12
corr_matrix = df_train.corr()
sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, cmap='RdYlGn_r')

在这里插入图片描述

提取模型训练的数据

# Extract the data used for model training: first look at the raw Sales distribution.
_ = plt.hist(X_train['Sales'], bins=100) # author's note: not a very standard normal distribution

在这里插入图片描述

# Targets are log1p-transformed; the author notes this makes the distribution
# more regular / closer to normal.
y_train, y_test = (np.log1p(part['Sales']) for part in (X_train, X_test))

# Feature matrices: everything except the Sales label.
X_train = X_train.drop(columns='Sales')  # X_train / y_train are used to fit the model
X_test = X_test.drop(columns='Sales')    # X_test / y_test are used for validation

_ = plt.hist(y_train, bins=100)  # author's note: a fairly standard normal distribution

在这里插入图片描述

构建模型

%%time
# Model training


def rmspe(y, yhat):
    """Return the root mean squared percentage error of yhat against y.

    NOTE(review): the article text never shows this helper although it is
    called below; this is the usual RMSPE formula, sqrt(mean((yhat / y - 1)^2)).
    y must not contain zeros (rows with Sales <= 0 were filtered out earlier).
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    return np.sqrt(np.mean((yhat / y - 1) ** 2))


def rmspe_xg(yhat, dmatrix):
    """Custom XGBoost metric: RMSPE measured on the original sales scale.

    The model is trained on log1p(Sales), so both the predictions and the
    labels stored in the DMatrix are mapped back with expm1 before scoring.
    Returns the (name, value) pair XGBoost expects from a custom metric.
    """
    y = np.expm1(dmatrix.get_label())
    yhat = np.expm1(yhat)
    return 'rmspe', rmspe(y, yhat)


params = {'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear'
          'booster': 'gbtree',
          'eta': 0.03,
          'max_depth': 10,
          'subsample': 0.9,
          'colsample_bytree': 0.7,
          'verbosity': 0,  # replaces the deprecated 'silent': 1
          'seed': 10}
num_boost_round = 6000
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)  # held-out validation data
print('模型训练开始...')
evals = [(dtrain, 'train'), (dtest, 'validation')]
gbm = xgb.train(params,  # model parameters
                dtrain,  # training data
                num_boost_round,  # number of boosting rounds (trees)
                evals=evals,  # data sets scored after every round
                early_stopping_rounds=100,  # stop once the score stops improving
                # `custom_metric` supersedes the deprecated `feval` argument
                # (needs a reasonably recent xgboost - TODO confirm version).
                custom_metric=rmspe_xg,
                verbose_eval=True  # print the scores of every round
                )

在这里插入图片描述

保存模型

# Save the trained booster to a JSON file in the working directory.
gbm.save_model('./train_model.json')

模型评估

# Model evaluation on the held-out validation rows.
print('验证数据表现: ')
# Order features and labels by index so that rows and predictions line up.
X_test = X_test.sort_index()
y_test = y_test.sort_index()

# Predict with the trained model (predictions are on the log1p scale).
validation_matrix = xgb.DMatrix(X_test)
yhat = gbm.predict(validation_matrix)

# Score on the original sales scale by undoing the log1p transform.
actual_sales = np.expm1(y_test)
predicted_sales = np.expm1(yhat)
error = rmspe(actual_sales, predicted_sales)

print('RMSPE:', error)

验证数据表现:
RMSPE: 0.021367080981625878

画图查看模型评估结果

# Assemble a frame for inspecting the validation results.
# NOTE(review): y_test and yhat are the log1p-scale values produced earlier, so
# 'Sales', 'Prediction' and the ratios below are on the log scale, not in raw
# sales units - confirm this is intended.
res = y_test.to_frame()      # true values
res['Prediction'] = yhat     # predicted values
res = X_test.merge(res, left_index=True, right_index=True)
prediction, actual = res['Prediction'], res['Sales']
res['Ratio'] = prediction / actual       # prediction relative to the true value
res['Error'] = abs(1 - res['Ratio'])     # relative error
res['wight'] = actual / prediction       # true value relative to the prediction
display(res.head())

在这里插入图片描述

数据可视化

# Data visualisation: list the fonts matplotlib can find, so that a font able
# to render the Chinese plot titles can be picked.
from matplotlib import font_manager
fm = font_manager.FontManager()
for font in fm.ttflist:
    # Font entries expose the family name as `.name`; `.n` does not exist and
    # raised AttributeError.
    print(font.name)

在这里插入图片描述

# Use a font family that can render the Chinese titles below.
plt.rcParams['font.family'] = 'FangSong'
col_1 = ['Sales','Prediction']
col_2 = ['Ratio']
# Pick three store ids at random (1..1115) for plotting.
shops = np.random.randint(1,1116,size=3)
print('全部商店预测值和真实值的比率是%0.3f' %(res['Ratio'].mean()))

for shop in shops:
    shop_rows = res.loc[res['Store'] == shop]
    # One chart comparing prediction and true value, one chart for their ratio.
    shop_rows[col_1].plot(title = '商店编码:%d 预测数据和真实销量对比' %(shop), figsize = (12,4))
    shop_rows[col_2].plot(title = '商店编码:%d 预测数据和真实销量比率' %(shop), figsize = (12,4))

全部商店预测值和真实值的比率是1.002
在这里插入图片描述

# Rows with the largest deviation first (sorted by the 'Error' column, descending).
res.sort_values(by = ['Error'], ascending=False)

在这里插入图片描述

# Global bias correction: scan 20 weights from 0.990 to 1.009, scale the
# predictions by each one and keep the RMSPE obtained.
# Note that the weight multiplies yhat before expm1 is applied.
weights = [0.99 + step / 1000 for step in range(20)]
errors = pd.Series(
    [rmspe(np.expm1(y_test), np.expm1(yhat * w)) for w in weights],
    index=weights,
)

plt.figure(figsize=(9,6))
errors.plot()
plt.rcParams['font.sans-serif']=['SimHei']
plt.xlabel('权重系数',fontsize = 18)
plt.ylabel('均方根百分比误差',fontsize = 18)
# Position of the smallest error, then the error and weight at that position.
index = errors.argmin()
print('最佳偏差校正的权重是',index, errors.iloc[index],weights[index])

最佳偏差校正的权重是 8 0.0035742695664629596 0.998

在这里插入图片描述

使用算法对测试数据进行预测

# Predict the test set with the trained model; the result is the Kaggle submission.
y_pred = gbm.predict(xgb.DMatrix(df_test))

# Take the ids from the test frame instead of hard-coding 1..41088, so every
# prediction is paired with its own row id whatever the size of the test set.
# The header is 'Id', matching the column name of the test data.
# Predictions are on the log1p scale, so expm1 maps them back to sales.
result = pd.DataFrame({'Id': test['Id'].to_numpy(), 'Sales': np.expm1(y_pred)})

result.to_csv('./result_1.csv', index=False)