融合模型VotingRegressor 在线性数据上的比对与应用
概述
本文主要对比融合模型与传统单一模型在线性数据上的表现,并加以验证。
数据为汽车价格的预测,附件可下载。
数据链接:https://www.kaggle.com/competitions/playground-series-s4e9/overview
代码详情
加载
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import os
# Switch to the folder that holds the competition CSV files.
# Raw string, so the backslashes of the Windows path are never read as escapes.
os.chdir(r'E:\Python code\汽车价格')
train = pd.read_csv('train.csv')
# The preprocessing steps below transform `test` next to `train`, so it has to be
# loaded here as well (assumes the competition's test.csv is in the same folder).
test = pd.read_csv('test.csv')
# Column dtypes and non-null counts
train.info()
# Preview the first rows
train.head()
数据预处理
# Drop the redundant row identifier from both data sets.
train = train.drop(columns="id")
test = test.drop(columns="id")

# A missing `clean_title` is treated as 'No'; the text is then replaced by a number.
clean_title_codes = {'Yes': 0, 'No': 1}
train['clean_title'] = train['clean_title'].fillna('No').map(clean_title_codes)
test['clean_title'] = test['clean_title'].fillna('No').map(clean_title_codes)

# A missing `accident` is treated as 'None reported'; the text is then replaced by a number.
accident_codes = {'None reported': 0, 'At least 1 accident or damage reported': 1}
train['accident'] = train['accident'].fillna('None reported').map(accident_codes)
test['accident'] = test['accident'].fillna('None reported').map(accident_codes)
# Which brands occur, and how many distinct ones are there?
brand_names = train['brand'].unique()
print(brand_names)
print(len(brand_names))

# Average price per brand, rounded to whole numbers and sorted ascending. This
# shows which brands sit at the top of the price range.
grouped_mean = (
    train.groupby('brand')['price']
    .mean()
    .round()
    .astype(int)
    .reset_index()
    .sort_values(by='price')
)
print(grouped_mean)
高端品牌对价格的影响
# Brands flagged as high-end
car_brand = ['Aston','Ferrari','McLaren','Bentley','Rolls-Royce','Lamborghini','Bugatti']
# 1 when the row's brand is in the high-end list, otherwise 0
train['car_brand'] = train['brand'].isin(car_brand).astype('int64')
test['car_brand'] = test['brand'].isin(car_brand).astype('int64')
查看车型(model)的详细信息
# Distinct model names and how many of them there are
model_names = train['model'].unique()
print(model_names)
print(len(model_names))
# The ten most frequent model names
print(train['model'].value_counts().head(10))
查看出现的关键词的频率
from wordcloud import WordCloud #文本展示,按照数量
import matplotlib.pyplot as plt
# Join every model name into one text so that the cloud reflects word frequency.
positive_text = ' '.join(train['model'])
# White 800x400 word cloud built from that text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud_positive = cloud_builder.generate(positive_text)
# Show the cloud as an image without axis ticks
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud_positive, interpolation='bilinear')
cloud_ax.set_title('Word Cloud')
cloud_ax.axis('off')
plt.show()
# Keyword flags derived from the model name: column name -> (keyword, case sensitive?).
model_keywords = {
    'model_Base': ('Base', True),
    'model_sport': ('Sport', True),
    # Every other keyword is capitalised; the lowercase 'plus' is matched ignoring
    # case so that the capitalised spelling 'Plus' is caught as well.
    'model_plus': ('plus', False),
    'model_Premium': ('Premium', True),
    'model_gt': ('GT', True),
    'model_T': ('0T', True),
}
# Build the flags the same way for both data sets.
for frame in (train, test):
    for feature, (keyword, case_sensitive) in model_keywords.items():
        # Literal (non-regex) match; a missing model name counts as "keyword absent".
        frame[feature] = frame['model'].str.contains(
            keyword, case=case_sensitive, regex=False, na=False
        ).astype(int)
年份对价格的影响
# Vehicle age in years, counted back from the reference year 2024.
REFERENCE_YEAR = 2024
train['year'] = REFERENCE_YEAR - train['model_year']
test['year'] = REFERENCE_YEAR - test['model_year']
# Is the clean_title field filled in?
def fill_clean_title(row):
    """Return the row's ``clean_title``, or ``'missing'`` when it is NaN.

    Args:
        row: Mapping-like record (for example a DataFrame row) that has a
            ``clean_title`` entry.

    Returns:
        The original ``clean_title`` value, or the string ``'missing'`` when
        that value is missing.
    """
    if pd.isna(row['clean_title']):
        return 'missing'
    return row['clean_title']
# Mark rows whose clean_title is still missing. A column-wise fillna produces the
# same values as calling fill_clean_title on every row, without one Python call
# per row.
train['clean_title'] = train['clean_title'].fillna('missing')
test['clean_title'] = test['clean_title'].fillna('missing')
数据提取
def extract_data_from_engine(df):
    r"""Parse horsepower, displacement and cylinder count out of ``df['engine']``.

    Patterns (applied with ``Series.str.extract``):

    * ``(\d+\.\d+)(?=HP)`` - a decimal number immediately followed by ``HP``;
      the look-ahead keeps ``HP`` out of the captured text.
    * ``(\d+\.\d+)(?=L)`` - a decimal number immediately followed by ``L``.
    * ``(\d+)\s(Cylinder|V\d|Straight)`` - an integer, one whitespace character,
      then ``Cylinder``, ``V<digit>`` or ``Straight``; only the integer is kept.

    Rows without a match get NaN. The frame is modified in place and returned.

    Args:
        df: DataFrame with a text column ``engine``.

    Returns:
        The same DataFrame with the float columns ``horsepower``,
        ``engine_size`` and ``cylinders`` added.
    """
    engine = df['engine']
    # expand=False returns the single capture group as a Series.
    df['horsepower'] = engine.str.extract(r'(\d+\.\d+)(?=HP)', expand=False).astype(float)
    df['engine_size'] = engine.str.extract(r'(\d+\.\d+)(?=L)', expand=False).astype(float)
    # Two capture groups give a two-column frame; column 0 is the leading number.
    df['cylinders'] = engine.str.extract(r'(\d+)\s(Cylinder|V\d|Straight)')[0].astype(float)
    return df
# Add the parsed engine columns to both data sets.
train, test = (extract_data_from_engine(frame) for frame in (train, test))
引擎内容的提取
# Derive the fuel type from the free-text engine description.
def extract_fuel_type(engine_info):
    """Return the fuel type named in an engine description.

    The keywords are checked in a fixed order and the first hit wins, so a
    text that contains several keywords is classified by the earliest rule.

    Args:
        engine_info: Engine description text, or a missing value.

    Returns:
        ``'Gasoline'``, ``'Hybrid'``, ``'Flex Fuel'``, ``'Diesel'`` or
        ``'Electric'``; ``np.nan`` when the input is missing or contains none
        of the keywords.
    """
    if pd.isna(engine_info):
        return np.nan
    # (result, keywords that trigger it), in order of precedence.
    fuel_rules = (
        ('Gasoline', ('Gasoline',)),
        ('Hybrid', ('Hybrid',)),
        ('Flex Fuel', ('Flex Fuel', 'E85')),
        ('Diesel', ('Diesel',)),
        ('Electric', ('Electric',)),
    )
    for fuel, keywords in fuel_rules:
        if any(keyword in engine_info for keyword in keywords):
            return fuel
    return np.nan
# Fill gaps in fuel_type with the value inferred from the engine text.
# The result is assigned back to the column: an in-place fillna on
# `frame['fuel_type']` acts on a temporary object under pandas Copy-on-Write and
# would leave the frame unchanged.
for frame in (train, test):
    inferred_fuel = frame['engine'].apply(extract_fuel_type)
    frame['fuel_type'] = frame['fuel_type'].fillna(inferred_fuel)
变速器
# Transmission is handled as text in both data sets.
train['transmission'] = train['transmission'].astype('str')
test['transmission'] = test['transmission'].astype('str')
# Label encoding (one integer code per category) - this is not one-hot encoding.
categorical_columns = ['fuel_type','transmission','ext_col', 'int_col']
label_encoder = LabelEncoder()
for column in categorical_columns:
    # Fit on the categories of both data sets so that `test` receives the same
    # codes as `train` and a category that only occurs in `test` does not raise.
    label_encoder.fit(pd.concat([train[column], test[column]]))
    train[column] = label_encoder.transform(train[column])
    test[column] = label_encoder.transform(test[column])
# Missing values per column
train.isnull().sum()
# Fill the numeric gaps with the training mean; `test` reuses the training
# statistic. The result is assigned back because an in-place fillna on a column
# selection does not reliably update the frame.
for column in ['horsepower', 'engine_size', 'cylinders']:
    train_mean = train[column].mean()
    train[column] = train[column].fillna(train_mean)
    test[column] = test[column].fillna(train_mean)
拆分一个后期用于训练的数据
# Modelling frames without the raw text/identifier columns
train_2 = train.drop(['brand','model','model_year','engine'],axis=1)
test_2 = test.drop(['brand','model','model_year','engine'],axis=1)
# Bring the continuous columns onto a common scale.
std_list = ['milage','year','horsepower']
for i in std_list:
    print(i)
    # Fit on the training column only and reuse the same statistics for test_2.
    scaler = StandardScaler()
    scaler.fit(train_2[[i]])
    train_2[i] = scaler.transform(train_2[[i]])
    test_2[i] = scaler.transform(test_2[[i]])
# Preview the result
train_2.head()
查看数据的分布情况 EDA
import seaborn as sns
import matplotlib.pyplot as plt
查看维度对于价格的影响与变化
# One price box plot per model-keyword flag, six panels side by side.
fig = plt.figure(figsize=(12, 4), facecolor='#fbf5e7')
flag_palette = {False: "#e0d6f6ff", True: "#ea9999ff"}
flag_columns = ['model_T', 'model_gt', 'model_Base', 'model_sport', 'model_plus', 'model_Premium']
for panel, flag in enumerate(flag_columns, start=1):
    plt.subplot(1, 6, panel)
    plt.title(flag)
    sns.boxplot(x=flag, y='price', data=train_2, palette=flag_palette)
# Let matplotlib adjust the spacing so the panels fill the figure.
plt.tight_layout()
plt.show()
去除高端品牌的影响
# Same six box plots, restricted to rows that are not flagged as a high-end brand.
mainstream_rows = train_2[train_2['car_brand'] == 0]
fig = plt.figure(figsize=(12, 4), facecolor='#fbf5e7')
mainstream_palette = {False: "#e0d6f6ff", True: "#ea9999ff"}
mainstream_flags = ['model_T', 'model_gt', 'model_Base', 'model_sport', 'model_plus', 'model_Premium']
for panel, flag in enumerate(mainstream_flags, start=1):
    plt.subplot(1, 6, panel)
    plt.title(flag)
    sns.boxplot(x=flag, y='price', data=mainstream_rows, palette=mainstream_palette)
# Let matplotlib adjust the spacing so the panels fill the figure.
plt.tight_layout()
plt.show()
# Row counts with and without the high-end brands
print(train_2.shape)
print(mainstream_rows.shape)
车龄的影响
# Mean price (as whole numbers) per vehicle age
df_grop = train.groupby('year')[['price']].mean().astype(int)
fig, ax = plt.subplots(figsize=(6, 4), facecolor='#fbf5e7')
ax.set_title('Year making and price')
ax.plot(df_grop.index, df_grop['price'], color='#021841ff')
fig.tight_layout()
plt.show()
从数据中可以看出,其中有一部分车辆属于老爷车的情况。
马力的影响
# Mean price (as whole numbers) per horsepower value
df_grop = train.groupby('horsepower')[['price']].mean().astype(int)
fig, ax = plt.subplots(figsize=(6, 4), facecolor='#fbf5e7')
ax.set_title('horsepower and price')
ax.plot(df_grop.index, df_grop['price'], color='#021841ff')
fig.tight_layout()
plt.show()
从数据看,大马力车型的价格表现还是比较好的。
引擎的排量影响
# Mean price (as whole numbers) per engine displacement
size_and_price = train_2[['engine_size', 'price']].astype(float)
df_grop = size_and_price.groupby('engine_size').mean().astype(int)
fig, ax = plt.subplots(figsize=(6, 4), facecolor='#fbf5e7')
ax.set_title('engine_size and price')
ax.plot(df_grop.index, df_grop['price'], color='#021841ff')
fig.tight_layout()
plt.show()
从整体的情况来查看
# Plot only the first 1000 rows. The sample gets its own name so that the `test`
# data set is not overwritten.
sample = train_2[:1000].copy()
sns.pairplot(sample[['price', 'milage', 'year', 'engine_size', 'cylinders']],kind='scatter')
# No hue is passed to pairplot, so the title does not claim any colouring.
plt.suptitle('Pair Plot of Selected Features', y=1.02)
plt.show()
# Pairwise correlation of the sampled columns, saved next to the script
sns.heatmap(sample.corr(), annot=True,cmap="YlGnBu")
plt.savefig("./heatmap.png")
模型选择与训练
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression ,Lasso,LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
import catboost as cat
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import VotingRegressor #投票回归
import numpy as np
from catboost import Pool, CatBoostRegressor
# Training is slow, so only the first 40000 rows are used for model comparison.
SAMPLE_ROWS = 40000
train_subset = train_2.iloc[:SAMPLE_ROWS]
y = train_subset['price']
x = train_subset.drop(columns='price')
catboost
# Baseline CatBoost settings, collected in a dict and unpacked into the constructor.
cat_params = {
    'iterations': 100,        # number of boosting iterations
    'depth': 5,               # tree depth
    'learning_rate': 0.2,
    'loss_function': 'RMSE',  # optimise RMSE
    'random_seed': 22,
    'task_type': 'GPU',       # train on the GPU
    'verbose': False,         # no per-iteration log output
}
Cat_model = CatBoostRegressor(**cat_params)
RFR
# Baseline random-forest settings, collected in a dict and unpacked into the constructor.
rfr_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'min_samples_split': 50,  # minimum samples an internal node needs to be split (default 2)
    'min_samples_leaf': 30,   # minimum samples required in each leaf (default 1)
    'bootstrap': True,        # sample with replacement
    'random_state': 22,       # random seed
    'verbose': 0,             # no progress output (default 0)
}
RFR_model = RandomForestRegressor(**rfr_params)
svr
# Baseline SVR settings, collected in a dict and unpacked into the constructor.
svr_params = {
    'kernel': 'sigmoid',  # one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    'gamma': 0.2,         # kernel coefficient, used by 'rbf', 'poly' and 'sigmoid'
    'C': 10.0,            # penalty coefficient
    'tol': 0.001,         # tolerance of the stopping criterion
}
SVR_model = SVR(**svr_params)
xgb
# Baseline XGBoost settings (linear booster), collected in a dict and unpacked
# into the constructor.
xgb_params = {
    'booster': 'gblinear',
    'learning_rate': 0.2,
    'n_estimators': 100,
    'eval_metric': 'mae',  # evaluation metric: mean absolute error
    'random_state': 22,
}
XGB_model = xgb.XGBRegressor(**xgb_params)
lgb
# Baseline LightGBM settings, collected in a dict and unpacked into the constructor.
lgb_params = {
    'learning_rate': 0.2,
    'max_depth': 5,
    'num_leaves': 50,
    'n_estimators': 100,
    'metric': 'mse',
}
LGB_model = LGBMRegressor(**lgb_params)
from sklearn.model_selection import cross_validate

# Candidate models compared with 5-fold cross-validation.
models = [
    ('Linear Regression', LinearRegression()),
    ('RandomForestRegressor', RFR_model),
    ('SVR', SVR_model),
    ('XGBRegressor', XGB_model),
    ('CatBoostRegressor', Cat_model),
    ('LGBMRegressor', LGB_model),
]
k_folds = 5
# Both metrics come from a single cross-validation run per model, so every model
# is fitted k_folds times instead of once per metric.
cv_scoring = {'r2': 'r2', 'neg_mse': 'neg_mean_squared_error'}
for name, model in models:
    cv_result = cross_validate(model, x, y, scoring=cv_scoring, cv=k_folds)
    r2_scores = cv_result['test_r2']
    # The scorer reports the negated MSE (higher is better); flip the sign
    # before taking the square root.
    rmse_scores = np.sqrt(-cv_result['test_neg_mse'])
    mean_rmse = np.mean(rmse_scores)
    mse_r = np.mean(r2_scores)
    print(name, ':', r2_scores)
    print(f'{name}的RMSE: {rmse_scores}')
    print('\n')
    print(f'{name}的平均 RMSE: {mean_rmse}')
    print('*'*20,'\n\n')
    # This value is the mean R2, so it is labelled as such.
    print(f'{name}的平均 R2: {mse_r}')
    print('*'*20,'\n\n')
模型融合 -投票回归
# Ensemble that averages the predictions of four base regressors.
voting_model = VotingRegressor(
    estimators=[
        ('Linear Regression', LinearRegression()),
        ('XGBRegressor', XGB_model),
        ('CatBoostRegressor', Cat_model),
        ('LGBMRegressor', LGB_model),
    ]
)
mse_scores = cross_val_score(voting_model, x, y, scoring='neg_mean_squared_error', cv=5)
# The scorer returns negated MSE values; flip the sign before the square root.
rmse_scores = np.sqrt(-mse_scores)
mean_rmse = np.mean(rmse_scores)
# Label the output with the ensemble itself; a leftover loop variable would
# print the name of an unrelated model here.
print(f'VotingRegressor的RMSE: {rmse_scores}')
print(f'VotingRegressor的平均 RMSE: {mean_rmse}')
最终模型调整
# Tuned XGBoost settings (linear booster): smaller learning rate, more rounds,
# plus L1/L2 regularisation.
xgb_v2_params = {
    'booster': 'gblinear',
    'learning_rate': 0.02,
    'n_estimators': 400,
    'eval_metric': 'mae',  # evaluation metric: mean absolute error
    'random_state': 22,
    'reg_lambda': 2,       # L2 regularisation weight; larger is more conservative
    'reg_alpha': 2,        # L1 regularisation weight; larger is more conservative
}
XGB_model_v2 = xgb.XGBRegressor(**xgb_v2_params)
# Tuned CatBoost settings: deeper trees, smaller learning rate, more iterations.
cat_v2_params = {
    'iterations': 400,        # number of boosting iterations
    'depth': 8,               # tree depth
    'learning_rate': 0.02,
    'loss_function': 'RMSE',  # optimise RMSE
    'random_seed': 22,
    'task_type': 'GPU',       # train on the GPU
    'verbose': False,         # no per-iteration log output
}
Cat_model_v2 = CatBoostRegressor(**cat_v2_params)
# Tuned LightGBM settings.
LGB_model_v2 = LGBMRegressor(
    boosting_type='gbdt',
    learning_rate=0.02,
    max_depth=8,            # maximum tree depth
    num_leaves=40,          # maximum leaves per tree; more leaves = more complex model
    n_estimators=400,       # number of boosted trees
    min_child_samples=50,   # minimum number of samples in a leaf
    metric='rmse',
    subsample=0.9,          # fraction of rows sampled for training
    # Row subsampling only takes effect with a non-zero frequency; at the
    # default of 0 the `subsample` value above is ignored.
    subsample_freq=1,
    reg_alpha=5,            # L1 regularisation coefficient
    reg_lambda=5,           # L2 regularisation coefficient
)
# Ensemble of the tuned regressors, scored with 5-fold cross-validation.
tuned_estimators = [
    ('Linear Regression', LinearRegression()),
    ('XGBRegressor', XGB_model_v2),
    ('CatBoostRegressor', Cat_model_v2),
    ('LGBMRegressor', LGB_model_v2),
]
voting_model = VotingRegressor(estimators=tuned_estimators)
mse_scores = cross_val_score(
    voting_model, x, y, scoring='neg_mean_squared_error', cv=5
)
# The scorer returns negated MSE values; flip the sign before the square root.
rmse_scores = np.sqrt(-mse_scores)
mean_rmse = np.mean(rmse_scores)
print(f'的RMSE: {rmse_scores}')
print(f'的平均 RMSE: {mean_rmse}')
调整参数后,模型效果有一定的提升。
全量数据
# Hold out 20% of all rows, fit the ensemble on the rest and predict the hold-out.
all_features = train_2.drop(columns='price')
all_prices = train_2['price']
x_train, x_test, y_train, y_test = train_test_split(
    all_features, all_prices, test_size=0.2, random_state=22
)
model = voting_model.fit(x_train, y_train)
model_prodict = model.predict(x_test)
# Renumber the hold-out target 0..n-1.
y_test = y_test.reset_index(drop=True)
def rmse(arr1, arr2):
    """Return ``(MSE, RMSE, MAPE)`` for two equally shaped arrays.

    MAPE is computed relative to ``arr1``, so pass the reference (true) values
    as ``arr1`` and the predictions as ``arr2``. MSE and RMSE do not depend on
    the argument order.

    Args:
        arr1: Array-like of reference values; used as the MAPE denominator.
        arr2: Array-like of values compared against ``arr1``.

    Returns:
        Tuple ``(mean squared error, root mean squared error,
        mean absolute percentage error as a fraction)``.

    Raises:
        ValueError: If the two inputs differ in shape.
    """
    # Work on float arrays so that lists are accepted and integer inputs
    # cannot overflow when squared.
    arr1 = np.asarray(arr1, dtype=float)
    arr2 = np.asarray(arr2, dtype=float)
    if arr1.shape != arr2.shape:
        raise ValueError("两个数组必须有相同的形状。")
    errors = arr1 - arr2
    # Mean of the squared differences (MSE)
    mean_square_diffs = np.mean(errors ** 2)
    # Square root of the MSE (RMSE)
    root_mean_square_diffs = np.sqrt(mean_square_diffs)
    # Mean absolute error relative to arr1 (MAPE)
    mape = np.mean(np.abs(errors / arr1))
    return mean_square_diffs, root_mean_square_diffs, mape
# Error metrics on the hold-out split. The true prices go first because rmse()
# divides by its first argument when computing MAPE; MSE and RMSE are unaffected
# by the argument order.
result_mse,result_rms,result_mape = rmse(np.array(y_test), model_prodict.astype(int))
print("MSE:", result_mse)
print("RMSE:", result_rms)
print("MAPE:", result_mape)
以上是文章的全部内容,感谢阅读。