当前位置：首页 > article >正文

融合模型VotingRegressor 在线性数据上的比对与应用

article 2025/2/21 3:03:48

概述

本文主要对比并验证融合模型与传统单一模型在线性数据上的表现。
数据为汽车价格的预测，附件可下载。
数据链接：https://www.kaggle.com/competitions/playground-series-s4e9/overview

代码详情

加载

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import os

# Work from the folder that holds the competition CSV files. A raw string keeps
# the Windows backslashes literal; '\P' is an invalid escape sequence otherwise.
os.chdir(r'E:\Python code\汽车价格')
train = pd.read_csv('train.csv')
# The preprocessing steps further down transform `test` as well, so it has to
# be loaded here (it was referenced but never defined).
test = pd.read_csv('test.csv')

# Column dtypes and non-null counts of the training set.
train.info()

在这里插入图片描述

# Preview the first rows of the training data.
train.head()

数据预处理

# Drop the row identifier column from both frames.
train = train.drop(columns="id")
test = test.drop(columns="id")

# clean_title: a missing value counts as 'No'; Yes/No is then coded as 0/1.
title_codes = {'Yes': 0, 'No': 1}
train['clean_title'] = train['clean_title'].fillna('No').map(title_codes)
test['clean_title'] = test['clean_title'].fillna('No').map(title_codes)

# accident: a missing value counts as 'None reported'; then coded as 0/1.
accident_codes = {'None reported': 0, 'At least 1 accident or damage reported': 1}
train['accident'] = train['accident'].fillna('None reported').map(accident_codes)
test['accident'] = test['accident'].fillna('None reported').map(accident_codes)

# Distinct brands and how many there are.
brand_values = train['brand'].unique()
print(brand_values)
print(len(brand_values))

# Mean price per brand, rounded to whole numbers and sorted ascending, as a
# rough view of which brands command a premium.
grouped_mean = (
    train.groupby('brand')['price']
    .mean()
    .round()
    .astype(int)
    .reset_index()
    .sort_values(by='price')
)

print(grouped_mean)

高端品牌对价格的影响

# Brands treated as high-end; rows of these brands get car_brand = 1, others 0.
car_brand = ['Aston','Ferrari','McLaren','Bentley','Rolls-Royce','Lamborghini','Bugatti']

# Vectorized membership test instead of a per-row comprehension.
train['car_brand'] = train['brand'].isin(car_brand).astype('int64')
test['car_brand'] = test['brand'].isin(car_brand).astype('int64')

查看车型（model）的详细参数

# Distinct model names and how many there are.
model_values = train['model'].unique()
print(model_values)
print(len(model_values))

# The ten most frequent model names.
print(train['model'].value_counts().head(10))

在这里插入图片描述

查看出现的关键词的频率

from wordcloud import WordCloud #文本展示，按照数量
import  matplotlib.pyplot as plt

# Join every model name into one space-separated text for the word cloud.
positive_text = ' '.join(train['model'])

# Build the word cloud (800 wide, 400 high, white background) and draw it
# without axes.
wordcloud_positive = WordCloud(width=800, height=400, background_color='white').generate(positive_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_positive, interpolation='bilinear')
plt.title('Word Cloud')
plt.axis('off')
plt.show()

在这里插入图片描述

# Binary keyword indicators derived from the free-text `model` column.
# Train and test are built by the same vectorized call so the two frames
# cannot drift apart. `regex=False` treats each keyword as a literal substring,
# and `na=False` maps a missing model name to 0 instead of raising.
# NOTE(review): 'plus' is lowercase while every other keyword is capitalized;
# it is kept unchanged here -- confirm it matches the intended trim names.
model_keywords = {
    'model_Base': 'Base',
    'model_sport': 'Sport',
    'model_plus': 'plus',
    'model_Premium': 'Premium',
    'model_gt': 'GT',
    'model_T': '0T',
}

for frame in (train, test):
    for column, keyword in model_keywords.items():
        frame[column] = frame['model'].str.contains(keyword, regex=False, na=False).astype(int)

年份对价格的影响

# Vehicle age in years, measured against the fixed reference year 2024.
train['year'] =2024-train['model_year'] 
test['year'] =2024-test['model_year'] 

# Row-level helper that substitutes a placeholder for an absent clean_title.
def fill_clean_title(row):
    """Return the row's ``clean_title``, or ``'missing'`` when that value is NA."""
    title = row['clean_title']
    return 'missing' if pd.isna(title) else title

# Run fill_clean_title over every row (axis=1) and store the result back.
# NOTE(review): clean_title was already filled with 'No' and mapped to 0/1
# earlier in this script, so this pass presumably only touches values the
# mapping left as NaN -- confirm it is still needed.
train['clean_title'] = train.apply(fill_clean_title, axis=1)
test['clean_title'] = test.apply(fill_clean_title, axis=1)

数据提取

def extract_data_from_engine(df):
    r"""Add numeric engine features parsed from the text column ``engine``.

    Three columns are written onto ``df``, which is modified in place and also
    returned:

    * ``horsepower``  -- decimal number directly followed by ``HP``,
      pattern ``(\d+\.\d+)(?=HP)``.
    * ``engine_size`` -- decimal number directly followed by ``L``,
      pattern ``(\d+\.\d+)(?=L)``.
    * ``cylinders``   -- integer followed by one whitespace character and then
      ``Cylinder``, ``V<digit>`` or ``Straight``.

    Regex notes: ``\d+`` is one or more digits, ``\.`` a literal dot, ``\s`` one
    whitespace character, ``V\d`` the letter V plus exactly one digit, and
    ``(?=...)`` a lookahead that must match but is not part of the capture.

    Rows where a pattern does not match, or where ``engine`` is missing, get
    NaN in the corresponding column.

    Args:
        df: DataFrame with a string column named ``engine``.

    Returns:
        The same DataFrame object with the three columns added.
    """
    engine = df['engine']
    # expand=False returns a Series for a single capture group.
    df['horsepower'] = engine.str.extract(r'(\d+\.\d+)(?=HP)', expand=False).astype(float)
    df['engine_size'] = engine.str.extract(r'(\d+\.\d+)(?=L)', expand=False).astype(float)
    # Two capture groups give a two-column frame; column 0 holds the count.
    df['cylinders'] = engine.str.extract(r'(\d+)\s(Cylinder|V\d|Straight)')[0].astype(float)
    return df

# Add the horsepower / engine_size / cylinders columns to both frames.
train = extract_data_from_engine(train)
test = extract_data_from_engine(test)

引擎内容的提取

# Infer the fuel type from keywords in the free-text engine description.
def extract_fuel_type(engine_info):
    """Return the fuel type named in ``engine_info``, or NaN when none is found.

    Keywords are checked in a fixed priority order, so a description that
    mentions several of them resolves to the earliest entry. A missing
    ``engine_info`` also yields NaN.
    """
    if pd.isna(engine_info):
        return np.nan

    keyword_priority = (
        (('Gasoline',), 'Gasoline'),
        (('Hybrid',), 'Hybrid'),
        (('Flex Fuel', 'E85'), 'Flex Fuel'),
        (('Diesel',), 'Diesel'),
        (('Electric',), 'Electric'),
    )
    for keywords, fuel in keyword_priority:
        if any(word in engine_info for word in keywords):
            return fuel
    return np.nan

# Fill gaps in fuel_type with the value inferred from the engine text.
# The filled column is assigned back explicitly: fillna(inplace=True) called on
# `frame['fuel_type']` acts on an intermediate object, which pandas warns about
# and which leaves the frame unchanged under Copy-on-Write.
train['fuel_type'] = train['fuel_type'].fillna(train['engine'].apply(extract_fuel_type))
test['fuel_type'] = test['fuel_type'].fillna(test['engine'].apply(extract_fuel_type))

变速器

# Transmission is cast to str in both frames so the column holds one type.
train['transmission'] = train['transmission'].astype('str')
test['transmission'] = test['transmission'].astype('str')

# Label encoding (integer codes, not one-hot) of the categorical columns.
categorical_columns = ['fuel_type','transmission','ext_col', 'int_col']
label_encoder = LabelEncoder()

for column in categorical_columns:
    # The codes are learned from the training data only ...
    train[column] = label_encoder.fit_transform(train[column])
    # ... and the same label -> code table is applied to the test set, which
    # was previously left as raw strings. Labels never seen in train get -1.
    codes = {label: code for code, label in enumerate(label_encoder.classes_)}
    test[column] = test[column].map(codes).fillna(-1).astype(int)

# Count the remaining missing values per column.
train.isnull().sum()

# Mean imputation for the engine-derived features. The training mean is
# computed once per column and reused for the test set, which was previously
# left with NaN. The filled column is assigned back instead of relying on
# chained fillna(inplace=True), which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
for column in ['horsepower', 'engine_size', 'cylinders']:
    train_mean = train[column].mean()
    train[column] = train[column].fillna(train_mean)
    test[column] = test[column].fillna(train_mean)

拆分一个后期用于训练的数据

# Model-ready copies without the raw text / year columns that the derived
# features above were built from.
train_2 = train.drop(['brand','model','model_year','engine'],axis=1)
test_2 = test.drop(['brand','model','model_year','engine'],axis=1)

# Columns to bring onto a common scale.
std_list = ['milage','year','horsepower']

# Standardize each column with a scaler fitted on the training data only, then
# apply the same fitted scaler to the test data.
for i in std_list:
    print(i)
    scaler = StandardScaler()
    scaler.fit(train_2[[i]])

    train_2[i] = scaler.transform(train_2[[i]])
    test_2[i] = scaler.transform(test_2[[i]])

# Preview the scaled training data. These lines were indented by one space,
# which is an IndentationError at module level; they now start at column 0.
train_2.head()

查看数据的分布情况 EDA

import seaborn as sns
import matplotlib.pyplot as plt

查看维度对于价格的影响与变化

fig = plt.figure(figsize = (12, 4), facecolor = '#fbf5e7')

# One price box plot per binary model-keyword flag, laid out left to right.
flag_palette = {False: "#e0d6f6ff", True: "#ea9999ff"}
flag_columns = ['model_T', 'model_gt', 'model_Base', 'model_sport', 'model_plus', 'model_Premium']
for position, flag in enumerate(flag_columns, start=1):
    plt.subplot(1, len(flag_columns), position)
    plt.title(flag)
    sns.boxplot(x=flag, y='price', data=train_2, palette=flag_palette)

# Let matplotlib fit the subplots inside the figure area, then render.
plt.tight_layout()
plt.show()

在这里插入图片描述

去除高端品牌的影响

fig = plt.figure(figsize = (12, 4), facecolor = '#fbf5e7')

# Same six box plots as for the full data, restricted to rows whose
# car_brand flag is 0 (not in the high-end brand list).
non_luxury = train_2[train_2['car_brand'] == 0]
flag_palette = {False: "#e0d6f6ff", True: "#ea9999ff"}
flag_columns = ['model_T', 'model_gt', 'model_Base', 'model_sport', 'model_plus', 'model_Premium']
for position, flag in enumerate(flag_columns, start=1):
    plt.subplot(1, len(flag_columns), position)
    plt.title(flag)
    sns.boxplot(x=flag, y='price', data=non_luxury, palette=flag_palette)

# Let matplotlib fit the subplots inside the figure area, then render.
plt.tight_layout()
plt.show()

在这里插入图片描述

# Compare the shape of the full data with the shape of the subset whose
# car_brand flag is 0, to see how large the high-end share is.
print(train_2.shape)

print(train_2[train_2['car_brand'] == 0].shape)

车龄的影响

# Mean price (truncated to int) for each vehicle age.
df_grop = train.groupby('year')[['price']].mean().astype(int)

fig = plt.figure(figsize=(6, 4), facecolor='#fbf5e7')
plt.title('Year making and  price')
plt.plot(df_grop.index, df_grop['price'], color='#021841ff')

plt.tight_layout()
plt.show()

在这里插入图片描述
从数据中可以看出，其中有一部分车辆属于老爷车的情况。

马力的影响

# Mean price (truncated to int) for each distinct horsepower value.
df_grop = train.groupby('horsepower')[['price']].mean().astype(int)

fig = plt.figure(figsize=(6, 4), facecolor='#fbf5e7')
plt.title('horsepower and  price')
plt.plot(df_grop.index, df_grop['price'], color='#021841ff')

plt.tight_layout()
plt.show()

在这里插入图片描述
从数据来看，大马力对价格还是有比较明显的正向影响。

引擎的排量影响

# Mean price (truncated to int) for each distinct engine size; both columns
# are cast to float before grouping.
size_price = train_2[['engine_size', 'price']].astype(float)
df_grop = size_price.groupby('engine_size').mean().astype(int)

fig = plt.figure(figsize=(6, 4), facecolor='#fbf5e7')
plt.title('engine_size and  price')
plt.plot(df_grop.index, df_grop['price'], color='#021841ff')

plt.tight_layout()
plt.show()

在这里插入图片描述

从整体的情况来查看

# Pair plot on the first 1000 rows of the scaled training data.
# NOTE(review): this rebinds `test`, so the test-set frame used during
# preprocessing is no longer reachable under that name (test_2 is unaffected).
# The name is kept because the correlation heatmap further down reads it.
test = train_2[:1000].copy()
sns.pairplot(test[['price', 'milage', 'year', 'engine_size', 'cylinders']],kind='scatter')
plt.suptitle('Pair Plot of Selected Features Colored by Fuel Type', y=1.02)
# Was the truncated attribute access `plt.sho`, which never displayed the figure.
plt.show()

在这里插入图片描述

# Pairwise correlations of the sampled rows, drawn as an annotated heat map
# and written to heatmap.png in the working directory.
correlations = test.corr()
sns.heatmap(correlations, annot=True, cmap="YlGnBu")
plt.savefig("./heatmap.png")

在这里插入图片描述

模型选择与训练

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression ,Lasso,LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
import catboost as cat
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import VotingRegressor #投票回归
import numpy as np
from catboost import Pool, CatBoostRegressor

# The following steps are time-consuming, so only the first 40000 rows are used.
row_limit = 40000
y = train_2['price'].iloc[:row_limit]
x = train_2.drop(columns=['price']).iloc[:row_limit]

catboost

# CatBoost baseline configuration.
Cat_model = CatBoostRegressor(
                          iterations=100,# number of boosting iterations (trees)
                          depth=5,# maximum tree depth
                          learning_rate=0.2,
                          loss_function='RMSE',# RMSE is used as the loss function
                          random_seed=22,
                          task_type = 'GPU',# NOTE(review): presumably needs a usable GPU at fit time -- confirm
                          verbose=False # do not print training details (otherwise very noisy)
    
                         )

RFR

# Random forest baseline configuration.
RFR_model = RandomForestRegressor(
                                n_estimators=100,
                                max_depth=5,
                                min_samples_split=50,# default 2; minimum number of samples an internal (non-leaf) node must contain
                                min_samples_leaf=30,# default 1; minimum number of samples each leaf node must contain
                                bootstrap=True,# whether to sample with replacement
                                random_state=22,# random seed
                                verbose=0,# (default=0) whether to show task progress
)

svr

# Support vector regression baseline configuration.
SVR_model = SVR(
                kernel='sigmoid',# kernel type used by the algorithm; options: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
                gamma=0.2, # kernel coefficient, used when the kernel is 'rbf', 'poly' or 'sigmoid'
                C=10.0,# penalty coefficient
                tol = 0.001# convergence tolerance
)

xgb

# XGBoost baseline configuration using the linear booster.
XGB_model = xgb.XGBRegressor(
                            booster = 'gblinear',
                            learning_rate=0.2,
                            n_estimators = 100,
                            eval_metric = 'mae',# evaluation metric for validation data; mae = mean absolute error
                            #max_depth = 5 , 
                            random_state = 22
)

lgb

# LightGBM baseline configuration.
LGB_model = LGBMRegressor(
                            learning_rate = 0.2,
                            max_depth = 5,
                            num_leaves = 50,
                            n_estimators = 100,
                            metric = 'mse',
)

# (display name, estimator) pairs that the cross-validation loops iterate over.
models = [
    ('Linear Regression',LinearRegression()),
    ('RandomForestRegressor',RFR_model),
    ('SVR',SVR_model),
    ('XGBRegressor',XGB_model),
    ('CatBoostRegressor',Cat_model),
    ('LGBMRegressor',LGB_model)
    
]

# 5-fold cross-validation of every candidate, printing the per-fold scores.
for modelname,model in models:
    k_corss = cross_val_score(model, x, y, cv=5)# default scorer (R2)
    print(modelname,':',k_corss)

from sklearn.model_selection import cross_validate

k_folds = 5

# Per-model k-fold report: RMSE of every fold, mean RMSE and mean R2.
for name, model in models:
    # One cross_validate call scores both metrics on the same fitted folds,
    # instead of fitting every fold twice through two cross_val_score calls.
    cv_result = cross_validate(
        model, x, y,
        scoring=('neg_mean_squared_error', 'r2'),
        cv=k_folds,
    )
    # neg_mean_squared_error is the negated MSE, so flip the sign before the root.
    rmse_scores = np.sqrt(-cv_result['test_neg_mean_squared_error'])
    mean_rmse = np.mean(rmse_scores)
    mse_r = np.mean(cv_result['test_r2'])

    print(f'{name}的RMSE: {rmse_scores}')
    print('\n')
    print(f'{name}的平均 RMSE: {mean_rmse}')
    print('*'*20,'\n\n')

    # This value is the mean R2; it used to be printed under the label "RMSE".
    print(f'{name}的平均 R2: {mse_r}')
    print('*'*20,'\n\n')

模型融合 -投票回归

# Voting ensemble built from four of the baseline models.
voting_model = VotingRegressor(
    estimators=[
        ('Linear Regression', LinearRegression()),
        ('XGBRegressor',XGB_model),
        ('CatBoostRegressor',Cat_model),
        ('LGBMRegressor',LGB_model)
    ]
)

# 5-fold cross-validated RMSE of the ensemble.
mse_scores = cross_val_score(voting_model, x, y, scoring='neg_mean_squared_error', cv=5)

# neg_mean_squared_error is the negated MSE, so flip the sign before the root.
rmse_scores = np.sqrt(-mse_scores)
mean_rmse = np.mean(rmse_scores)

# Label the output explicitly: `name` is a leftover loop variable from the
# per-model report and would attribute these scores to the last single model.
voting_name = 'VotingRegressor'
print(f'{voting_name}的RMSE: {rmse_scores}')
print(f'{voting_name}的平均 RMSE: {mean_rmse}')

在这里插入图片描述

最终模型调整

# Tuned XGBoost configuration: lower learning rate, more rounds, L1/L2 penalties.
XGB_model_v2 = xgb.XGBRegressor(
                            booster = 'gblinear',
                            learning_rate=0.02,
                            n_estimators = 400,
                            eval_metric = 'mae',# evaluation metric for validation data; mae = mean absolute error
                            #max_depth = 8 , 
                            random_state = 22,
                            reg_lambda = 2,# weight of the L2 regularization term; larger makes the model more conservative
                            reg_alpha = 2,# weight of the L1 regularization term; larger makes the model more conservative
                            #subsample = 0.9,# fraction of samples drawn at random for each tree
                            #colsample_bytree = 0.9,# fraction of features drawn at random for each tree
                            #gamma = 10,# minimum loss reduction to split a node, default 0; larger is more conservative and guards against overfitting
)

# Tuned CatBoost configuration: deeper trees, lower learning rate, more iterations.
Cat_model_v2 = CatBoostRegressor(
                          iterations=400,# number of boosting iterations (trees)
                          depth=8,# maximum tree depth
                          learning_rate=0.02,
                          loss_function='RMSE',# RMSE is used as the loss function
                          random_seed=22,
                          task_type = 'GPU',# train on GPU
                          verbose=False # do not print training details (otherwise very noisy)
    
                         )


# Tuned LightGBM configuration.
LGB_model_v2 = LGBMRegressor(
                            boosting_type = 'gbdt',
                            learning_rate = 0.02,
                            max_depth = 8,# maximum tree depth
                            num_leaves = 40,# maximum leaves per tree; more leaves means a more complex model
                            n_estimators = 400,# number of boosted trees
                            min_child_samples = 50,# minimum number of samples in a leaf
                            metric = 'rmse',
                            subsample = 0.9,# fraction of samples used for training; NOTE(review): no subsample_freq is set -- confirm row sampling is actually active
                            #colsample_bytree = 1,
                            #feature_fraction = 0.9,
                            #bagging_fraction = 0.9,
                            reg_alpha = 5,# L1 regularization coefficient
                            reg_lambda = 5,# L2 regularization coefficient
    
)

# Voting ensemble rebuilt from the tuned (v2) models plus linear regression.
voting_model = VotingRegressor(
    estimators=[
        ('Linear Regression', LinearRegression()),
        ('XGBRegressor',XGB_model_v2),
        ('CatBoostRegressor',Cat_model_v2),
        ('LGBMRegressor',LGB_model_v2)
    ]
)

# 5-fold cross-validated negated MSE of the tuned ensemble.
mse_scores = cross_val_score(voting_model, x, y, scoring='neg_mean_squared_error', cv=5)

# Flip the sign of the negated MSE before taking the root.
rmse_scores = np.sqrt(-mse_scores)
mean_rmse = np.mean(rmse_scores)

# Per-fold RMSE and its mean.
print(f'的RMSE: {rmse_scores}')
print(f'的平均 RMSE: {mean_rmse}')

在这里插入图片描述
有一定的提升

全量数据

# Hold out 20% of all rows (no 40000-row cap here) for a final check.
features = train_2.drop(columns=['price'])
target = train_2['price']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=22
)

# Fit the voting ensemble on the training part and predict the held-out part.
model = voting_model.fit(x_train, y_train)
model_prodict = model.predict(x_test)

# Renumber the held-out target 0..n-1.
y_test.index = range(len(y_test))

def rmse(arr1, arr2):
    """Return ``(MSE, RMSE, MAPE)`` between two equally shaped arrays.

    Both inputs are converted to float arrays first, so integer inputs cannot
    overflow when the differences are squared, and plain sequences are accepted.

    Args:
        arr1: Array-like of values. It is also the denominator of the
            percentage error, so pass the reference (actual) values here to
            get a conventional MAPE.
        arr2: Array-like of values with the same shape as ``arr1``.

    Returns:
        Tuple ``(mean squared error, root mean squared error, mean absolute
        percentage error)``. The percentage error is a fraction (not
        multiplied by 100) and is inf/NaN when ``arr1`` contains zeros.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    arr1 = np.asarray(arr1, dtype=float)
    arr2 = np.asarray(arr2, dtype=float)

    # Element-wise metrics only make sense for identically shaped inputs.
    if arr1.shape != arr2.shape:
        raise ValueError("两个数组必须有相同的形状。")

    errors = arr1 - arr2

    # Mean of the squared differences, and its square root.
    mean_square_diffs = np.mean(errors ** 2)
    root_mean_square_diffs = np.sqrt(mean_square_diffs)

    # Mean absolute error relative to arr1.
    mape = np.mean(np.abs(errors / arr1))

    return mean_square_diffs,root_mean_square_diffs,mape


# Score the hold-out predictions. The actual prices go first because rmse()
# divides by its first argument when computing MAPE, and the predictions are
# passed unrounded (astype(int) truncated them before scoring).
result_mse,result_rms,result_mape = rmse(np.array(y_test), model_prodict)
print("MSE:", result_mse)
print("RMSE:", result_rms)
print("MAPE:", result_mape)