多种机器学习模型预测房价
大家好,我是带我去滑雪!
本期是房价预测,它是一个连续型的变量,因此它是一个回归问题。所以这是一篇回归问题的全流程的代码。从数据清洗到特征工程到分析可视化,再到模型选择,模型训练,交叉验证,超参数搜索,有效性评估。下面开始代码实战。
目录
一、数据导入
二、划分80%训练集与20%验证集、数据标准化
三、十种机器学习模型构建
(1) 模块导入与模型设定
(2)定义评价指标
(3)训练拟合对比
(4)评价指标输出结果
(5)评价指标可视化
(6)模型交叉验证
(7)交叉验证整体效果
四、搜索超参数
(1)模型设置
(2)绘制变量重要性排序图
五、模型保存
一、数据导入
本次数据通过爬取链家二手房源信息,主要有标题、小区名称、房屋位置、房屋户型、房屋面积、房屋装修情况、有无电梯、楼层位置、附近有无地铁、关注度、房屋总价。
通过数据预处理,将数据转化为:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
# Notebook-export residue: this magic only works when run under IPython/Jupyter,
# not as a plain script.
get_ipython().run_line_magic('matplotlib', 'inline')
# Use the SimHei font so Chinese labels render, and keep the minus sign readable.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Echo every expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import warnings  # NOTE(review): imported but never used to configure any filter
# Load the preprocessed Lianjia listings from a hard-coded local path.
data = pd.read_csv(r'E:\工作\硕士\博客\博客94-多种机器学习模型预测房价\data.csv')
data  # bare expression — only displays the frame inside a notebook
二、划分80%训练集与20%验证集、数据标准化
# Target is the `y1` column; features are every column except the last.
y = data.y1
X = data.iloc[:, :-1]

# Hold out 20% of the rows as a validation set (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardize features using statistics from the training split only,
# so no information from the validation set leaks into the scaler.
# (Duplicate local import of StandardScaler removed — it is already imported
# at the top of the file.)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s = scaler.transform(X_val)

print('训练数据形状:')
print(X_train_s.shape, y_train.shape)
print('验证数据形状:')
# Fix: the original bare tuple expression only renders inside a notebook;
# print() works both in a notebook and as a plain script.
print(X_val_s.shape, y_val.shape)
三、十种机器学习模型构建
(1) 模块导入与模型设定
# Ten candidate regressors, kept in a fixed order that matches `model_name`.
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

model1 = LinearRegression()                     # ordinary least squares
model2 = ElasticNet(alpha=0.05, l1_ratio=0.5)   # elastic-net penalized regression
model3 = KNeighborsRegressor(n_neighbors=10)    # k-nearest neighbours
model4 = DecisionTreeRegressor(random_state=77) # single decision tree
model5 = RandomForestRegressor(n_estimators=500,
                               max_features=int(X_train.shape[1] / 3),
                               random_state=0)  # random forest (p/3 features per split)
model6 = GradientBoostingRegressor(n_estimators=500, random_state=123)
model7 = XGBRegressor(objective='reg:squarederror', n_estimators=1000, random_state=0)
model8 = LGBMRegressor(n_estimators=1000, objective='regression',
                       random_state=0, force_row_wise=True)  # explicit regression objective
model9 = SVR(kernel="rbf")                      # RBF-kernel support vector machine
model10 = MLPRegressor(hidden_layer_sizes=(16, 8), random_state=77, max_iter=10000)

model_list = [model1, model2, model3, model4, model5,
              model6, model7, model8, model9, model10]
model_name = ['线性回归', '惩罚回归', 'K近邻', '决策树', '随机森林',
              '梯度提升', '极端梯度提升', '轻量梯度提升', '支持向量机', '神经网络']
(2)定义评价指标
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error,r2_score
def evaluation(y_test, y_predict):
    """Return (MAE, RMSE, MAPE, R^2) for a set of regression predictions.

    Parameters
    ----------
    y_test : array-like of true target values.
    y_predict : array-like of predicted values, same length as y_test.

    Notes
    -----
    MAPE assumes y_test contains no zeros — TODO confirm for this data set.
    """
    mae = mean_absolute_error(y_test, y_predict)
    mse = mean_squared_error(y_test, y_predict)
    # Fix: derive RMSE from the MSE already computed instead of calling
    # mean_squared_error a second time.
    rmse = np.sqrt(mse)
    mape = (abs(y_predict - y_test) / y_test).mean()
    r_2 = r2_score(y_test, y_predict)
    return mae, rmse, mape, r_2
(3)训练拟合对比
# Fit every candidate model on the standardized training split and record
# its validation-set metrics in one row of `df_eval` per model.
df_eval = pd.DataFrame(columns=['MAE', 'RMSE', 'MAPE', 'R2'])
for est, name in zip(model_list, model_name):
    print(f'{name}正在训练...')
    est.fit(X_train_s, y_train)
    val_pred = est.predict(X_val_s)
    df_eval.loc[name, :] = list(evaluation(y_val, val_pred))
(4)评价指标输出结果
通过评价指标可以看出,十个模型里预测效果最好的是神经网络。总的来看,这些机器学习模型的预测效果均不是很理想。我想,原因可能是我使用的数据质量不太好,另外数据数量也过于少,所以模型的预测效果都不太好。我尝试换了其他数据测试之后,预测效果如下:
可以看见,更换数据量更大的数据之后,模型的预测效果得到了质的提升。但是这个数据不是房价相关的,所以后续还是使用房价的数据,后续有好的数据可以进行替换。
(5)评价指标可视化
# One bar chart per metric in a 2x2 grid; bar colors merely distinguish models.
bar_width = 0.4
colors = ['c', 'b', 'g', 'tomato', 'm', 'y', 'lime', 'k',
          'orange', 'pink', 'grey', 'tan', 'purple']
fig, ax = plt.subplots(2, 2, figsize=(7, 5), dpi=256)
for i, col in enumerate(df_eval.columns):
    plt.subplot(2, 2, i + 1)
    metric = df_eval[col]
    positions = np.arange(len(metric))
    plt.bar(x=positions, height=metric.to_numpy(), width=bar_width, color=colors)
    plt.xticks(range(len(metric)), metric.index, fontsize=8)
    plt.xticks(rotation=40)
    # Render R2 as a proper superscript; other metric names are used verbatim.
    plt.ylabel(r'$R^{2}$' if col == 'R2' else col, fontsize=14)
plt.tight_layout()
plt.savefig(r"E:\工作\硕士\博客\博客94-多种机器学习模型预测房价\评价指标可视化.png",
            bbox_inches="tight", pad_inches=1, transparent=True,
            facecolor="w", edgecolor='w', dpi=300, orientation='landscape')
输出结果:
(6)模型交叉验证
#回归问题交叉验证,使用拟合优度,mae,rmse,mape 作为评价标准
# Cross-validation for regression, scored with R2, MAE, RMSE and MAPE.
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold


def evaluation(y_test, y_predict):
    """Return (MAE, RMSE, MAPE) for one fold's predictions.

    NOTE(review): this intentionally shadows the earlier 4-value
    ``evaluation`` — R^2 is obtained from ``model.score`` inside
    ``cross_val`` instead.
    """
    mae = mean_absolute_error(y_test, y_predict)
    mse = mean_squared_error(y_test, y_predict)
    # Fix: derive RMSE from the MSE already computed instead of calling
    # mean_squared_error a second time.
    rmse = np.sqrt(mse)
    mape = (abs(y_predict - y_test) / y_test).mean()
    return mae, rmse, mape


def evaluation2(lis):
    """Return (mean, std) of a list of per-fold scores."""
    array = np.array(lis)
    return array.mean(), array.std()


def cross_val(model=None, X=None, Y=None, K=5, repeated=1):
    """Run `repeated` rounds of shuffled K-fold cross-validation.

    Parameters
    ----------
    model : estimator with fit/score/predict.
    X, Y : pandas objects holding features and target (indexed via .values).
    K : number of folds per round.
    repeated : number of rounds; round ``n`` uses ``random_state=n``.

    Returns
    -------
    (df_mean, df_std) : DataFrames with one row per round, columns
    ['R2', 'MAE', 'RMSE', 'MAPE'].
    """
    df_mean = pd.DataFrame(columns=['R2', 'MAE', 'RMSE', 'MAPE'])
    df_std = pd.DataFrame(columns=['R2', 'MAE', 'RMSE', 'MAPE'])
    for n in range(repeated):
        print(f'正在进行第{n+1}次重复K折.....随机数种子为{n}\n')
        kf = KFold(n_splits=K, shuffle=True, random_state=n)
        R2 = []; MAE = []; RMSE = []; MAPE = []
        print(f" 开始本次在{K}折数据上的交叉验证.......\n")
        i = 1
        for train_index, test_index in kf.split(X):
            print(f' 正在进行第{i}折的计算')
            X_train = X.values[train_index]
            # Bug fix: the original indexed the *global* `y` here (and below),
            # silently ignoring whatever target was passed in as `Y`.
            y_train = Y.values[train_index]
            X_test = X.values[test_index]
            y_test = Y.values[test_index]
            model.fit(X_train, y_train)
            score = model.score(X_test, y_test)  # estimator's R^2 on this fold
            R2.append(score)
            pred = model.predict(X_test)
            mae, rmse, mape = evaluation(y_test, pred)
            MAE.append(mae)
            RMSE.append(rmse)
            MAPE.append(mape)
            print(f' 第{i}折的拟合优度为:{round(score,4)},MAE为{round(mae,4)},RMSE为{round(rmse,4)},MAPE为{round(mape,4)}')
            i += 1
        print(f' ———————————————完成本次的{K}折交叉验证———————————————————\n')
        R2_mean, R2_std = evaluation2(R2)
        MAE_mean, MAE_std = evaluation2(MAE)
        RMSE_mean, RMSE_std = evaluation2(RMSE)
        MAPE_mean, MAPE_std = evaluation2(MAPE)
        print(f'第{n+1}次重复K折,本次{K}折交叉验证的总体拟合优度均值为{R2_mean},方差为{R2_std}')
        print(f' 总体MAE均值为{MAE_mean},方差为{MAE_std}')
        print(f' 总体RMSE均值为{RMSE_mean},方差为{RMSE_std}')
        print(f' 总体MAPE均值为{MAPE_mean},方差为{MAPE_std}')
        print("\n====================================================================================================================\n")
        # Append this round's summary statistics as row `n` of each frame.
        df1 = pd.DataFrame(dict(zip(['R2', 'MAE', 'RMSE', 'MAPE'],
                                    [R2_mean, MAE_mean, RMSE_mean, MAPE_mean])), index=[n])
        df_mean = pd.concat([df_mean, df1])
        df2 = pd.DataFrame(dict(zip(['R2', 'MAE', 'RMSE', 'MAPE'],
                                    [R2_std, MAE_std, RMSE_std, MAPE_std])), index=[n])
        df_std = pd.concat([df_std, df2])
    return df_mean, df_std
# Repeated 3-fold cross-validation (5 rounds) for the three strongest families.
model = LGBMRegressor(n_estimators=1000, objective='regression', random_state=0)
lgb_crosseval, lgb_crosseval2 = cross_val(model=model, X=X, Y=y, K=3, repeated=5)

model = XGBRegressor(n_estimators=1000, objective='reg:squarederror', random_state=0)
xgb_crosseval, xgb_crosseval2 = cross_val(model=model, X=X, Y=y, K=3, repeated=5)

model = RandomForestRegressor(n_estimators=500,
                              max_features=int(X_train.shape[1] / 3),
                              random_state=0)
rf_crosseval, rf_crosseval2 = cross_val(model=model, X=X, Y=y, K=3, repeated=5)
输出结果:
(7)交叉验证整体效果
绘制交叉验证评价指标均值对比图:
# Per-round CV *means* for each metric, three models overlaid per panel.
plt.subplots(1, 4, figsize=(16, 3))
for i, col in enumerate(lgb_crosseval.columns):
    plt.subplot(1, 4, i + 1)
    plt.plot(lgb_crosseval[col], 'k', label='LGB')
    plt.plot(xgb_crosseval[col], 'b-.', label='XGB')
    plt.plot(rf_crosseval[col], 'r-^', label='RF')
    plt.title(f'不同模型的{col}对比')
    plt.xlabel('重复交叉验证次数')
    plt.ylabel(col, fontsize=16)
    plt.legend()
plt.tight_layout()
plt.savefig(r"E:\工作\硕士\博客\博客94-多种机器学习模型预测房价\交叉验证评价指标均值图.png",
            bbox_inches="tight", pad_inches=1, transparent=True,
            facecolor="w", edgecolor='w', dpi=300, orientation='landscape')
输出结果:
绘制交叉验证评价指标方差对比图:
# Per-round CV *standard deviations* for each metric, same layout as the means.
plt.subplots(1, 4, figsize=(16, 3))
for i, col in enumerate(lgb_crosseval2.columns):
    plt.subplot(1, 4, i + 1)
    plt.plot(lgb_crosseval2[col], 'k', label='LGB')
    plt.plot(xgb_crosseval2[col], 'b-.', label='XGB')
    plt.plot(rf_crosseval2[col], 'r-^', label='RF')
    plt.title(f'不同模型的{col}方差对比')
    plt.xlabel('重复交叉验证次数')
    plt.ylabel(col, fontsize=16)
    plt.legend()
plt.tight_layout()
plt.savefig(r"E:\工作\硕士\博客\博客94-多种机器学习模型预测房价\交叉验证评价指标方差图.png",
            bbox_inches="tight", pad_inches=1, transparent=True,
            facecolor="w", edgecolor='w', dpi=300, orientation='landscape')
输出结果:
随机森林的效果最好,我们选择它作为最终的模型,下面对随机森林搜索超参数。
四、搜索超参数
(1)模型设置
#利用K折交叉验证搜索最优超参数
# Randomized search over the random-forest hyperparameter space with 3-fold CV.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate value ranges for the three tuned hyperparameters.
param_distributions = {
    'n_estimators': [100, 300, 500, 800, 1000],  # number of trees
    'max_features': range(9, 15),                # features considered per split
    'max_depth': range(8, 13),                   # maximum tree depth
}

# Shuffled 3-fold splitter with a fixed seed for reproducible search folds.
kfold = KFold(n_splits=3, shuffle=True, random_state=1)

model = RandomForestRegressor(random_state=0)
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_distributions,
                                   n_iter=50, cv=kfold, verbose=2,
                                   random_state=0, n_jobs=-1)
random_search.fit(X_train_s, y_train)
print("Best parameters:", random_search.best_params_)

# Score the best estimator found by the search on the hold-out split.
model = random_search.best_estimator_
model.score(X_val_s, y_val)

# Refit with the chosen configuration and score it on the hold-out split again.
model = RandomForestRegressor(n_estimators=1000, max_features=13, max_depth=12, random_state=0)
model.fit(X_train_s, y_train)
model.score(X_val_s, y_val)
输出结果:
(2)绘制变量重要性排序图
# Refit the tuned forest on the full data set and rank feature importances.
model = RandomForestRegressor(n_estimators=1000, max_features=13, max_depth=12, random_state=0)
model.fit(X.to_numpy(), y.to_numpy())
model.score(X.to_numpy(), y.to_numpy())  # in-sample fit, displayed for reference only

# Indices of the features ordered from most to least important.
order = model.feature_importances_.argsort()[::-1]

plt.figure(figsize=(10, 8), dpi=128)
sns.barplot(x=model.feature_importances_[order], y=X.columns[order], orient='h')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.savefig(r"E:\工作\硕士\博客\博客94-多种机器学习模型预测房价\特征重要性排序图.png",
            bbox_inches="tight", pad_inches=1, transparent=True,
            facecolor="w", edgecolor='w', dpi=300, orientation='landscape')
输出结果:
五、模型保存
# Persist the fitted model so later runs can load it instead of retraining.
import joblib

out_path = 'regressor_model.pkl'
joblib.dump(model, out_path)
print(f"Model saved to {out_path}")
将模型就保存到本地,下次就不用重新训练,可以直接使用。
更多优质内容持续发布中,请移步主页查看。
若有问题可邮箱联系:1736732074@qq.com
博主的WeChat:TCB1736732074
点赞+关注,下次不迷路!