当前位置：首页 > article >正文

机器学习课堂4线性回归模型+特征缩放

article 2025/3/28 2:45:39

一、实验2-2：训练线性回归模型，并计算模型在训练数据集和测试数据集上的均方根误差

代码：

#  2-2线性回归模型
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Experiment settings
iterations = 3000          # number of gradient-descent iterations
learning_rate = 0.0001     # gradient-descent step size
m_train = 3000             # number of leading rows used as the training set
flag_plot_lines = False    # True: fit on a single feature and draw the fitted lines
plot_feature = 1           # column of `data` used as the single feature when plotting lines
plot_skip = 4              # number of fitted lines skipped between two drawn ones

# Load the temperature data set into a 2-D array (rows are samples)
df = pd.read_csv('temperature_dataset.csv')
data = np.array(df)
m_all, n_columns = data.shape  # total sample count, total column count
m_test = m_all - m_train       # the remaining rows form the test set

# Split the rows: the first m_train rows train the model, the rest test it
train_rows = data[:m_train]
test_rows = data[m_train:]

# Inputs are stored with one sample per column; column 0 of `data` is the label
if flag_plot_lines:
    # Single-feature model so that the fit can be drawn as a straight line
    d = 1
    X_train = train_rows[:, plot_feature].reshape((1, -1))
    X_test = test_rows[:, plot_feature].reshape((1, -1))
else:
    # Use every column after the label as an input feature
    d = n_columns - 1
    X_train = train_rows[:, 1:].T
    X_test = test_rows[:, 1:].T
Y_train = train_rows[:, 0].reshape((1, -1))
Y_test = test_rows[:, 0].reshape((1, -1))
# Initialisation
w = np.zeros((d, 1))  # weights: one row per input feature
# The bias is kept as a 1x1 array from the start (the update below produces a
# 1x1 array anyway), so `b.item(0)` also works when iterations == 0.
b = np.zeros((1, 1))
v = np.ones((1, m_train))  # row vector of ones, one entry per training sample
costs_saved = []  # cost (mean squared error) measured at every iteration
# History of the first weight and the bias, used later to draw fitted lines.
# Index 0 holds the initial values (zeros); index i + 1 holds iteration i's result.
w_saved = np.zeros(iterations + 1)
b_saved = np.zeros(iterations + 1)
# Gradient-descent loop
for i in range(iterations):
    Y_hat = np.dot(w.T, X_train) + b * v  # predictions for all training samples
    e = Y_hat - Y_train  # prediction errors
    # Both updates use the error computed before either parameter changes
    b = b - 2. * learning_rate * np.dot(v, e.T) / m_train  # update the bias
    w = w - 2. * learning_rate * np.dot(X_train, e.T) / m_train  # update the weights
    # Record the cost of the parameters used for this iteration's predictions
    costs = np.dot(e, e.T) / m_train
    costs_saved.append(costs.item(0))
    # Store plain scalars: assigning a size-1 array to a single element relies
    # on an implicit array-to-scalar conversion that NumPy has deprecated.
    w_saved[i + 1] = w[0, 0]
    b_saved[i + 1] = b[0, 0]
# Print the final weights and bias
print('Weights=', np.array2string(np.squeeze(w, axis=1), precision=3))
print(f'Bias={b.item(0):.3f}')
def _mean_squared_error(errors, count):
    """Return the sum of squared entries of the row vector *errors* divided by *count*."""
    return np.dot(errors, errors.T) / count


# Plot the recorded cost against the 1-based iteration number
iteration_axis = range(1, len(costs_saved) + 1)
plt.plot(iteration_axis, costs_saved, 'r-o', linewidth=2, markersize=5)
plt.ylabel('Costs')
plt.xlabel('Iterations')
plt.title('Learning rate=' + str(learning_rate))
plt.show()

# Root-mean-square error on the training set
y_hat = np.dot(w.T, X_train) + b * v  # predicted labels of the training samples
e = y_hat - Y_train                   # prediction errors
mse = _mean_squared_error(e, m_train)
rmse = np.sqrt(mse)
print(f'Trainset RMSE={rmse.item(0):.3f}')

# Root-mean-square error on the test set
y_hat_test = np.dot(w.T, X_test) + b  # the bias is broadcast over all test samples
e_test = y_hat_test - Y_test          # prediction errors
mse_test = _mean_squared_error(e_test, m_test)
rmse_test = np.sqrt(mse_test)
print(f'Testset RMSE={rmse_test.item(0):.3f}')
# Draw the fitted lines (only done for the single-feature model)
if flag_plot_lines:
    plot_x_min = np.min(X_train)  # smallest training value of the feature
    plot_x_max = np.max(X_train)  # largest training value of the feature
    plot_x = np.array([plot_x_min, plot_x_max])  # two end points define each line
    plt.figure()  # start a new figure
    plt.plot(X_train[0, 0::10], Y_train[0, 0::10], 'xm')  # every 10th training sample
    for i in range(0, iterations + 1, plot_skip + 1):  # draw one line out of every plot_skip + 1
        plot_y = w_saved[i] * plot_x + b_saved[i]  # predictions at the two end points
        plt.plot(plot_x, plot_y, '--')  # intermediate fits are dashed
    # Take the final parameters from the last history entry instead of reusing
    # the loop variable, which only reaches the last iteration when `iterations`
    # is a multiple of plot_skip + 1.
    plot_y = w_saved[-1] * plot_x + b_saved[-1]
    plt.plot(plot_x, plot_y, 'b', linewidth=3)  # final fit as a thick solid line
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()

运行结果

二、实验2-5：对输入特征做特征缩放（标准化、最小最大归一化、均值归一化），再训练并评估线性回归模型

代码

#  2-5特征缩放
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Experiment settings
iterations = 3000     # number of gradient-descent iterations
learning_rate = 0.1   # gradient-descent step size
m_train = 3000        # number of leading rows used as the training set
# Feature-scaling method: 'std' = standardisation, 'norm_minmax' = min-max
# normalisation, 'norm_mean' = mean normalisation, 'none' = no scaling
flag_fs = 'std'

# Load the temperature data set into a 2-D array (rows are samples)
df = pd.read_csv('temperature_dataset.csv')
data = np.array(df)
m_all, n_columns = data.shape  # total sample count, total column count
d = n_columns - 1              # number of input features (every column except the first)
m_test = m_all - m_train       # the remaining rows form the test set
# Feature scaling.
# The statistics are computed on the training rows only and then applied to
# every row, so the test rows are scaled with the training-set parameters.
# The scaled values are written back into `data` in place; cast to float first
# so they are not truncated if the CSV produced an integer array.
data = data.astype(float)
if flag_fs == 'std':  # standardisation
    mean = np.mean(data[0:m_train, 1:], axis=0)  # per-feature training mean
    std = np.std(data[0:m_train, 1:], axis=0, ddof=1)  # per-feature sample standard deviation
    std[std == 0] = 1.0  # a constant feature would otherwise cause a division by zero
    data[:, 1:] = (data[:, 1:] - mean) / std
elif flag_fs == 'norm_minmax':  # min-max normalisation
    xmin = np.min(data[0:m_train, 1:], axis=0)  # per-feature training minimum
    xmax = np.max(data[0:m_train, 1:], axis=0)  # per-feature training maximum
    span = xmax - xmin
    span[span == 0] = 1.0  # a constant feature would otherwise cause a division by zero
    data[:, 1:] = (data[:, 1:] - xmin) / span
elif flag_fs == 'norm_mean':  # mean normalisation
    xmin = np.min(data[0:m_train, 1:], axis=0)  # per-feature training minimum
    xmax = np.max(data[0:m_train, 1:], axis=0)  # per-feature training maximum
    mean = np.mean(data[0:m_train, 1:], axis=0)  # per-feature training mean
    span = xmax - xmin
    span[span == 0] = 1.0  # a constant feature would otherwise cause a division by zero
    data[:, 1:] = (data[:, 1:] - mean) / span
# Any other value of flag_fs (e.g. 'none') leaves the features unscaled.


# Split the data set (inputs are stored with one sample per column)
X_train = data[0:m_train, 1:].T  # training inputs
X_test = data[m_train:, 1:].T  # test inputs
Y_train = data[0:m_train, 0].reshape((1, -1))  # training labels (column 0 of `data`)
Y_test = data[m_train:, 0].reshape((1, -1))  # test labels

# Initialisation
w = np.zeros((d, 1))  # weights: one row per input feature
# The bias is kept as a 1x1 array from the start (the update below produces a
# 1x1 array anyway), so `b.item(0)` also works when iterations == 0.
b = np.zeros((1, 1))
v = np.ones((1, m_train))  # row vector of ones, one entry per training sample
costs_saved = []  # cost (mean squared error) measured at every iteration
# History of the first weight and the bias, used later to draw fitted lines.
# Index 0 holds the initial values (zeros); index i + 1 holds iteration i's result.
w_saved = np.zeros(iterations + 1)
b_saved = np.zeros(iterations + 1)

# Gradient-descent loop
for i in range(iterations):
    Y_hat = np.dot(w.T, X_train) + b * v  # predictions for all training samples
    e = Y_hat - Y_train  # prediction errors
    # Both updates use the error computed before either parameter changes
    b = b - 2. * learning_rate * np.dot(v, e.T) / m_train  # update the bias
    w = w - 2. * learning_rate * np.dot(X_train, e.T) / m_train  # update the weights
    # Record the cost of the parameters used for this iteration's predictions
    costs = np.dot(e, e.T) / m_train
    costs_saved.append(costs.item(0))
    # Store plain scalars: assigning a size-1 array to a single element relies
    # on an implicit array-to-scalar conversion that NumPy has deprecated.
    w_saved[i + 1] = w[0, 0]
    b_saved[i + 1] = b[0, 0]

# Print the final weights and bias
print('Weights=', np.array2string(np.squeeze(w, axis=1), precision=3))
print(f'Bias={b.item(0):.3f}')

# Root-mean-square error on the training set
y_hat = np.dot(w.T, X_train) + b * v      # predicted labels of the training samples
e = y_hat - Y_train                       # prediction errors
squared_error_sum = np.dot(e, e.T)        # sum of squared errors
mse = squared_error_sum / m_train         # mean squared error
rmse = np.sqrt(mse)                       # root-mean-square error
print(f'Trainset RMSE={rmse.item(0):.3f}')

# Root-mean-square error on the test set
y_hat_test = np.dot(w.T, X_test) + b      # the bias is broadcast over all test samples
e_test = y_hat_test - Y_test              # prediction errors
squared_error_sum_test = np.dot(e_test, e_test.T)  # sum of squared errors
mse_test = squared_error_sum_test / m_test         # mean squared error
rmse_test = np.sqrt(mse_test)                      # root-mean-square error
print(f'Testset RMSE={rmse_test.item(0):.3f}')

# Draw the fitted lines for one input feature.
# NOTE: w_saved records only the first weight (w[0]), so plot_feature has to
# stay 0 for the drawn lines to match the plotted feature.
plot_feature = 0  # row of X_train / X_test to plot
plot_skip = 4  # number of fitted lines skipped between two drawn ones

# Map from scaled feature values back to the original units:
#     x_original = x_scaled * plot_scale + plot_offset
if flag_fs == 'std':
    plot_scale = std[plot_feature]
    plot_offset = mean[plot_feature]
elif flag_fs == 'norm_minmax':
    plot_scale = xmax[plot_feature] - xmin[plot_feature]
    plot_offset = xmin[plot_feature]
elif flag_fs == 'norm_mean':
    plot_scale = xmax[plot_feature] - xmin[plot_feature]
    plot_offset = mean[plot_feature]
else:
    plot_scale = 1.0
    plot_offset = 0.0
X_train_plot = X_train[plot_feature, :] * plot_scale + plot_offset
X_test_plot = X_test[plot_feature, :] * plot_scale + plot_offset

# End points of the lines. The weights were learned on the SCALED feature, so
# predictions must be computed from the scaled end points; only the x axis is
# shown in original units.
plot_x_scaled = np.array([np.min(X_train[plot_feature, :]), np.max(X_train[plot_feature, :])])
plot_x = plot_x_scaled * plot_scale + plot_offset
plot_x_min, plot_x_max = plot_x
plt.figure()  # start a new figure
plt.plot(X_train_plot[0::10], Y_train[0, 0::10], 'xm')  # every 10th training sample
# Each line is the model output as this feature varies while every other
# (scaled) feature is held at 0.
for i in range(0, iterations + 1, plot_skip + 1):  # draw one line out of every plot_skip + 1
    plot_y = w_saved[i] * plot_x_scaled + b_saved[i]  # predictions at the two end points
    plt.plot(plot_x, plot_y, '--')  # intermediate fits are dashed
# Take the final parameters from the last history entry instead of reusing the
# loop variable, which only reaches the last iteration when `iterations` is a
# multiple of plot_skip + 1.
plot_y = w_saved[-1] * plot_x_scaled + b_saved[-1]
plt.plot(plot_x, plot_y, 'b', linewidth=3)  # final fit as a thick solid line
plt.xlabel('x')
plt.ylabel('y')
plt.title('Fitting Line with Feature Scaling=' + flag_fs)
plt.show()

运行结果