当前位置：首页 > article >正文

线性回归讲解

article 2025/2/19 7:06:22

要将Matplotlib绘制的动态图保存为.mp4格式的文件，你需要安装FFmpeg软件，并使用Matplotlib提供的FFMpegWriter对象来保存动态图。以下是在你的代码中添加保存为.mp4格式的示例代码：

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, FFMpegWriter
from tqdm import trange

# Feature scaling: both variables are divided by fixed constants here;
# z-score or min-max scaling would be alternatives.
# (Related ideas: batch normalization / layer normalization.)
years = (np.arange(2000, 2022) - 2000) / 22  # years 2000..2021 mapped into [0, 1)
prices = np.array([
    10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000, 19000,
    22000, 24000, 23000, 26000, 35000, 30000, 40000, 45000, 52000, 50000, 60000,
]) / 60000  # divide by the largest price so targets lie in (0, 1]

# Number of passes over the data; also the number of animation frames.
epoch = 10

# Model y = k * x + b and the gradient-descent step size.
k, b = 1, 1
lr = 0.01

# Histories appended to by update(): frame values and per-pass losses.
x_data, y_data, losses = [], [], []

# Empty loss curve; update() fills it in one point per frame.
fig, ax = plt.subplots()
(line,) = ax.plot([], [])
ax.set(xlim=(0, epoch), ylim=(0, 1), xlabel='Iterations', ylabel='Loss')

def update(frame):
    """Run one pass of per-sample gradient descent and redraw the loss curve.

    Reads the module-level ``years``, ``prices`` and ``lr``; updates the
    module-level parameters ``k`` and ``b`` after every sample; appends the
    summed squared error of the pass to ``losses`` and ``y_data`` and the
    frame value to ``x_data``.

    Args:
        frame: Value supplied for the current animation frame; used as the
            x coordinate of the new point on the loss curve.

    Returns:
        A 1-tuple containing the updated ``line`` artist.
    """
    global k, b
    total = 0
    for sample, target in zip(years, prices):
        # Residual of the current prediction; reused for loss and gradients.
        residual = k * sample + b - target
        total += residual ** 2

        # Gradients of the squared error with respect to k and b.
        grad_k = 2 * residual * sample
        grad_b = 2 * residual

        # Both gradients were taken at the old (k, b); update them together.
        k, b = k - grad_k * lr, b - grad_b * lr

    # Record the loss of this pass and extend the plotted curve.
    losses.append(total)
    x_data.append(frame)
    y_data.append(total)

    line.set_data(x_data, y_data)

    return (line,)

# Each frame calls update(), so rendering the animation is what trains the
# model. trange(epoch) supplies the frame values and shows a progress bar.
ani = FuncAnimation(fig, update, frames=trange(epoch), blit=True)

# Writing the .mp4 drives the animation; FFmpeg must be installed.
writer = FFMpegWriter(fps=30)
ani.save('animation.mp4', writer=writer)

# Interactive prediction with the trained k and b. A blank line, EOF or
# Ctrl-C ends the loop; non-numeric input is reported and asked for again
# instead of crashing the script.
while True:
    try:
        raw = input("请输入年份: ").strip()
    except (EOFError, KeyboardInterrupt):
        break
    if not raw:
        break
    try:
        # Apply the same scaling that was used for the training years.
        year = (float(raw) - 2000) / 22
    except ValueError:
        print("输入无效，请输入数字年份")
        continue
    # Undo the price scaling (prices were divided by 60000).
    print("预测房价: ", (k * year + b) * 60000)

在上述代码中，我们创建了一个FFMpegWriter对象，并将其作为参数传递给ani.save()函数，用于将动态图保存为.mp4格式的文件。需要注意的是，保存为.mp4格式的文件需要FFmpeg软件的支持，如果没有安装FFmpeg，需要先通过系统包管理器或者官网下载安装。如果不需要保存文件，也可以改用plt.show()函数显示图形。

import numpy as np
import matplotlib.pyplot as plt


class MyDataset:
    """Inputs, targets and batching options bundled for mini-batch iteration.

    Iterating over an instance hands out a fresh ``DataLoader`` each time.
    """

    def __init__(self, xs, ys, batch_size, shuffle):
        """Store the arguments as attributes; nothing is copied or validated.

        Args:
            xs: Input samples; ``len(xs)`` is the dataset size.
            ys: Targets, stored alongside ``xs``.
            batch_size: Stored for the ``DataLoader`` to read.
            shuffle: Stored for the ``DataLoader`` to read.
        """
        self.xs, self.ys = xs, ys
        self.batch_size, self.shuffle = batch_size, shuffle

    def __len__(self):
        """Return the number of input samples."""
        return len(self.xs)

    def __iter__(self):
        """Return a new ``DataLoader`` bound to this dataset."""
        return DataLoader(self)


class DataLoader:
    """Single-pass iterator that yields ``(x, y)`` mini-batches of a dataset.

    The dataset must provide ``xs``, ``ys``, ``batch_size``, ``shuffle`` and
    ``__len__``; ``xs`` and ``ys`` are indexed with a NumPy index array.
    """

    def __init__(self, dataset):
        """Prepare the (optionally shuffled) visiting order.

        Args:
            dataset: Object exposing the attributes listed on the class.

        Raises:
            ValueError: If ``dataset.batch_size`` is smaller than 1, which
                would otherwise make iteration never terminate.
        """
        if dataset.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.dataset = dataset
        self.cursor = 0  # offset into self.indexs of the next batch

        self.indexs = np.arange(len(self.dataset))

        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        """Return self, as the iterator protocol requires."""
        return self

    def __next__(self):
        """Return the next ``(x, y)`` batch; the final batch may be shorter.

        Raises:
            StopIteration: Once every index has been handed out.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        index = self.indexs[self.cursor:self.cursor + self.dataset.batch_size]

        x = self.dataset.xs[index]
        y = self.dataset.ys[index]

        self.cursor += self.dataset.batch_size

        return x, y




if __name__ == "__main__":
    # Normalise inputs and targets by fixed constants so that one learning
    # rate suits both parameters. Alternatives: divide by the maximum,
    # z-score normalisation, min-max scaling.
    years = np.array([i for i in range(2000, 2022)])
    years = (years - 2000) / 22

    prices = np.array([10000,11000,12000,13000,14000,12000,13000,16000,18000,20000,19000,22000,24000,23000,26000,35000,30000,40000,45000,52000,50000,60000])
    prices = prices / 60000

    # Model: price = k * year + b
    k = 1
    b = 0
    lr = 0.07
    epoch = 5000

    batch_size = 2
    shuffle = True

    dataset = MyDataset(years, prices, batch_size, shuffle)

    for e in range(epoch):
        # Iterating the dataset creates a new, reshuffled loader every epoch.
        for year, price in dataset:
            predict = k * year + b
            error = predict - price
            # Per-sample squared error; not needed for the update itself,
            # kept so the value can be inspected while debugging.
            loss = error ** 2

            # Average the gradient over the samples actually in the batch
            # (np.mean) rather than dividing by the configured batch_size,
            # so a shorter trailing batch is scaled correctly. The constant
            # factor 2 of the derivative is folded into lr.
            delta_k = np.mean(error * year)
            delta_b = np.mean(error)

            k -= delta_k * lr
            b -= delta_b * lr

    # Interactive prediction. A blank line, EOF or Ctrl-C ends the loop;
    # input that is not an integer is reported and asked for again.
    while True:
        try:
            raw = input("请输入预测的年份: ").strip()
        except (EOFError, KeyboardInterrupt):
            break
        if not raw:
            break
        try:
            # Same scaling as the training years.
            test_year = (int(raw) - 2000) / 22
        except ValueError:
            print("输入无效，请输入整数年份")
            continue
        predict_price = test_year * k + b
        # Undo the price normalisation.
        print(predict_price * 60000)

这是一个简单的线性回归模型，用于预测房价。模型使用梯度下降法进行训练，可以通过输入年份来预测房价。

在这个模型中，我们首先定义了一个MyDataset类，用于封装输入数据和目标数据，并实现了__iter__方法，以便在训练过程中使用DataLoader类进行数据批量加载和随机打乱。

在训练过程中，我们使用了一个双重循环，外层循环控制训练的迭代次数，内层循环用于按批次加载数据，并计算损失和更新模型参数。

最后，我们可以通过输入年份来进行房价的预测。需要注意的是，模型的预测结果可能会受到多种因素的影响，如数据样本的数量和质量、模型的复杂度和训练方法等，因此在应用模型时需要谨慎考虑。

	# Forward pass for one mini-batch, then the per-sample squared error.
	predict = k * year + b
	loss = (predict - price) ** 2
    # Per-sample gradient terms; the constant factor 2 of the squared-error
    # derivative is left out here.
    delta_k =  (k * year + b - price) * year
    delta_b =  (k * year + b - price)

    # Sum the terms, divide by batch_size, and step against the gradient.
    k -= np.sum(delta_k)/batch_size * lr
    b -= np.sum(delta_b)/batch_size * lr

重点解释一下这一段

这一段代码实现了线性回归模型的训练过程。具体来说，它包括以下几个步骤：

预测值计算：根据当前模型参数 k 和 b，以及输入数据 year，计算出预测值 predict，即 predict = k * year + b。

损失计算：根据预测值 predict 和目标值 price，计算出损失 loss，即 loss = (predict - price) ** 2。

梯度计算：根据当前模型参数 k 和 b，以及输入数据 year 和目标值 price，计算出损失函数对 k 和 b 的偏导数 delta_k 和 delta_b，即 delta_k = (k * year + b - price) * year，delta_b = (k * year + b - price)。严格来说，平方损失的偏导数还带有常数因子 2，代码中省略了它，相当于把这个因子并入了学习率 lr。

参数更新：根据计算出的梯度 delta_k 和 delta_b，以及学习率 lr 和批量大小 batch_size，更新模型参数 k 和 b，即 k -= np.sum(delta_k)/batch_size * lr，b -= np.sum(delta_b)/batch_size * lr。

这个过程可以看做是利用梯度下降法对模型进行训练，通过不断迭代更新模型参数，逐步降低损失，使模型能够更好地拟合数据。需要注意的是，梯度下降法的效果受到学习率和批量大小的影响，过大或过小的学习率和批量大小都可能导致模型训练效果不佳，因此需要在实际应用中进行调参。

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FFMpegWriter, FuncAnimation


class MyDataset:
    """Container for inputs, targets and the batching configuration.

    Each ``iter()`` call on an instance creates a new ``DataLoader``.
    """

    def __init__(self, xs, ys, batch_size, shuffle):
        """Keep references to the data and the batching options.

        Args:
            xs: Input samples; their count is the dataset length.
            ys: Targets stored next to ``xs``.
            batch_size: Kept for the ``DataLoader``.
            shuffle: Kept for the ``DataLoader``.
        """
        self.shuffle, self.batch_size = shuffle, batch_size
        self.xs, self.ys = xs, ys

    def __len__(self):
        """Return how many input samples are stored."""
        return len(self.xs)

    def __iter__(self):
        """Create a ``DataLoader`` over this dataset."""
        return DataLoader(self)


class DataLoader:
    """One-shot iterator producing ``(x, y)`` mini-batches from a dataset.

    Expects the dataset to offer ``xs``, ``ys``, ``batch_size``, ``shuffle``
    and ``__len__``; ``xs``/``ys`` are indexed with a NumPy index array.
    """

    def __init__(self, dataset):
        """Build the visiting order, shuffled when the dataset asks for it.

        Args:
            dataset: Object exposing the attributes listed on the class.

        Raises:
            ValueError: If ``dataset.batch_size`` is below 1; the cursor
                would never advance and iteration would not end.
        """
        if dataset.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.dataset = dataset
        self.cursor = 0  # start offset of the next batch in self.indexs

        self.indexs = np.arange(len(self.dataset))

        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        """Return self so the loader can be used directly in a for loop."""
        return self

    def __next__(self):
        """Return the next ``(x, y)`` batch; the last one may be shorter.

        Raises:
            StopIteration: When the cursor has passed the last sample.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        index = self.indexs[self.cursor : self.cursor + self.dataset.batch_size]

        x = self.dataset.xs[index]
        y = self.dataset.ys[index]

        self.cursor += self.dataset.batch_size

        return x, y


if __name__ == "__main__":
    # Normalise inputs and targets by fixed constants.
    years = np.array([i for i in range(2000, 2022)])
    years = (years - 2000) / 22

    prices = np.array(
        [10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000, 19000, 22000, 24000, 23000, 26000, 35000, 30000, 40000, 45000, 52000, 50000, 60000]
    )
    prices = prices / 60000

    # Model: price = k * year + b
    k = 1
    b = 0
    lr = 0.07
    epoch = 50  # number of animation frames; each frame is one mini-batch step

    batch_size = 2
    shuffle = True

    dataset = MyDataset(years, prices, batch_size, shuffle)

    # Left panel: data and fitted line (x = normalised year).
    # Right panel: loss per step (x = step index). The two need separate
    # x axes; a twinx() axis would share the 0..1 year axis and push every
    # step index beyond 1 out of view.
    fig, (ax, loss_ax) = plt.subplots(1, 2, figsize=(10, 4))
    line, = ax.plot([], [], lw=2)
    scat = ax.scatter(years, prices)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    loss_line, = loss_ax.plot([], [], lw=2, color='r')
    loss_ax.set_xlim(0, epoch)
    loss_ax.set_xlabel('Iterations')
    loss_ax.set_ylabel('Loss')

    # Loss of every step so far, so the curve shows the history rather than
    # a flat line at the latest value.
    loss_history = []

    def init():
        """Clear the animated lines; returns every artist that is blitted."""
        line.set_data([], [])
        loss_line.set_data([], [])
        return (line, scat, loss_line)

    def update(frame):
        """Take one gradient step on a random mini-batch and redraw.

        Args:
            frame: Frame index supplied by FuncAnimation; only printed.

        Returns:
            The artists that changed, as required when blit=True.
        """
        global k, b
        # iter(dataset) builds a freshly shuffled loader, so its first batch
        # is a random mini-batch on every frame.
        x, y = next(iter(dataset))

        y_pred = x * k + b
        residual = y_pred - y
        loss = np.mean(residual ** 2)
        # Gradient of the mean squared error, with the constant factor 2
        # folded into lr.
        grad_k = np.mean(residual * x)
        grad_b = np.mean(residual)

        k -= lr * grad_k
        b -= lr * grad_b

        loss_history.append(loss)

        print(f"frame: {frame}, k: {k:.3f}, b: {b:.3f}, loss: {loss:.3f}")

        line.set_data(years, k * years + b)
        loss_line.set_data(np.arange(len(loss_history)), loss_history)

        # Only the loss panel needs rescaling (its y range is not known in
        # advance); the data panel has fixed limits.
        loss_ax.relim()
        loss_ax.autoscale_view(scalex=False)

        return (line, scat, loss_line)

    # frames has a known length, so no save_count is needed.
    ani = FuncAnimation(fig, update, frames=range(epoch), init_func=init, blit=True)
    # Writing .mp4 requires FFmpeg to be installed.
    writer = FFMpegWriter(fps=30)
    ani.save('animation.mp4', writer=writer)

import numpy as np


class MyDataset:
    """Three parallel arrays plus batching options for mini-batch iteration.

    Iterating over an instance returns a new ``DataLoader`` each time.
    """

    def __init__(self, xs, ys, zs, batch_size, shuffle):
        """Store the arguments as attributes; nothing is copied or validated.

        Args:
            xs: First array; ``len(xs)`` is the dataset size.
            ys: Second array, stored alongside ``xs``.
            zs: Third array, stored alongside ``xs``.
            batch_size: Stored for the ``DataLoader`` to read.
            shuffle: Stored for the ``DataLoader`` to read.
        """
        self.xs, self.ys, self.zs = xs, ys, zs
        self.batch_size, self.shuffle = batch_size, shuffle

    def __len__(self):
        """Return the number of entries in ``xs``."""
        return len(self.xs)

    def __iter__(self):
        """Return a new ``DataLoader`` bound to this dataset."""
        return DataLoader(self)


class DataLoader:
    """Single-pass iterator that yields ``(x, y, z)`` mini-batches.

    The dataset must provide ``xs``, ``ys``, ``zs``, ``batch_size``,
    ``shuffle`` and ``__len__``; the three arrays are indexed with a NumPy
    index array.
    """

    def __init__(self, dataset):
        """Prepare the (optionally shuffled) visiting order.

        Args:
            dataset: Object exposing the attributes listed on the class.

        Raises:
            ValueError: If ``dataset.batch_size`` is smaller than 1, which
                would otherwise make iteration never terminate.
        """
        if dataset.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.dataset = dataset
        self.cursor = 0  # offset into self.indexs of the next batch

        self.indexs = np.arange(len(self.dataset))

        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        """Return self, as the iterator protocol requires."""
        return self

    def __next__(self):
        """Return the next ``(x, y, z)`` batch; the final one may be shorter.

        Raises:
            StopIteration: Once every index has been handed out.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        index = self.indexs[self.cursor:self.cursor + self.dataset.batch_size]

        x = self.dataset.xs[index]
        y = self.dataset.ys[index]
        z = self.dataset.zs[index]
        self.cursor += self.dataset.batch_size

        return x, y, z

# Inputs: year and floor, each divided by a fixed constant.
years = np.arange(2000, 2022)
years = (years - 2000) / 22

floors = np.arange(23, 1, -1)
floors = floors / 23

# Target: price, divided by the largest value.
prices = np.array(
    [10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000, 19000, 22000, 24000, 23000, 26000, 35000,
     30000, 40000, 45000, 52000, 50000, 60000])
prices = prices / 60000


lr = 0.05
epoch = 10000

# Model: price = k1 * year + k2 * floor + b
k1 = 1
k2 = -1
b = 0
batch_size = 8

dataset = MyDataset(years, floors, prices, batch_size, True)


for e in range(epoch):
    # Accumulate the squared error over every mini-batch so that `loss`
    # describes the whole epoch, not just the last (possibly shorter) batch.
    loss = 0.0
    for year, floor, price in dataset:
        predict = k1 * year + k2 * floor + 1 * b
        error = predict - price

        loss += np.sum(error ** 2)

        # Summed gradient terms; the constant factor 2 is folded into lr.
        delta_k1 = np.sum(error * year)
        delta_k2 = np.sum(error * floor)
        delta_b = np.sum(error)

        k1 -= lr * delta_k1
        k2 -= lr * delta_k2
        b -= lr * delta_b

    if e % 100 == 0:
        print(loss)

这是一个多变量线性回归的示例代码，使用了三个变量years、floors和prices，其中years和floors是输入特征，prices是要预测的输出（目标值）。

在每个epoch中，迭代整个数据集，计算预测值和损失，并更新模型参数k1、k2和b。在这个示例中，使用的优化算法是梯度下降法，学习率为lr。

其中，MyDataset类和DataLoader类用于实现数据集的迭代器，可以方便地使用for循环迭代数据集中的每个batch。MyDataset类的初始化函数接收第一个输入特征xs、第二个输入特征ys、输出目标zs、batch大小batch_size和是否随机打乱数据集shuffle等参数。DataLoader类的初始化函数接收一个MyDataset对象，并根据shuffle参数打乱数据集的索引，以便按照随机顺序迭代数据集。__next__()方法实现了每次迭代返回一个batch的功能，其中使用了self.cursor变量来记录当前迭代到的位置。

整个代码实现了一个简单的多变量线性回归模型，可以用于预测房价等问题。
首先，我们可以在每个epoch结束时记录下模型的损失值，然后使用Matplotlib将损失值随时间的变化进行可视化。可以在代码中添加以下代码来记录损失值：

# One entry per epoch is appended inside the training loop.
losses = []
for e in range(epoch):
    # ... (placeholder: the mini-batch training steps of this epoch)
    losses.append(loss)  # record the value held in `loss` at this point of the epoch
    # ... (placeholder: any remaining per-epoch code, e.g. progress printing)

然后，可以使用Matplotlib来绘制损失随时间的变化曲线。可以在代码的最后添加以下代码：

import matplotlib.pyplot as plt

# Draw the recorded losses against the epoch index on the current axes.
ax = plt.gca()
ax.plot(np.arange(epoch), losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
plt.show()

这将绘制一个损失随时间变化的曲线，横轴是epoch数，纵轴是损失值。可以用这个曲线来监视模型的训练过程，以及选择合适的epoch数来停止训练，以避免过拟合或欠拟合。

完整的可视化代码如下：

import numpy as np
import matplotlib.pyplot as plt


class MyDataset:
    """Holds three parallel arrays together with the batching configuration.

    Each ``iter()`` call on an instance creates a new ``DataLoader``.
    """

    def __init__(self, xs, ys, zs, batch_size, shuffle):
        """Keep references to the arrays and the batching options.

        Args:
            xs: First array; its length is the dataset length.
            ys: Second array, kept next to ``xs``.
            zs: Third array, kept next to ``xs``.
            batch_size: Kept for the ``DataLoader``.
            shuffle: Kept for the ``DataLoader``.
        """
        self.shuffle, self.batch_size = shuffle, batch_size
        self.xs, self.ys, self.zs = xs, ys, zs

    def __len__(self):
        """Return how many entries ``xs`` holds."""
        return len(self.xs)

    def __iter__(self):
        """Create a ``DataLoader`` over this dataset."""
        return DataLoader(self)


class DataLoader:
    """One-shot iterator producing ``(x, y, z)`` mini-batches from a dataset.

    Expects the dataset to offer ``xs``, ``ys``, ``zs``, ``batch_size``,
    ``shuffle`` and ``__len__``; the arrays are indexed with a NumPy index
    array.
    """

    def __init__(self, dataset):
        """Build the visiting order, shuffled when the dataset asks for it.

        Args:
            dataset: Object exposing the attributes listed on the class.

        Raises:
            ValueError: If ``dataset.batch_size`` is below 1; the cursor
                would never advance and iteration would not end.
        """
        if dataset.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.dataset = dataset
        self.cursor = 0  # start offset of the next batch in self.indexs

        self.indexs = np.arange(len(self.dataset))

        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        """Return self so the loader can be used directly in a for loop."""
        return self

    def __next__(self):
        """Return the next ``(x, y, z)`` batch; the last one may be shorter.

        Raises:
            StopIteration: When the cursor has passed the last sample.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        index = self.indexs[self.cursor:self.cursor + self.dataset.batch_size]

        x = self.dataset.xs[index]
        y = self.dataset.ys[index]
        z = self.dataset.zs[index]
        self.cursor += self.dataset.batch_size

        return x, y, z


# Inputs: year and floor, each divided by a fixed constant.
years = np.arange(2000, 2022)
years = (years - 2000) / 22

floors = np.arange(23, 1, -1)
floors = floors / 23

# Target: price, divided by the largest value.
prices = np.array(
    [10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000, 19000, 22000, 24000, 23000, 26000, 35000,
     30000, 40000, 45000, 52000, 50000, 60000])
prices = prices / 60000

lr = 0.05
epoch = 10000

# Model: price = k1 * year + k2 * floor + b
k1 = 1
k2 = -1
b = 0
batch_size = 8

dataset = MyDataset(years, floors, prices, batch_size, True)

losses = []  # one summed squared error per epoch
for e in range(epoch):
    # Accumulate over every mini-batch so the recorded value covers the whole
    # epoch instead of only the last (possibly shorter) shuffled batch.
    loss = 0.0
    for year, floor, price in dataset:
        predict = k1 * year + k2 * floor + 1 * b
        error = predict - price

        loss += np.sum(error ** 2)

        # Summed gradient terms; the constant factor 2 is folded into lr.
        delta_k1 = np.sum(error * year)
        delta_k2 = np.sum(error * floor)
        delta_b = np.sum(error)

        k1 -= lr * delta_k1
        k2 -= lr * delta_k2
        b -= lr * delta_b

    losses.append(loss)
    if e % 100 == 0:
        print(loss)

# Loss curve: x is the epoch index, y is the epoch's summed squared error.
plt.plot(np.arange(epoch), losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()