线性回归讲解
要将Matplotlib绘制的动态图保存为.mp4格式的文件,你需要安装FFmpeg软件,并使用Matplotlib提供的FFMpegWriter对象来保存动态图。以下是在你的代码中添加保存为.mp4格式的示例代码:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, FFMpegWriter
from tqdm import trange
# Common feature-scaling choices: z-score, min-max.
# Calendar years 2000-2021, shifted to start at 0 and divided by 22 so the
# inputs fall in [0, 1) (compare batch / layer normalisation in deep nets).
years = (np.arange(2000, 2022) - 2000) / 22
# Prices are divided by 60000, the largest value in the list, so the targets
# fall in (0, 1].
prices = np.array([
    10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000,
    19000, 22000, 24000, 23000, 26000, 35000, 30000, 40000, 45000, 52000,
    50000, 60000,
]) / 60000

# Hyper-parameters and the initial line y = k * x + b.
epoch = 10
k, b = 1, 1
lr = 0.01

# Points of the loss curve (x_data / y_data) and the raw loss record.
x_data, y_data, losses = [], [], []

# Empty line artist that the animation callback fills in frame by frame.
fig, ax = plt.subplots()
(line,) = ax.plot([], [])
ax.set(xlim=(0, epoch), ylim=(0, 1), xlabel='Iterations', ylabel='Loss')
def update(frame):
    """Run one pass of per-sample gradient descent and extend the loss curve.

    Reads the module-level ``years``, ``prices`` and ``lr``, updates the
    module-level ``k`` and ``b``, and appends to ``losses``, ``x_data`` and
    ``y_data``.

    Args:
        frame: Frame value supplied by the animation; used as the x
            coordinate of the newly recorded loss point.

    Returns:
        A one-element tuple holding the updated line artist.
    """
    global k, b
    loss = 0
    for x, label in zip(years, prices):
        # Residual under the parameters in effect before this sample's step;
        # both parameter updates below use this same value.
        error = k * x + b - label
        loss += error ** 2
        # Gradient of the squared error: d/dk = 2 * error * x, d/db = 2 * error.
        k -= 2 * error * x * lr
        b -= 2 * error * lr
    losses.append(loss)  # keep the raw loss record
    x_data.append(frame)
    y_data.append(loss)
    line.set_data(x_data, y_data)
    # The axes start with a fixed 0-1 y-range, but the loss is a sum of
    # squared errors over all samples and can be larger; widen the range so
    # the curve stays inside the plot.
    ax.set_ylim(0, max(1, 1.05 * max(y_data)))
    return line,
def _clear_line():
    """Blank the loss curve; used as the animation's init function."""
    line.set_data([], [])
    return line,


# Without an explicit init_func, FuncAnimation draws its initial frame by
# calling `update` on the first frame value, which would train one extra pass
# and record frame 0 twice.
ani = FuncAnimation(fig, update, frames=trange(epoch), init_func=_clear_line, blit=True)
writer = FFMpegWriter(fps=30)  # requires the FFmpeg executable to be installed
ani.save('animation.mp4', writer=writer)

# Interactive prediction loop: the input year is scaled the same way as the
# training data and the prediction is scaled back to a price.
while True:
    try:
        raw_year = input("请输入年份: ")
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C ends the session instead of a traceback.
        break
    try:
        year = (float(raw_year) - 2000) / 22
    except ValueError:
        print("输入无效,请输入数字形式的年份。")
        continue
    print("预测房价: ", (k * year + b) * 60000)
在上述代码中,我们创建了一个FFMpegWriter对象,并将其作为参数传递给ani.save()函数,用于将动态图保存为.mp4格式的文件。需要注意的是,保存为.mp4格式的文件需要FFmpeg软件的支持,如果没有安装FFmpeg,需要先通过系统包管理器或者官网下载安装。如果还想在窗口中直接查看动画,可以调用plt.show()函数显示图形。
import numpy as np
import matplotlib.pyplot as plt
class MyDataset:
    """In-memory dataset of paired samples that is iterated in mini-batches.

    Iterating the dataset (``for x, y in dataset``) starts a new
    ``DataLoader`` pass, so every pass has its own cursor and, when
    ``shuffle`` is true, its own random order.

    Args:
        xs: Input values, one entry per sample.
        ys: Target values aligned with ``xs``.
        batch_size: Number of samples per batch; must be positive.
        shuffle: Whether each pass visits the samples in random order.

    Raises:
        ValueError: If ``xs`` and ``ys`` differ in length or ``batch_size``
            is not positive.
    """

    def __init__(self, xs, ys, batch_size, shuffle):
        # Batches are cut with integer-array indexing, which plain Python
        # lists do not support, so normalise the inputs to arrays up front.
        self.xs = np.asarray(xs)
        self.ys = np.asarray(ys)
        if len(self.xs) != len(self.ys):
            raise ValueError("xs and ys must contain the same number of samples")
        # A non-positive batch size would keep the loader's cursor from ever
        # reaching the end of the data.
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")
        self.shuffle = shuffle
        self.batch_size = batch_size

    def __iter__(self):
        """Start a new pass over the data."""
        return DataLoader(self)

    def __len__(self):
        """Return the number of samples."""
        return len(self.xs)
class DataLoader:
    """Single-pass batch iterator over a ``MyDataset``-like object.

    The loader reads ``xs``, ``ys``, ``batch_size`` and ``shuffle`` from the
    dataset and uses ``len(dataset)`` as the sample count. The visiting order
    is fixed when the loader is created; the last batch is shorter when the
    sample count is not a multiple of ``batch_size``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0  # position in ``indexs`` of the next unread sample
        self.indexs = np.arange(len(self.dataset))
        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        # An iterator returns itself from ``__iter__`` so that it can be used
        # directly in ``for`` loops, ``list()``, ``zip()`` and similar.
        return self

    def __next__(self):
        """Return the next ``(x, y)`` batch or raise ``StopIteration``."""
        if self.cursor >= len(self.dataset):
            raise StopIteration
        stop = self.cursor + self.dataset.batch_size
        index = self.indexs[self.cursor:stop]
        self.cursor = stop
        return self.dataset.xs[index], self.dataset.ys[index]
if __name__ == "__main__":
    # Fit price = k * year + b with mini-batch gradient descent, then answer
    # price queries typed in by the user.
    years = np.arange(2000, 2022)
    floors = np.arange(23, 1, -1)  # not used by this single-feature model
    # Normalisation by fixed constants (alternatives: z-score, min-max):
    # years become 0/22 .. 21/22 and prices are divided by their maximum.
    years = (years - 2000) / 22
    prices = np.array([
        10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000,
        19000, 22000, 24000, 23000, 26000, 35000, 30000, 40000, 45000, 52000,
        50000, 60000,
    ])
    prices = prices / 60000

    k = 1
    b = 0
    lr = 0.07
    epoch = 5000
    batch_size = 2
    shuffle = True

    dataset = MyDataset(years, prices, batch_size, shuffle)

    for e in range(epoch):
        for year, price in dataset:
            predict = k * year + b
            error = predict - price
            loss = error ** 2  # per-sample squared error, kept for inspection
            # Gradient terms of the squared error (constant factor 2 folded
            # into lr). Averaging over the batch actually received keeps a
            # shorter final batch correctly weighted when the sample count is
            # not a multiple of batch_size.
            k -= lr * np.mean(error * year)
            b -= lr * np.mean(error)

    while True:
        try:
            raw_year = input("请输入预测的年份: ")
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C ends the session instead of a traceback.
            break
        try:
            test_year = (int(raw_year) - 2000) / 22
        except ValueError:
            print("输入无效,请输入整数年份。")
            continue
        predict_price = test_year * k + b
        # Undo the price normalisation before printing.
        print(predict_price * 60000)
这是一个简单的线性回归模型,用于预测房价。模型使用梯度下降法进行训练,可以通过输入年份来预测房价。
在这个模型中,我们首先定义了一个MyDataset类,用于封装输入数据和目标数据,并实现了__iter__方法,以便在训练过程中使用DataLoader类进行数据批量加载和随机打乱。
在训练过程中,我们使用了一个双重循环,外层循环控制训练的迭代次数,内层循环用于按批次加载数据,并计算损失和更新模型参数。
最后,我们可以通过输入年份来进行房价的预测。需要注意的是,模型的预测结果可能会受到多种因素的影响,如数据样本的数量和质量、模型的复杂度和训练方法等,因此在应用模型时需要谨慎考虑。
# Forward pass: linear prediction for every sample in the current batch.
predict = k * year + b
# Per-sample squared error; the updates below do not read this value.
loss = (predict - price) ** 2
# Per-sample gradient terms of the squared error with respect to k and b
# (the constant factor 2 of the derivative is left out).
delta_k = (k * year + b - price) * year
delta_b = (k * year + b - price)
# Move each parameter against its gradient term, summed over the batch,
# divided by batch_size and scaled by the learning rate.
k -= np.sum(delta_k)/batch_size * lr
b -= np.sum(delta_b)/batch_size * lr
重点解释一下这一段
这一段代码实现了线性回归模型的训练过程。具体来说,它包括以下几个步骤:
预测值计算:根据当前模型参数 k 和 b,以及输入数据 year,计算出预测值 predict,即 predict = k * year + b。
损失计算:根据预测值 predict 和目标值 price,计算出损失 loss,即 loss = (predict - price) ** 2。
梯度计算:根据当前模型参数 k 和 b,以及输入数据 year 和目标值 price,计算出损失函数对 k 和 b 的梯度 delta_k 和 delta_b,即 delta_k = (k * year + b - price) * year,delta_b = (k * year + b - price)。严格来说,平方损失 (predict - price) ** 2 对 k 和 b 的偏导数还带有常数因子 2,代码中将其省略,相当于把它并入了学习率 lr。
参数更新:根据计算出的梯度 delta_k 和 delta_b,以及学习率 lr 和批量大小 batch_size,更新模型参数 k 和 b,即 k -= np.sum(delta_k)/batch_size * lr,b -= np.sum(delta_b)/batch_size * lr。
这个过程可以看做是利用梯度下降法对模型进行训练,通过不断迭代更新模型参数,逐步降低损失,使模型能够更好地拟合数据。需要注意的是,梯度下降法的效果受到学习率和批量大小的影响,过大或过小的学习率和批量大小都可能导致模型训练效果不佳,因此需要在实际应用中进行调参。
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FFMpegWriter, FuncAnimation
class MyDataset:
    """Paired samples held in memory and served in mini-batches.

    A ``for x, y in dataset`` loop obtains a new ``DataLoader`` each time, so
    passes do not share a cursor and, with ``shuffle`` enabled, each pass is
    reshuffled.

    Args:
        xs: Input values, one entry per sample.
        ys: Target values aligned with ``xs``.
        batch_size: Number of samples per batch; must be positive.
        shuffle: Whether each pass visits the samples in random order.

    Raises:
        ValueError: If ``xs`` and ``ys`` differ in length or ``batch_size``
            is not positive.
    """

    def __init__(self, xs, ys, batch_size, shuffle):
        # The loader selects batches with integer-array indexing, which
        # requires arrays rather than plain lists.
        self.xs = np.asarray(xs)
        self.ys = np.asarray(ys)
        if len(self.xs) != len(self.ys):
            raise ValueError("xs and ys must contain the same number of samples")
        # With a non-positive batch size the loader's cursor would never
        # advance past the end of the data.
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")
        self.shuffle = shuffle
        self.batch_size = batch_size

    def __iter__(self):
        """Start a new pass over the data."""
        return DataLoader(self)

    def __len__(self):
        """Return the number of samples."""
        return len(self.xs)
class DataLoader:
    """One pass over a ``MyDataset``-like object, delivered batch by batch.

    Reads ``xs``, ``ys``, ``batch_size`` and ``shuffle`` from the dataset and
    takes the sample count from ``len(dataset)``. The order of samples is
    decided once, at construction; the final batch may be shorter than
    ``batch_size``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0  # position in ``indexs`` of the next unread sample
        self.indexs = np.arange(len(self.dataset))
        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        # Returning ``self`` completes the iterator protocol, so the loader
        # can be handed to ``for``, ``list()``, ``zip()`` and similar.
        return self

    def __next__(self):
        """Return the next ``(x, y)`` batch or raise ``StopIteration``."""
        if self.cursor >= len(self.dataset):
            raise StopIteration
        end = self.cursor + self.dataset.batch_size
        index = self.indexs[self.cursor:end]
        self.cursor = end
        return self.dataset.xs[index], self.dataset.ys[index]
if __name__ == "__main__":
    # Animate mini-batch gradient descent: the fitted line is drawn over the
    # data points and the loss history is drawn on a secondary y-axis.
    years = np.arange(2000, 2022)
    floors = np.arange(23, 1, -1)  # not used by this single-feature model
    years = (years - 2000) / 22
    prices = np.array([
        10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000,
        19000, 22000, 24000, 23000, 26000, 35000, 30000, 40000, 45000, 52000,
        50000, 60000,
    ])
    prices = prices / 60000

    k = 1
    b = 0
    lr = 0.07
    epoch = 50
    batch_size = 2
    shuffle = True
    dataset = MyDataset(years, prices, batch_size, shuffle)

    fig, ax = plt.subplots()
    line, = ax.plot([], [], lw=2)
    scat = ax.scatter(years, prices)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    loss_ax = ax.twinx()
    loss_line, = loss_ax.plot([], [], lw=2, color='r')
    loss_history = []  # one batch loss per animation frame

    def init():
        """Clear both curves and return every artist the animation redraws."""
        line.set_data([], [])
        loss_line.set_data([], [])
        # With blit=True the init function must hand back the same artists
        # that `update` returns, not just the fitted line.
        return (line, scat, loss_line)

    def update(frame):
        """Take one gradient step on a single batch and refresh the artists."""
        global k, b
        # A new iterator is created per frame, so each frame trains on the
        # first batch of a new (shuffled) pass.
        x, y = next(iter(dataset))
        y_pred = x * k + b
        loss = np.mean((y - y_pred) ** 2)
        grad_k = np.mean((y_pred - y) * x)
        grad_b = np.mean(y_pred - y)
        k -= lr * grad_k
        b -= lr * grad_b
        print(f"frame: {frame}, k: {k:.3f}, b: {b:.3f}, loss: {loss:.3f}")
        line.set_data(years, k * years + b)
        scat.set_offsets(np.column_stack((years, prices)))
        # Plot the recorded history; drawing `frame` copies of the current
        # loss would only produce a flat line at the latest value.
        loss_history.append(loss)
        loss_line.set_data(np.arange(len(loss_history)), loss_history)
        ax.relim()
        ax.autoscale_view()
        loss_ax.relim()
        loss_ax.autoscale_view()
        return (line, scat, loss_line)

    ani = FuncAnimation(fig, update, frames=range(epoch), init_func=init, blit=True, save_count=100)
    writer = FFMpegWriter(fps=30)  # requires the FFmpeg executable to be installed
    ani.save('animation.mp4', writer=writer)
import numpy as np
class MyDataset:
    """In-memory dataset of three aligned per-sample arrays, served in batches.

    Iterating the dataset (``for x, y, z in dataset``) starts a new
    ``DataLoader`` pass, so every pass has its own cursor and, when
    ``shuffle`` is true, its own random order.

    Args:
        xs: First per-sample array.
        ys: Second per-sample array, aligned with ``xs``.
        zs: Third per-sample array, aligned with ``xs``.
        batch_size: Number of samples per batch; must be positive.
        shuffle: Whether each pass visits the samples in random order.

    Raises:
        ValueError: If the three arrays differ in length or ``batch_size``
            is not positive.
    """

    def __init__(self, xs, ys, zs, batch_size, shuffle):
        # Batches are cut with integer-array indexing, which plain Python
        # lists do not support, so normalise the inputs to arrays up front.
        self.xs = np.asarray(xs)
        self.ys = np.asarray(ys)
        self.zs = np.asarray(zs)
        if not (len(self.xs) == len(self.ys) == len(self.zs)):
            raise ValueError("xs, ys and zs must contain the same number of samples")
        # A non-positive batch size would keep the loader's cursor from ever
        # reaching the end of the data.
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")
        self.shuffle = shuffle
        self.batch_size = batch_size

    def __iter__(self):
        """Start a new pass over the data."""
        return DataLoader(self)

    def __len__(self):
        """Return the number of samples."""
        return len(self.xs)
class DataLoader:
    """Single-pass batch iterator over a three-array ``MyDataset``-like object.

    The loader reads ``xs``, ``ys``, ``zs``, ``batch_size`` and ``shuffle``
    from the dataset and uses ``len(dataset)`` as the sample count. The
    visiting order is fixed when the loader is created; the last batch is
    shorter when the sample count is not a multiple of ``batch_size``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0  # position in ``indexs`` of the next unread sample
        self.indexs = np.arange(len(self.dataset))
        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        # An iterator returns itself from ``__iter__`` so that it can be used
        # directly in ``for`` loops, ``list()``, ``zip()`` and similar.
        return self

    def __next__(self):
        """Return the next ``(x, y, z)`` batch or raise ``StopIteration``."""
        if self.cursor >= len(self.dataset):
            raise StopIteration
        stop = self.cursor + self.dataset.batch_size
        index = self.indexs[self.cursor:stop]
        self.cursor = stop
        data = self.dataset
        return data.xs[index], data.ys[index], data.zs[index]
# Two input features (year, floor) and one target (price), each scaled by a
# fixed constant before training.
years = (np.arange(2000, 2022) - 2000) / 22
floors = np.arange(23, 1, -1) / 23
prices = np.array([
    10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000,
    19000, 22000, 24000, 23000, 26000, 35000, 30000, 40000, 45000, 52000,
    50000, 60000,
]) / 60000

# Hyper-parameters and initial weights of price = k1 * year + k2 * floor + b.
lr = 0.05
epoch = 10000
k1, k2, b = 1, -1, 0
batch_size = 8

dataset = MyDataset(years, floors, prices, batch_size, True)

for e in range(epoch):
    for year, floor, price in dataset:
        predict = k1 * year + k2 * floor + b
        residual = predict - price
        loss = np.sum(residual ** 2)
        # All three gradient terms are taken from the same residual before
        # any weight is changed.
        delta_k1 = np.sum(residual * year)
        delta_k2 = np.sum(residual * floor)
        delta_b = np.sum(residual)
        k1 = k1 - lr * delta_k1
        k2 = k2 - lr * delta_k2
        b = b - lr * delta_b
    # Report the summed squared error of the epoch's last batch.
    if e % 100 == 0:
        print(loss)
这是一个多变量线性回归的示例代码,使用了三个变量years、floors和prices,其中years和floors是输入特征,prices是要预测的目标值。
在每个epoch中,迭代整个数据集,计算预测值和损失,并更新模型参数k1、k2和b。在这个示例中,使用的优化算法是梯度下降法,学习率为lr。
其中,MyDataset类和DataLoader类用于实现数据集的迭代器,可以方便地使用for循环迭代数据集中的每个batch。MyDataset类的初始化函数接收第一个输入特征xs、第二个输入特征ys、目标值zs、batch大小batch_size和是否随机打乱数据集shuffle等参数。DataLoader类的初始化函数接收一个MyDataset对象,并根据shuffle参数打乱数据集的索引,以便按照随机顺序迭代数据集。__next__()方法实现了每次迭代返回一个batch的功能,其中使用了self.cursor变量来记录当前迭代到的位置。
整个代码实现了一个简单的多变量线性回归模型,可以用于预测房价等问题。
首先,我们可以在每个epoch结束时记录下模型的损失值,然后使用Matplotlib将损失值随时间的变化进行可视化。可以在代码中添加以下代码来记录损失值:
# Collects one loss value per epoch for plotting later.
losses = []
for e in range(epoch):
# ... (the training steps of this epoch, which compute `loss`, go here)
losses.append(loss)
# ... (any remaining per-epoch work, e.g. periodic printing)
然后,可以使用Matplotlib来绘制损失随时间的变化曲线。可以在代码的最后添加以下代码:
import matplotlib.pyplot as plt

# Plot the recorded loss values against their index. The x-axis is sized from
# `losses` itself, so the plot still works when the number of recorded values
# differs from `epoch` (for example after stopping training early).
plt.plot(np.arange(len(losses)), losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()
这将绘制一个损失随时间变化的曲线,横轴是epoch数,纵轴是损失值。可以用这个曲线来监视模型的训练过程,以及选择合适的epoch数来停止训练,以避免过拟合或欠拟合。
完整的可视化代码如下:
import numpy as np
import matplotlib.pyplot as plt
class MyDataset:
    """Three aligned per-sample arrays held in memory and served in batches.

    A ``for x, y, z in dataset`` loop obtains a new ``DataLoader`` each time,
    so passes do not share a cursor and, with ``shuffle`` enabled, each pass
    is reshuffled.

    Args:
        xs: First per-sample array.
        ys: Second per-sample array, aligned with ``xs``.
        zs: Third per-sample array, aligned with ``xs``.
        batch_size: Number of samples per batch; must be positive.
        shuffle: Whether each pass visits the samples in random order.

    Raises:
        ValueError: If the three arrays differ in length or ``batch_size``
            is not positive.
    """

    def __init__(self, xs, ys, zs, batch_size, shuffle):
        # The loader selects batches with integer-array indexing, which
        # requires arrays rather than plain lists.
        self.xs = np.asarray(xs)
        self.ys = np.asarray(ys)
        self.zs = np.asarray(zs)
        if not (len(self.xs) == len(self.ys) == len(self.zs)):
            raise ValueError("xs, ys and zs must contain the same number of samples")
        # With a non-positive batch size the loader's cursor would never
        # advance past the end of the data.
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")
        self.shuffle = shuffle
        self.batch_size = batch_size

    def __iter__(self):
        """Start a new pass over the data."""
        return DataLoader(self)

    def __len__(self):
        """Return the number of samples."""
        return len(self.xs)
class DataLoader:
    """One pass over a three-array ``MyDataset``-like object, batch by batch.

    Reads ``xs``, ``ys``, ``zs``, ``batch_size`` and ``shuffle`` from the
    dataset and takes the sample count from ``len(dataset)``. The order of
    samples is decided once, at construction; the final batch may be shorter
    than ``batch_size``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0  # position in ``indexs`` of the next unread sample
        self.indexs = np.arange(len(self.dataset))
        if self.dataset.shuffle:
            np.random.shuffle(self.indexs)

    def __iter__(self):
        # Returning ``self`` completes the iterator protocol, so the loader
        # can be handed to ``for``, ``list()``, ``zip()`` and similar.
        return self

    def __next__(self):
        """Return the next ``(x, y, z)`` batch or raise ``StopIteration``."""
        if self.cursor >= len(self.dataset):
            raise StopIteration
        end = self.cursor + self.dataset.batch_size
        index = self.indexs[self.cursor:end]
        self.cursor = end
        source = self.dataset
        return source.xs[index], source.ys[index], source.zs[index]
# Two input features (year, floor) and one target (price), each scaled by a
# fixed constant before training.
years = np.arange(2000, 2022)
years = (years - 2000) / 22
floors = np.arange(23, 1, -1)
floors = floors / 23
prices = np.array([
    10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000,
    19000, 22000, 24000, 23000, 26000, 35000, 30000, 40000, 45000, 52000,
    50000, 60000,
])
prices = prices / 60000

# Hyper-parameters and initial weights of price = k1 * year + k2 * floor + b.
lr = 0.05
epoch = 10000
k1 = 1
k2 = -1
b = 0
batch_size = 8

dataset = MyDataset(years, floors, prices, batch_size, True)

losses = []
for e in range(epoch):
    squared_error_total = 0.0
    for year, floor, price in dataset:
        predict = k1 * year + k2 * floor + b
        error = predict - price
        squared_error_total += np.sum(error ** 2)
        # All three gradient terms come from the same residual, computed
        # before any weight is changed.
        delta_k1 = np.sum(error * year)
        delta_k2 = np.sum(error * floor)
        delta_b = np.sum(error)
        k1 -= lr * delta_k1
        k2 -= lr * delta_k2
        b -= lr * delta_b
    # Record the mean squared error over every sample seen this epoch. The
    # last batch alone is shorter than the others (22 samples, batches of 8)
    # and shuffled, so its summed error is a noisy, differently scaled value.
    loss = squared_error_total / len(dataset)
    losses.append(loss)
    if e % 100 == 0:
        print(loss)

# Size the x-axis from the recorded values so x and y always match in length.
plt.plot(np.arange(len(losses)), losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()