李宏毅2022深度学习作业代码记录(hw1)—— COVID19
本栏目用于记录李宏毅教授 2022年度深度学习的 作业代码理解
此次为homework1,COVID19的阳性率预测,其中训练数据共有117个特征及一个label。
import math #提供浮点数运算函数
import numpy as np #支持高维数组与矩阵运算
import pandas as pd #读取excel文件
import os #提供与操作系统交互的接口 如创建目录
import csv #处理csv格式的文件
from tqdm import tqdm #显示进度条
import torch #激活函数、对tensor的操作
import torch.nn as nn #神经网络层
from torch.utils.data import DataLoader,Dataset,random_split #数据及操作
from tensorboardX import SummaryWriter #创建文件并写入事件,以使用tensorboard可视化
"""设置随机种子"""
def same_seed(seed):
    """Make training reproducible by seeding every RNG in use."""
    np.random.seed(seed)            # numpy RNG
    torch.manual_seed(seed)         # CPU RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # GPU RNGs (one per device)
    # Pin cuDNN to deterministic kernels rather than auto-tuned ones.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
"""从训练集中划分出验证集——优化函数"""
def train_valid_spilt(data_set, valid_ratio, seed):
    """Randomly partition `data_set` into training and validation subsets.

    `valid_ratio` is the fraction held out for validation; the split is
    reproducible because random_split is driven by a generator seeded
    with `seed`. Returns (train, valid) as numpy arrays.
    """
    n_valid = int(valid_ratio * len(data_set))
    n_train = len(data_set) - n_valid
    gen = torch.Generator().manual_seed(seed)
    train_subset, valid_subset = random_split(data_set, [n_train, n_valid],
                                              generator=gen)
    return np.array(train_subset), np.array(valid_subset)
"""从117个特征中,选择合适的特征"""
def select_feat(train_data, valid_data, test_data, select_all=True):
    """Split arrays into features/labels and pick the feature columns.

    The last column of train/valid is the label; test_data carries no
    label. Returns (x_train, x_valid, x_test, y_train, y_valid).
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid = train_data[:, :-1], valid_data[:, :-1]
    raw_x_test = test_data

    if select_all:
        # Keep every feature column.
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = [0, 1, 2, 3, 4]  # hand-picked columns; adjust as needed

    return (raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx],
            raw_x_test[:, feat_idx], y_train, y_valid)
"""构建Dataset(数据集)"""
# 继承pytorch库中的Dataset
class COVID19Dataset(Dataset):
    """Dataset for the COVID-19 positivity-rate regression task.

    When `targets` is None the dataset is in prediction mode and yields
    features only; otherwise it yields (features, target) pairs.
    """

    def __init__(self, features, targets=None):
        self.features = torch.FloatTensor(features)
        self.targets = None if targets is None else torch.FloatTensor(targets)

    def __getitem__(self, idx):
        # One sample per call; shape of the tuple depends on the mode.
        x = self.features[idx]
        return x if self.targets is None else (x, self.targets[idx])

    def __len__(self):
        # Number of samples.
        return len(self.features)
"""构建神经网络(继承nn.Module类)"""
class My_Model(nn.Module):
    """Three-layer fully connected regression net: input -> 16 -> 8 -> 1."""

    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # Linear/ReLU stack ending in a single scalar output per sample.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
        )

    def forward(self, x):
        # (B, 1) -> (B): drop the trailing singleton dimension.
        return self.layers(x).squeeze(1)
"""参数设置"""
# Train on GPU when available, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 8484848,          # random seed for reproducibility
    'select_all': True,       # use all feature columns
    'valid_ratio': 0.2,       # fraction of training data held out for validation
    'n_epochs': 3000,         # maximum training epochs
    'batch_size': 256,
    'learning_rate': 1e-5,
    'early_stop': 400,        # stop after this many epochs without improvement
    # Raw string so the Windows-path backslashes are never treated as
    # escape sequences (plain '\p', '\m' raise SyntaxWarning on modern Python).
    'save_path': r'D:\python code\deep learning\models\model.ckpt'
}
"""训练过程"""
def trainer(train_loader, valid_loader, model, config, device):
    """Train `model` with SGD + MSE loss, keeping the best checkpoint.

    Each epoch runs a full training pass then a validation pass; the model
    with the lowest validation loss is saved to config['save_path'].
    Training halts after config['early_stop'] consecutive epochs without
    improvement, or after config['n_epochs'] epochs.
    """
    criterion = nn.MSELoss(reduction='mean')  # mean squared error
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'],
                                momentum=0.9)
    writer = SummaryWriter()  # tensorboard event writer
    # Create the checkpoint directory; exist_ok avoids a race with os.path.isdir.
    os.makedirs('./models', exist_ok=True)

    n_epochs = config['n_epochs']
    best_loss = math.inf      # lowest validation loss seen so far
    step = 0                  # global optimizer-step counter (tensorboard x-axis)
    early_stop_count = 0

    try:
        for epoch in range(n_epochs):
            # ---- training pass ----
            model.train()
            loss_record = []
            # tqdm wraps the dataloader for a live progress bar.
            train_pbar = tqdm(train_loader, position=0, leave=True)
            for x, y in train_pbar:
                optimizer.zero_grad()
                x, y = x.to(device), y.to(device)
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()
                step += 1
                # .detach() drops the graph; .item() yields a Python float.
                loss_record.append(loss.detach().item())
                train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': loss.detach().item()})
            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            # ---- validation pass ----
            model.eval()
            loss_record = []
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():  # gradients are unnecessary here
                    pred = model(x)
                    loss = criterion(pred, y)
                loss_record.append(loss.item())
            mean_valid_loss = sum(loss_record) / len(loss_record)
            print(f'Epoch [{epoch + 1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            # Keep only the checkpoint with the best validation loss.
            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1
            # Give up once the model has stopped improving.
            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Flush and release the tensorboard writer on every exit path
        # (the original leaked it on both early stop and normal completion).
        writer.close()
"""Dataloader"""
same_seed(config['seed']) #设置种子
# 读取并划分数据
# dataframe.values:返回给定df的numpy形式
train_data = pd.read_csv(r'E:\python 资料\李宏毅 深度学习 2021-2023\05 李宏毅机器学习\2022 ML\作业\HW1\covid.train_new.csv').values
test_data = pd.read_csv(r'E:\python 资料\李宏毅 深度学习 2021-2023\05 李宏毅机器学习\2022 ML\作业\HW1\covid.test_un.csv').values
train_data,valid_data = train_valid_spilt(train_data,config['valid_ratio'],config['seed'])
print(f"""train_data size: {train_data.shape}
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}""")
# 选择特征
x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])
print(f'the number of features:{x_train.shape[1]}')
# 构造数据集(横向拼接)
train_dataset = COVID19Dataset(x_train, y_train)
valid_dataset = COVID19Dataset(x_valid, y_valid)
test_dataset = COVID19Dataset(x_test)
# 构造dataloader
# batch_size:每个batch有多少个样本 shuffle:在每个epoch开始的时候, 是否对数据进行重新打乱
train_loader = DataLoader(train_dataset,batch_size=config['batch_size'],shuffle=True,pin_memory=True)
valid_loader = DataLoader(valid_dataset,batch_size=config['batch_size'],shuffle=True,pin_memory=True)
test_loader = DataLoader(test_dataset,batch_size=config['batch_size'],shuffle=False,pin_memory=True)
"""开始训练!"""
model = My_Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader,valid_loader,model,config,device)
"""预测"""
def predict(test_loader, model, device):
    """Run `model` over `test_loader` and return all predictions as a numpy array."""
    model.eval()  # inference mode: no dropout / batchnorm updates
    batch_preds = []
    for x in tqdm(test_loader):  # progress bar over test batches
        x = x.to(device)
        with torch.no_grad():  # gradients are unnecessary at inference time
            out = model(x)
        # Move to CPU so the tensor can be converted to numpy later.
        batch_preds.append(out.detach().cpu())
    # Concatenate the per-batch tensors along the batch axis (dim 0).
    return torch.cat(batch_preds, dim=0).numpy()
def save_pred(preds, file):
    """Write predictions to a csv file with header ['id', 'tested_positive'].

    Args:
        preds: iterable of predicted values, one per test sample (row id is
            the enumeration index).
        file: destination csv path.
    """
    # newline='' is required by the csv module; without it csv.writer
    # emits a blank line after every row on Windows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])
# Run prediction on the test set with the best saved checkpoint.
model = My_Model(input_dim=x_train.shape[1]).to(device)  # fresh model instance
model.load_state_dict(torch.load(config['save_path']))  # load best weights saved by trainer
preds = predict(test_loader,model,device)
save_pred(preds,'pred.csv')  # write the submission file