NLP Task: Predicting the Last Word
Contents
1. Load the pretrained model
2. Load the dataset from local disk
3. Dataset processing
4. Downstream task model
5. Test code
6. Training code
7. Save the trained model
8. Load the saved model
1. Load the pretrained model
# Load the pretrained model
from transformers import AutoTokenizer

# Pretrained model: distilgpt2
# use_fast=True: use the Rust-based tokenizer, which is much faster than the pure-Python one
tokenizer = AutoTokenizer.from_pretrained(r'../data/model/distilgpt2/', use_fast=True)
tokenizer.batch_encode_plus([
    'hide new secretions from the parental units',
    'this moive is great'
])
# Output:
#{'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [5661, 6941, 425, 318, 1049]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}
print(tokenizer)
GPT2TokenizerFast(name_or_path='../data/model/distilgpt2/', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={ 50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True), }
# Predicting the last word is a multi-class classification problem:
# with vocab_size=50257, there are 50257 possible classes
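As a quick sanity check, the IDs above can be mapped back to text through the same tokenizer; a minimal sketch (the exact subword strings depend on GPT-2's byte-level BPE vocabulary):
ids = tokenizer.batch_encode_plus(['hide new secretions from the parental units'])['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(ids))  # subword tokens; GPT-2 marks a leading space with 'Ġ'
print(tokenizer.decode(ids))                 # decodes back to the original sentence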
2. Load the dataset from local disk
from datasets import load_from_disk
dataset = load_from_disk(r'E:/ALOT/10_deep_learning/data/datasets/glue_sst2/')
dataset
DatasetDict({ train: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 67349 }) validation: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 872 }) test: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 1821 }) })
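It can help to peek at one raw example before any processing; a minimal sketch (only the field names are taken from the repr above, the actual text and label depend on the SST-2 copy on disk):
print(dataset['train'][0])  # a dict with the keys 'sentence', 'label' and 'idx'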
3. Dataset processing
# To predict the next word we only need the 'sentence' column; 'label' and 'idx' are not used
# Use map() to tokenize, so that only the tokenized fields remain
def f(dataset, tokenizer):
    return tokenizer.batch_encode_plus(dataset['sentence'])

# num_proc=8: set this to the number of logical processors (Task Manager -> Performance -> Logical processors)
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=8,
                      remove_columns=['sentence', 'label', 'idx'],
                      fn_kwargs={'tokenizer': tokenizer})
dataset
DatasetDict({ train: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 67349 }) validation: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 872 }) test: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 1821 }) })
# Require every sentence to have at least 8 tokens
# Filter out sentences that are too short
def f_1(dataset):
    return [len(i) >= 8 for i in dataset['input_ids']]  # i is one tokenized sentence

dataset = dataset.filter(f_1, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({ train: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 39905 }) validation: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 848 }) test: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 1730 }) })
# Truncate every sentence to 8 tokens
def f_2(dataset):
    # Truncate input_ids and rebuild the matching attention_mask
    dataset['input_ids'] = [i[:8] for i in dataset['input_ids']]
    dataset['attention_mask'] = [[1] * 8] * len(dataset['attention_mask'])
    # To predict the last word, the first seven tokens are the input and the eighth is the target.
    # The model applies the one-token shift itself, so labels can simply equal input_ids here.
    dataset['labels'] = dataset['input_ids']
    return dataset

dataset = dataset.map(f_2, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({ train: Dataset({ features: ['input_ids', 'attention_mask', 'labels'], num_rows: 39905 }) validation: Dataset({ features: ['input_ids', 'attention_mask', 'labels'], num_rows: 848 }) test: Dataset({ features: ['input_ids', 'attention_mask', 'labels'], num_rows: 1730 }) })
dataset['train'][0]
{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [24717, 649, 3200, 507, 422, 262, 21694, 4991]}
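Decoding this processed sample confirms that the eight retained IDs still read as ordinary text; a minimal sketch:
print(tokenizer.decode(dataset['train'][0]['input_ids']))  # the 8-token prefix of the original sentence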
# Define the data loader
import torch
# default_data_collator: collates individual examples into batches
from transformers.data.data_collator import default_data_collator

loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],   # use the training split
    batch_size=16,
    collate_fn=default_data_collator,
    shuffle=True,
    drop_last=True              # drop the last batch if it has fewer than batch_size examples
)

for data in loader:
    break

len(loader), data
(2494, {'input_ids': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993], [ 5832, 651, 262, 10647, 326, 6260, 290, 3437], [ 11, 645, 530, 460, 3285, 345, 3013, 382], [48580, 257, 2612, 290, 3950, 326, 36675, 262], [ 361, 345, 467, 287, 6970, 326, 837, 345], [ 270, 705, 82, 257, 4950, 2646, 837, 1336], [ 71, 1794, 6819, 837, 26996, 6819, 6776, 837], [11246, 7650, 30669, 13766, 17548, 351, 6159, 220], [ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358], [22437, 299, 470, 1612, 340, 705, 82, 922], [ 270, 705, 82, 23056, 284, 766, 257, 3807], [ 5832, 1244, 892, 339, 373, 2491, 329, 2607], [ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311], [ 8505, 837, 1312, 11691, 340, 705, 82, 14081], [ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848], [26535, 867, 14138, 290, 41169, 12, 44517, 2628]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993], [ 5832, 651, 262, 10647, 326, 6260, 290, 3437], [ 11, 645, 530, 460, 3285, 345, 3013, 382], [48580, 257, 2612, 290, 3950, 326, 36675, 262], [ 361, 345, 467, 287, 6970, 326, 837, 345], [ 270, 705, 82, 257, 4950, 2646, 837, 1336], [ 71, 1794, 6819, 837, 26996, 6819, 6776, 837], [11246, 7650, 30669, 13766, 17548, 351, 6159, 220], [ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358], [22437, 299, 470, 1612, 340, 705, 82, 922], [ 270, 705, 82, 23056, 284, 766, 257, 3807], [ 5832, 1244, 892, 339, 373, 2491, 329, 2607], [ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311], [ 8505, 837, 1312, 11691, 340, 705, 82, 14081], [ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848], [26535, 867, 14138, 290, 41169, 12, 44517, 2628]])})
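What default_data_collator does can also be reproduced by hand; a minimal sketch that collates two examples (all examples are exactly 8 tokens long after truncation, so stacking into tensors works):
batch = default_data_collator([dataset['train'][0], dataset['train'][1]])
print(batch['input_ids'].shape, batch['labels'].shape)  # torch.Size([2, 8]) torch.Size([2, 8])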
4. Downstream task model
from transformers import AutoModelForCausalLM, GPT2Model

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = GPT2Model.from_pretrained('../data/model/distilgpt2/')
        self.fc = torch.nn.Linear(768, tokenizer.vocab_size, bias=False)
        # Load the full causal-LM checkpoint to copy its pretrained output-head weights
        parameters = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
        # Initialise the fully connected layer with the pretrained lm_head weights
        self.fc.load_state_dict(parameters.lm_head.state_dict())
        # A more compact way to load the same weights (forward() would then need to use that model's own lm_head):
        # self.pretrained = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)
        loss = None
        if labels is not None:
            # The inputs and labels are offset from each other by one position
            # Keep the batch and vocab dimensions, drop the last time step
            shift_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size)  # reshape(-1, ...) merges the batch and time dims
            # Keep the batch dimension, drop the first time step
            shift_labels = labels[:, 1:].reshape(-1)  # flatten the 2-D tensor to 1-D
            loss = self.criterion(shift_logits, shift_labels)
        return {'loss': loss, 'logits': logits}
model = Model()
# Parameter count
print(sum(i.numel() for i in model.parameters()) / 10000)  # divide by 10000 to report in units of 10,000
12050.9952
# In Python, ** unpacks a dict into keyword arguments, so model(**data) passes every key as key=value
out = model(**data)
# print(out)  # a dict with 'loss' and 'logits'
print(out['loss'], out['logits'].shape)
tensor(6.2742, grad_fn=<NllLossBackward0>) torch.Size([16, 8, 50257])
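The one-token shift inside Model.forward() can be hard to visualise; a toy sketch with a made-up 5-word vocabulary and a 4-token sequence shows how the logits at position i end up scored against token i+1:
toy_logits = torch.randn(1, 4, 5)          # one distribution over the toy vocabulary per position
toy_labels = torch.tensor([[2, 4, 1, 3]])  # labels are just the input ids, as in f_2 above
shift_logits = toy_logits[:, :-1].reshape(-1, 5)  # predictions made at positions 0..2
shift_labels = toy_labels[:, 1:].reshape(-1)      # the tokens that actually follow: positions 1..3
print(torch.nn.functional.cross_entropy(shift_logits, shift_labels))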
5. Test code
# Test code
def test(model):
    model.eval()
    # Load the test data
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        collate_fn=default_data_collator,
        shuffle=True,   # shuffling is optional at evaluation time
        drop_last=True
    )
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # Only the accuracy on the last word is measured
        label = data['input_ids'][:, -1].clone()  # clone() so the original data is not modified
        # Blank out the last word so the model cannot cheat
        data['input_ids'][:, -1] = 0  # set the whole last column to 0
        # Labels are not needed here either; zero them out
        data['labels'][:, :] = 0
        # Forward pass
        with torch.no_grad():  # no gradients are needed at evaluation time (only during training)
            # equivalent to out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            out = model(**data)
        # Because of the one-token shift, the prediction for the last word sits at the second-to-last position
        out = out['logits'].argmax(dim=2)[:, -2]
        # .item() extracts a plain Python number from a single-element (0-dim) tensor
        correct += (label == out).sum().item()
        total += 16  # 16 is the batch_size
        if i % 10 == 0:  # every 10 batches
            print(i)
            print(label)
            print(out)
        if i == 50:  # evaluate on 50 batches only
            break
    print('accuracy: ', correct / total)
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()
test(model)
0 tensor([ 416, 42738, 7297, 2709, 651, 837, 290, 349, 290, 11815, 72, 14505, 7559, 532, 3822, 262]) tensor([ 11, 262, 3807, 5000, 1064, 13, 11, 453, 13, 13664, 72, 983, 340, 13, 12, 262]) 10 tensor([ 284, 428, 991, 705, 318, 11783, 1787, 65, 43527, 2306, 460, 8395, 743, 6386, 2370, 393]) tensor([ 284, 262, 262, 447, 11, 11, 76, 65, 38520, 20706, 318, 502, 468, 6386, 4899, 1022]) 20 tensor([ 7357, 1936, 4572, 2465, 1049, 257, 7358, 262, 29963, 2646, 517, 290, 9188, 1647, 278, 1241]) tensor([7357, 584, 290, 1143, 649, 262, 2656, 262, 7051, 11, 257, 284, 262, 12, 278, 1210]) 30 tensor([14969, 5239, 3016, 837, 43207, 262, 764, 4129, 307, 262, 705, 465, 262, 837, 1100, 2700]) tensor([ 67, 5239, 12302, 13, 2033, 262, 284, 1988, 307, 262, 1053, 262, 262, 13, 1621, 2700]) 40 tensor([21730, 13770, 2737, 264, 477, 2218, 262, 257, 340, 8886, 848, 14821, 1178, 705, 787, 1239]) tensor([ 13, 290, 2737, 670, 262, 290, 262, 340, 6, 983, 257, 262, 1438, 460, 787, 318]) 50 tensor([ 262, 6840, 763, 286, 611, 286, 475, 2915, 764, 837, 4379, 12986, 10997, 272, 257, 1200]) tensor([ 262, 651, 318, 1683, 621, 286, 475, 12, 284, 837, 257, 262, 10997, 272, 257, 262]) accuracy: 0.18382352941176472 automatically pegs itself for the the if there 's a way to effectively get while benigni ( who stars and co is the stupidest , most insulting movie of ever the film might have been more satisfying if than a journey through memory , a celebration of of the story may not be new , but but there are touching moments in eto iles -
6. Training code
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
from transformers import AdamW
from transformers.optimization import get_scheduler
# Training code
def train():
    # Optimiser: the algorithm that drives gradient descent
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Learning-rate schedule
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,              # no warm-up steps
                              num_training_steps=len(loader),  # one step per batch in the training loader
                              optimizer=optimizer)
    # Move the model to the device
    model.to(device)
    model.train()  # switch to training mode
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # The loss is already computed inside the model and returned in the output dict
        loss = out['loss']
        # Backpropagate the loss
        loss.backward()
        # Clip the gradients to keep training stable (clipping must come after backward() and before step());
        # max_norm is the constant c in the clipping formula
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        # Update the parameters
        optimizer.step()
        scheduler.step()
        # Reset the gradients
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:  # every 50 batches
            labels = labels[:, 1:]
            # out['logits'] is a 3-D tensor; take the argmax over the vocabulary dimension (dim=2)
            out = out['logits'].argmax(dim=2)[:, :-1]
            correct = (labels == out).sum().item()  # .item() turns the 0-dim tensor into a Python number
            accuracy = correct / (16 * 7)  # 16 examples per batch, 7 predictions each (8 tokens minus the 1-token shift)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)
train()
0 6.9149088859558105 0.14285714285714285 1.9991980753809144e-05
50 5.734440803527832 0.1875 1.959101844426624e-05
100 5.432187557220459 0.1875 1.919005613472334e-05
150 5.808253765106201 0.16964285714285715 1.8789093825180436e-05
200 5.217792510986328 0.16071428571428573 1.838813151563753e-05
250 5.223909854888916 0.20535714285714285 1.7987169206094627e-05
...
2250 5.031280040740967 0.14285714285714285 1.948676824378509e-06
2300 4.822522163391113 0.2857142857142857 1.5477145148356058e-06
2350 4.803909778594971 0.25 1.1467522052927025e-06
2400 4.606936931610107 0.26785714285714285 7.457898957497996e-07
2450 4.976705074310303 0.24107142857142858 3.4482758620689656e-07
7. Save the trained model
torch.save(model, '../data/预测最后一个词模型.model')
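torch.save(model, ...) pickles the whole Model object, which ties the checkpoint to this exact class definition. A common alternative (a sketch, not what is done above; the filename is made up) is to save only the weights and restore them into a fresh Model():
torch.save(model.state_dict(), '../data/last_word_state_dict.pt')   # hypothetical filename
model_restored = Model()
model_restored.load_state_dict(torch.load('../data/last_word_state_dict.pt', map_location='cpu'))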
8. Load the saved model
# Load the model saved above
# Note: the model was trained on the GPU, so map_location='cpu' is needed to load it onto the CPU
model2 = torch.load('../data/预测最后一个词模型.model', map_location='cpu')
test(model2)
0 tensor([ 627, 616, 1486, 4608, 290, 38132, 880, 262, 3900, 336, 890, 428, 764, 428, 837, 1377]) tensor([ 286, 262, 318, 3988, 837, 3807, 257, 262, 705, 14397, 3807, 262, 621, 262, 290, 326]) 10 tensor([11815, 326, 663, 7464, 340, 3898, 287, 82, 257, 546, 3281, 262, 16631, 3807, 428, 2646]) tensor([7635, 286, 663, 3807, 286, 3898, 287, 82, 257, 257, 1146, 262, 837, 837, 262, 2646]) 20 tensor([ 546, 1312, 307, 340, 262, 422, 1049, 11648, 640, 1267, 2089, 1683, 800, 1502, 355, 475]) tensor([ 345, 340, 307, 340, 262, 837, 2646, 3807, 262, 1267, 2089, 705, 6275, 262, 355, 475]) 30 tensor([ 82, 19377, 764, 837, 3316, 1751, 508, 809, 1621, 3755, 12, 4681, 2071, 1039, 48133, 290]) tensor([ 82, 257, 326, 837, 2860, 290, 508, 705, 2646, 2567, 48133, 20170, 555, 1039, 48133, 290]) 40 tensor([ 3704, 705, 2589, 36138, 534, 503, 262, 20024, 8591, 290, 788, 3923, 3807, 8925, 1128, 764]) tensor([ 898, 423, 1218, 772, 534, 345, 262, 20024, 262, 290, 257, 1692, 2646, 2568, 837, 837]) 50 tensor([ 837, 5581, 764, 21981, 287, 12, 4379, 318, 705, 286, 2962, 10997, 3146, 717, 764, 1165]) tensor([ 837, 3159, 326, 257, 837, 288, 257, 318, 705, 286, 21452, 10512, 1438, 262, 326, 257]) accuracy: 0.25612745098039214 no movement , no yuks , , whether seen on a 10-inch television screen a taut , intelligent psychological drama . that there are times when a rumor of angels a greene delivers a typically solid performance in , some movies are like a tasty hors - d the visuals alone make metropolis worth seeing a an experience so engrossing it is is
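Finally, the reloaded model can be tried on a new sentence: feed the first seven tokens and read off the arg-max prediction for the eighth. A minimal sketch (the prompt is made up and the predicted word is not guaranteed to be sensible):
text = 'this movie is one of the best films'                     # hypothetical prompt
ids = tokenizer(text, return_tensors='pt')['input_ids'][:, :7]   # keep only the first 7 tokens
with torch.no_grad():
    out = model2(input_ids=ids, attention_mask=torch.ones_like(ids))
pred_id = out['logits'][:, -1].argmax(dim=-1)                    # prediction for the 8th token
print(tokenizer.decode(pred_id))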