NLP Task: Predicting the Last Word
Contents
1. Load the pretrained model
2. Load the dataset from local disk
3. Dataset processing
4. Downstream task model
5. Test code
6. Training code
7. Save the trained model
8. Load the saved model
1. Load the pretrained model
# Load the pretrained model
from transformers import AutoTokenizer

# Pretrained model: distilgpt2
# use_fast=True: use the Rust-based tokenizer, which is much faster than the pure-Python one
tokenizer = AutoTokenizer.from_pretrained(r'../data/model/distilgpt2/', use_fast=True)
tokenizer.batch_encode_plus([
    'hide new secretions from the parental units',
    'this moive is great'
])
# Output:
#{'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [5661, 6941, 425, 318, 1049]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}
print(tokenizer)
GPT2TokenizerFast(name_or_path='../data/model/distilgpt2/', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={ 50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True), }
# Predicting the last word is a multi-class classification problem:
# with vocab_size=50257, there are 50257 possible classes
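As a quick sanity check, the IDs above can be mapped back to text through the same tokenizer; a minimal sketch (the exact subword strings depend on GPT-2's byte-level BPE vocabulary):
ids = tokenizer.batch_encode_plus(['hide new secretions from the parental units'])['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(ids))  # subword tokens; GPT-2 marks a leading space with 'Ġ'
print(tokenizer.decode(ids))                 # decodes back to the original sentence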
2. Load the dataset from local disk
from datasets import load_from_disk
dataset = load_from_disk(r'E:/ALOT/10_deep_learning/data/datasets/glue_sst2/')
dataset
DatasetDict({ train: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 67349 }) validation: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 872 }) test: Dataset({ features: ['sentence', 'label', 'idx'], num_rows: 1821 }) })
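It can help to peek at one raw example before any processing; a minimal sketch (only the field names are taken from the repr above, the actual text and label depend on the SST-2 copy on disk):
print(dataset['train'][0])  # a dict with the keys 'sentence', 'label' and 'idx'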
3. Dataset processing
# To predict the next word we only need the 'sentence' column; 'label' and 'idx' are not used
# Use map() to tokenize, so that only the tokenized fields remain
def f(dataset, tokenizer):
    return tokenizer.batch_encode_plus(dataset['sentence'])

# num_proc=8: set this to the number of logical processors (Task Manager -> Performance -> Logical processors)
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=8,
                      remove_columns=['sentence', 'label', 'idx'],
                      fn_kwargs={'tokenizer': tokenizer})
dataset
DatasetDict({ train: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 67349 }) validation: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 872 }) test: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 1821 }) })
# Require every sentence to have at least 8 tokens
# Filter out sentences that are too short
def f_1(dataset):
    return [len(i) >= 8 for i in dataset['input_ids']]  # i is one tokenized sentence

dataset = dataset.filter(f_1, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({ train: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 39905 }) validation: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 848 }) test: Dataset({ features: ['input_ids', 'attention_mask'], num_rows: 1730 }) })
# Truncate every sentence to 8 tokens
def f_2(dataset):
    # Truncate input_ids and rebuild the matching attention_mask
    dataset['input_ids'] = [i[:8] for i in dataset['input_ids']]
    dataset['attention_mask'] = [[1] * 8] * len(dataset['attention_mask'])
    # To predict the last word, the first seven tokens are the input and the eighth is the target.
    # The model applies the one-token shift itself, so labels can simply equal input_ids here.
    dataset['labels'] = dataset['input_ids']
    return dataset

dataset = dataset.map(f_2, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({ train: Dataset({ features: ['input_ids', 'attention_mask', 'labels'], num_rows: 39905 }) validation: Dataset({ features: ['input_ids', 'attention_mask', 'labels'], num_rows: 848 }) test: Dataset({ features: ['input_ids', 'attention_mask', 'labels'], num_rows: 1730 }) })
dataset['train'][0]
{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [24717, 649, 3200, 507, 422, 262, 21694, 4991]}
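Decoding this processed sample confirms that the eight retained IDs still read as ordinary text; a minimal sketch:
print(tokenizer.decode(dataset['train'][0]['input_ids']))  # the 8-token prefix of the original sentence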
# Define the data loader
import torch
# default_data_collator: collates individual examples into batches
from transformers.data.data_collator import default_data_collator

loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],   # use the training split
    batch_size=16,
    collate_fn=default_data_collator,
    shuffle=True,
    drop_last=True              # drop the last batch if it has fewer than batch_size examples
)

for data in loader:
    break

len(loader), data
(2494, {'input_ids': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993], [ 5832, 651, 262, 10647, 326, 6260, 290, 3437], [ 11, 645, 530, 460, 3285, 345, 3013, 382], [48580, 257, 2612, 290, 3950, 326, 36675, 262], [ 361, 345, 467, 287, 6970, 326, 837, 345], [ 270, 705, 82, 257, 4950, 2646, 837, 1336], [ 71, 1794, 6819, 837, 26996, 6819, 6776, 837], [11246, 7650, 30669, 13766, 17548, 351, 6159, 220], [ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358], [22437, 299, 470, 1612, 340, 705, 82, 922], [ 270, 705, 82, 23056, 284, 766, 257, 3807], [ 5832, 1244, 892, 339, 373, 2491, 329, 2607], [ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311], [ 8505, 837, 1312, 11691, 340, 705, 82, 14081], [ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848], [26535, 867, 14138, 290, 41169, 12, 44517, 2628]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993], [ 5832, 651, 262, 10647, 326, 6260, 290, 3437], [ 11, 645, 530, 460, 3285, 345, 3013, 382], [48580, 257, 2612, 290, 3950, 326, 36675, 262], [ 361, 345, 467, 287, 6970, 326, 837, 345], [ 270, 705, 82, 257, 4950, 2646, 837, 1336], [ 71, 1794, 6819, 837, 26996, 6819, 6776, 837], [11246, 7650, 30669, 13766, 17548, 351, 6159, 220], [ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358], [22437, 299, 470, 1612, 340, 705, 82, 922], [ 270, 705, 82, 23056, 284, 766, 257, 3807], [ 5832, 1244, 892, 339, 373, 2491, 329, 2607], [ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311], [ 8505, 837, 1312, 11691, 340, 705, 82, 14081], [ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848], [26535, 867, 14138, 290, 41169, 12, 44517, 2628]])})
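What default_data_collator does can also be reproduced by hand; a minimal sketch that collates two examples (all examples are exactly 8 tokens long after truncation, so stacking into tensors works):
batch = default_data_collator([dataset['train'][0], dataset['train'][1]])
print(batch['input_ids'].shape, batch['labels'].shape)  # torch.Size([2, 8]) torch.Size([2, 8])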
4. Downstream task model
from transformers import AutoModelForCausalLM, GPT2Model

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = GPT2Model.from_pretrained('../data/model/distilgpt2/')
        self.fc = torch.nn.Linear(768, tokenizer.vocab_size, bias=False)
        # Load the full causal-LM checkpoint to copy its pretrained output-head weights
        parameters = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
        # Initialise the fully connected layer with the pretrained lm_head weights
        self.fc.load_state_dict(parameters.lm_head.state_dict())
        # A more compact way to load the same weights (forward() would then need to use that model's own lm_head):
        # self.pretrained = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)
        loss = None
        if labels is not None:
            # The inputs and labels are offset from each other by one position
            # Keep the batch and vocab dimensions, drop the last time step
            shift_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size)  # reshape(-1, ...) merges the batch and time dims
            # Keep the batch dimension, drop the first time step
            shift_labels = labels[:, 1:].reshape(-1)  # flatten the 2-D tensor to 1-D
            loss = self.criterion(shift_logits, shift_labels)
        return {'loss': loss, 'logits': logits}
model = Model()
# Parameter count
print(sum(i.numel() for i in model.parameters()) / 10000)  # divide by 10000 to report in units of 10,000
12050.9952
# In Python, ** unpacks a dict into keyword arguments, so model(**data) passes every key as key=value
out = model(**data)
# print(out)  # a dict with 'loss' and 'logits'
print(out['loss'], out['logits'].shape)
tensor(6.2742, grad_fn=<NllLossBackward0>) torch.Size([16, 8, 50257])
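The one-token shift inside Model.forward() can be hard to visualise; a toy sketch with a made-up 5-word vocabulary and a 4-token sequence shows how the logits at position i end up scored against token i+1:
toy_logits = torch.randn(1, 4, 5)          # one distribution over the toy vocabulary per position
toy_labels = torch.tensor([[2, 4, 1, 3]])  # labels are just the input ids, as in f_2 above
shift_logits = toy_logits[:, :-1].reshape(-1, 5)  # predictions made at positions 0..2
shift_labels = toy_labels[:, 1:].reshape(-1)      # the tokens that actually follow: positions 1..3
print(torch.nn.functional.cross_entropy(shift_logits, shift_labels))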
5. Test code
# Test code
def test(model):
    model.eval()
    # Load the test data
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        collate_fn=default_data_collator,
        shuffle=True,   # shuffling is optional at evaluation time
        drop_last=True
    )
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # Only the accuracy on the last word is measured
        label = data['input_ids'][:, -1].clone()  # clone() so the original data is not modified
        # Blank out the last word so the model cannot cheat
        data['input_ids'][:, -1] = 0  # set the whole last column to 0
        # Labels are not needed here either; zero them out
        data['labels'][:, :] = 0
        # Forward pass
        with torch.no_grad():  # no gradients are needed at evaluation time (only during training)
            # equivalent to out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            out = model(**data)
        # Because of the one-token shift, the prediction for the last word sits at the second-to-last position
        out = out['logits'].argmax(dim=2)[:, -2]
        # .item() extracts a plain Python number from a single-element (0-dim) tensor
        correct += (label == out).sum().item()
        total += 16  # 16 is the batch_size
        if i % 10 == 0:  # every 10 batches
            print(i)
            print(label)
            print(out)
        if i == 50:  # evaluate on 50 batches only
            break
    print('accuracy: ', correct / total)
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()
test(model)
0 tensor([ 416, 42738, 7297, 2709, 651, 837, 290, 349, 290, 11815, 72, 14505, 7559, 532, 3822, 262]) tensor([ 11, 262, 3807, 5000, 1064, 13, 11, 453, 13, 13664, 72, 983, 340, 13, 12, 262]) 10 tensor([ 284, 428, 991, 705, 318, 11783, 1787, 65, 43527, 2306, 460, 8395, 743, 6386, 2370, 393]) tensor([ 284, 262, 262, 447, 11, 11, 76, 65, 38520, 20706, 318, 502, 468, 6386, 4899, 1022]) 20 tensor([ 7357, 1936, 4572, 2465, 1049, 257, 7358, 262, 29963, 2646, 517, 290, 9188, 1647, 278, 1241]) tensor([7357, 584, 290, 1143, 649, 262, 2656, 262, 7051, 11, 257, 284, 262, 12, 278, 1210]) 30 tensor([14969, 5239, 3016, 837, 43207, 262, 764, 4129, 307, 262, 705, 465, 262, 837, 1100, 2700]) tensor([ 67, 5239, 12302, 13, 2033, 262, 284, 1988, 307, 262, 1053, 262, 262, 13, 1621, 2700]) 40 tensor([21730, 13770, 2737, 264, 477, 2218, 262, 257, 340, 8886, 848, 14821, 1178, 705, 787, 1239]) tensor([ 13, 290, 2737, 670, 262, 290, 262, 340, 6, 983, 257, 262, 1438, 460, 787, 318]) 50 tensor([ 262, 6840, 763, 286, 611, 286, 475, 2915, 764, 837, 4379, 12986, 10997, 272, 257, 1200]) tensor([ 262, 651, 318, 1683, 621, 286, 475, 12, 284, 837, 257, 262, 10997, 272, 257, 262]) accuracy: 0.18382352941176472 automatically pegs itself for the the if there 's a way to effectively get while benigni ( who stars and co is the stupidest , most insulting movie of ever the film might have been more satisfying if than a journey through memory , a celebration of of the story may not be new , but but there are touching moments in eto iles -
6. Training code
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
from transformers import AdamW
from transformers.optimization import get_scheduler
# Training code
def train():
    # Optimiser: the algorithm that drives gradient descent
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Learning-rate schedule
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,              # no warm-up steps
                              num_training_steps=len(loader),  # one step per batch in the training loader
                              optimizer=optimizer)
    # Move the model to the device
    model.to(device)
    model.train()  # switch to training mode
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # The loss is already computed inside the model and returned in the output dict
        loss = out['loss']
        # Backpropagate the loss
        loss.backward()
        # Clip the gradients to keep training stable (clipping must come after backward() and before step());
        # max_norm is the constant c in the clipping formula
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        # Update the parameters
        optimizer.step()
        scheduler.step()
        # Reset the gradients
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:  # every 50 batches
            labels = labels[:, 1:]
            # out['logits'] is a 3-D tensor; take the argmax over the vocabulary dimension (dim=2)
            out = out['logits'].argmax(dim=2)[:, :-1]
            correct = (labels == out).sum().item()  # .item() turns the 0-dim tensor into a Python number
            accuracy = correct / (16 * 7)  # 16 examples per batch, 7 predictions each (8 tokens minus the 1-token shift)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)
train()
0 6.9149088859558105 0.14285714285714285 1.9991980753809144e-05
50 5.734440803527832 0.1875 1.959101844426624e-05
100 5.432187557220459 0.1875 1.919005613472334e-05
150 5.808253765106201 0.16964285714285715 1.8789093825180436e-05
200 5.217792510986328 0.16071428571428573 1.838813151563753e-05
250 5.223909854888916 0.20535714285714285 1.7987169206094627e-05
...
2250 5.031280040740967 0.14285714285714285 1.948676824378509e-06
2300 4.822522163391113 0.2857142857142857 1.5477145148356058e-06
2350 4.803909778594971 0.25 1.1467522052927025e-06
2400 4.606936931610107 0.26785714285714285 7.457898957497996e-07
2450 4.976705074310303 0.24107142857142858 3.4482758620689656e-07
7. Save the trained model
torch.save(model, '../data/预测最后一个词模型.model')
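torch.save(model, ...) pickles the whole Model object, which ties the checkpoint to this exact class definition. A common alternative (a sketch, not what is done above; the filename is made up) is to save only the weights and restore them into a fresh Model():
torch.save(model.state_dict(), '../data/last_word_state_dict.pt')   # hypothetical filename
model_restored = Model()
model_restored.load_state_dict(torch.load('../data/last_word_state_dict.pt', map_location='cpu'))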
8. Load the saved model
# Load the model saved above
# Note: the model was trained on the GPU, so map_location='cpu' is needed to load it onto the CPU
model2 = torch.load('../data/预测最后一个词模型.model', map_location='cpu')
test(model2)
0 tensor([ 627, 616, 1486, 4608, 290, 38132, 880, 262, 3900, 336, 890, 428, 764, 428, 837, 1377]) tensor([ 286, 262, 318, 3988, 837, 3807, 257, 262, 705, 14397, 3807, 262, 621, 262, 290, 326]) 10 tensor([11815, 326, 663, 7464, 340, 3898, 287, 82, 257, 546, 3281, 262, 16631, 3807, 428, 2646]) tensor([7635, 286, 663, 3807, 286, 3898, 287, 82, 257, 257, 1146, 262, 837, 837, 262, 2646]) 20 tensor([ 546, 1312, 307, 340, 262, 422, 1049, 11648, 640, 1267, 2089, 1683, 800, 1502, 355, 475]) tensor([ 345, 340, 307, 340, 262, 837, 2646, 3807, 262, 1267, 2089, 705, 6275, 262, 355, 475]) 30 tensor([ 82, 19377, 764, 837, 3316, 1751, 508, 809, 1621, 3755, 12, 4681, 2071, 1039, 48133, 290]) tensor([ 82, 257, 326, 837, 2860, 290, 508, 705, 2646, 2567, 48133, 20170, 555, 1039, 48133, 290]) 40 tensor([ 3704, 705, 2589, 36138, 534, 503, 262, 20024, 8591, 290, 788, 3923, 3807, 8925, 1128, 764]) tensor([ 898, 423, 1218, 772, 534, 345, 262, 20024, 262, 290, 257, 1692, 2646, 2568, 837, 837]) 50 tensor([ 837, 5581, 764, 21981, 287, 12, 4379, 318, 705, 286, 2962, 10997, 3146, 717, 764, 1165]) tensor([ 837, 3159, 326, 257, 837, 288, 257, 318, 705, 286, 21452, 10512, 1438, 262, 326, 257]) accuracy: 0.25612745098039214 no movement , no yuks , , whether seen on a 10-inch television screen a taut , intelligent psychological drama . that there are times when a rumor of angels a greene delivers a typically solid performance in , some movies are like a tasty hors - d the visuals alone make metropolis worth seeing a an experience so engrossing it is is
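Finally, the reloaded model can be tried on a new sentence: feed the first seven tokens and read off the arg-max prediction for the eighth. A minimal sketch (the prompt is made up and the predicted word is not guaranteed to be sensible):
text = 'this movie is one of the best films'                     # hypothetical prompt
ids = tokenizer(text, return_tensors='pt')['input_ids'][:, :7]   # keep only the first 7 tokens
with torch.no_grad():
    out = model2(input_ids=ids, attention_mask=torch.ones_like(ids))
pred_id = out['logits'][:, -1].argmax(dim=-1)                    # prediction for the 8th token
print(tokenizer.decode(pred_id))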