当前位置: 首页 > article >正文

20_BERT微调训练

1.导包

import json   #通过路径加载预训练模型
import os
import torch
from torch import nn
import dltools

2.加载预训练模型BERT的函数

def load_pretrained_model(pretrained_model, num_hiddens, ffn_num_hiddens,
                          num_heads, num_layers, dropout, max_len, devices):
    """Build a BERT model and its vocabulary from locally stored pretrained files.

    Reads ``vocab.json`` and ``pretrained.params`` from ``./bert.small.torch/``.

    Args:
        pretrained_model: Name of the pretrained model. Not used here: the
            data directory is fixed to ``./bert.small.torch/``.
        num_hiddens: Hidden size. Also used for every width that must agree
            with it (norm shape, FFN input, key/query/value sizes and the
            ``hid``/``mlm``/``nsp`` input features).
        ffn_num_hiddens: Passed through as ``ffn_num_hiddens``.
        num_heads: Passed through as ``num_heads``.
        num_layers: Passed through as ``num_layers``.
        dropout: Passed through as ``dropout``.
        max_len: Passed through as ``max_len``.
        devices: Not used here; kept for interface compatibility.

    Returns:
        A ``(bert, vocab)`` tuple: the model with the pretrained parameters
        loaded, and the vocabulary rebuilt from ``vocab.json``.

    Note:
        Earlier revisions ignored ``num_heads``, ``num_layers`` and
        ``dropout`` (hard-coding 4, 2 and 0.2) and hard-coded every width to
        256. The arguments are now honoured, so they must describe the
        architecture stored in ``pretrained.params``.
    """
    data_dir = "./bert.small.torch/"
    # Start from an empty vocabulary and fill it from the stored token list.
    vocab = dltools.Vocab()
    # Context manager closes the file; JSON text is UTF-8 by specification.
    with open(os.path.join(data_dir, 'vocab.json'), encoding='utf-8') as vocab_file:
        vocab.idx_to_token = json.load(vocab_file)
    vocab.token_to_idx = {token: idx for idx, token in enumerate(vocab.idx_to_token)}
    bert = dltools.BERTModel(len(vocab), num_hiddens, norm_shape=[num_hiddens],
                             ffn_num_input=num_hiddens,
                             ffn_num_hiddens=ffn_num_hiddens,
                             num_heads=num_heads, num_layers=num_layers,
                             dropout=dropout, max_len=max_len,
                             key_size=num_hiddens, query_size=num_hiddens,
                             value_size=num_hiddens,
                             hid_in_features=num_hiddens,
                             mlm_in_features=num_hiddens,
                             nsp_in_features=num_hiddens)
    # Load the pretrained BERT parameters.
    bert.load_state_dict(torch.load(os.path.join(data_dir, 'pretrained.params')))
    return bert, vocab
# Devices reported by dltools; handed to the loader below.
devices = dltools.try_all_gpus()
# Load the pretrained 'bert.small' model together with its vocabulary.
bert, vocab = load_pretrained_model(
    'bert.small',
    num_hiddens=256,
    ffn_num_hiddens=512,
    num_heads=4,
    num_layers=2,
    dropout=0.1,
    max_len=512,
    devices=devices,
)
    
# Stanford Natural Language Inference (SNLI)
class SNLIBERTDataset(torch.utils.data.Dataset):
    """SNLI premise/hypothesis pairs encoded as fixed-length BERT inputs.

    ``dataset`` is indexed as ``dataset[0]`` (premises), ``dataset[1]``
    (hypotheses) and ``dataset[2]`` (labels). Each item is
    ``((token_ids, segments, valid_len), label)``.
    """

    def __init__(self, dataset, max_len, vocab=None):
        # Lower-case and tokenize the premise and hypothesis columns.
        tokenized_columns = [
            dltools.tokenize([text.lower() for text in column])
            for column in dataset[:2]
        ]
        token_pairs = [
            [premise, hypothesis]
            for premise, hypothesis in zip(*tokenized_columns)
        ]

        self.labels = torch.tensor(dataset[2])
        self.vocab = vocab
        self.max_len = max_len
        encoded = self._preprocess(token_pairs)
        self.all_token_ids, self.all_segments, self.valid_lens = encoded
        example_count = len(self.all_token_ids)
        print('read ' + str(example_count) + ' examples')

    def _preprocess(self, all_premise_hypothesis_tokens):
        """Encode every pair and stack the results into three tensors."""
        id_rows, segment_rows, lengths = [], [], []
        for pair in all_premise_hypothesis_tokens:
            token_ids, segments, valid_len = self._mp_worker(pair)
            id_rows.append(token_ids)
            segment_rows.append(segments)
            lengths.append(valid_len)
        token_tensor = torch.tensor(id_rows, dtype=torch.long)
        segment_tensor = torch.tensor(segment_rows, dtype=torch.long)
        return (token_tensor, segment_tensor, torch.tensor(lengths))

    def _mp_worker(self, premise_hypothesis_tokens):
        """Encode one pair as padded ids, padded segments and the unpadded length."""
        premise, hypothesis = premise_hypothesis_tokens
        self._truncate_pair_of_tokens(premise, hypothesis)
        tokens, segments = dltools.get_tokens_and_segments(premise, hypothesis)
        valid_len = len(tokens)
        # Pad the ids with '<pad>' and the segments with 0 up to max_len.
        ids = self.vocab[tokens]
        ids_padding = [self.vocab['<pad>']] * (self.max_len - valid_len)
        segment_padding = [0] * (self.max_len - len(segments))
        return ids + ids_padding, segments + segment_padding, valid_len

    def _truncate_pair_of_tokens(self, p_tokens, h_tokens):
        """Trim both lists in place until their total length fits the budget."""
        # Reserve three slots for the '<CLS>', '<SEP>' and '<SEP>' tokens.
        budget = self.max_len - 3
        while len(p_tokens) + len(h_tokens) > budget:
            # Drop from the premise only when it is strictly longer.
            longer = p_tokens if len(p_tokens) > len(h_tokens) else h_tokens
            longer.pop()

    def __getitem__(self, idx):
        features = (
            self.all_token_ids[idx],
            self.all_segments[idx],
            self.valid_lens[idx],
        )
        return features, self.labels[idx]

    def __len__(self):
        return len(self.all_token_ids)
# If you run out of GPU memory, reduce `batch_size`.
# In the original BERT model, max_len=512.
batch_size = 128
max_len = 128
num_workers = dltools.get_dataloader_workers()
data_dir = './snli_1.0/'
train_set = SNLIBERTDataset(dltools.read_snli(data_dir, True), max_len, vocab)
test_set = SNLIBERTDataset(dltools.read_snli(data_dir, False), max_len, vocab)
# Only the training iterator is shuffled.
train_iter = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)
test_iter = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    num_workers=num_workers,
)
read 549367 examples
read 9824 examples
# Print the first three premise/hypothesis/label triples of the training split.
train_data = dltools.read_snli(data_dir, is_train=True)
preview = zip(train_data[0][:3], train_data[1][:3], train_data[2][:3])
for premise, hypothesis, label in preview:
    print('premise', premise)
    print('hypothesis:', hypothesis)
    print('label', label)

 

premise A person on a horse jumps over a broken down airplane .
hypothesis: A person is training his horse for a competition .
label 2
premise A person on a horse jumps over a broken down airplane .
hypothesis: A person is at a diner , ordering an omelette .
label 1
premise A person on a horse jumps over a broken down airplane .
hypothesis: A person is outdoors , on a horse .
label 0
class BERTClassifier(nn.Module):
    """Classification head on top of a pretrained BERT encoder.

    Reuses ``bert.encoder`` and ``bert.hidden`` and adds a new linear output
    layer.

    Args:
        bert: Object exposing ``encoder`` and ``hidden`` attributes.
        num_hiddens: Input width of the output layer; must match the width
            produced by ``bert.hidden``. Defaults to 256, the value that was
            previously hard-coded.
        num_classes: Number of output classes. Defaults to 3, the value that
            was previously hard-coded.
    """

    def __init__(self, bert, num_hiddens=256, num_classes=3):
        super().__init__()
        self.encoder = bert.encoder
        self.hidden = bert.hidden
        self.output = nn.Linear(num_hiddens, num_classes)

    def forward(self, inputs):
        """Return class scores for a ``(tokens, segments, valid_lens)`` batch."""
        tokens_X, segments_X, valid_lens_x = inputs
        encoded_X = self.encoder(tokens_X, segments_X, valid_lens_x)
        # Keep only position 0 along the second axis (presumably the '<cls>'
        # token -- confirm against dltools.get_tokens_and_segments).
        first_position = encoded_X[:, 0, :]
        return self.output(self.hidden(first_position))
# Fine-tune the classifier built on the pretrained BERT.
net = BERTClassifier(bert)
lr = 1e-4
num_epochs = 2
trainer = torch.optim.Adam(net.parameters(), lr=lr)
# reduction='none' keeps one loss value per example.
loss = nn.CrossEntropyLoss(reduction='none')
dltools.train_ch13(
    net, train_iter, test_iter, loss, trainer, num_epochs, devices
)
loss 0.640, train acc 0.733, test acc 0.762
2658.2 examples/sec on [device(type='cuda', index=0)]

 


http://www.kler.cn/news/323817.html

相关文章:

  • 探索Python网络世界的利器:Requests-HTML库
  • Python自学查漏9.28
  • Spark 中 任务集 TaskSet 详解
  • 探索私有化聊天软件:即时通讯与音视频技术的结合
  • RSpec简析及应用案例
  • leetcode刷题day32|动态规划Part01(509. 斐波那契数、70. 爬楼梯、746. 使用最小花费爬楼梯)
  • uni-app进行微信小程序开发,快速上手
  • STM32 F1移植FATFS文件系统 USMART组件测试相关函数功能
  • 二、初步编写drf API
  • 太速科技-389-基于KU5P的双路100G光纤网络加速计算卡
  • linux系统的常用命令
  • 【系统规划与管理师】【案例分析】【考点】【答案篇】第10章 团队建设与管理
  • docker相关命令
  • 基于单片机的精确电压表DA-AD转换
  • 【笔记】神领物流day1.1.13前后端部署【未完】
  • JVM、JRE、JDK关系。HotSpot。JVM规范
  • 【R语言】fs 工具功能速查
  • 【项目经验分享】深度学习点云算法毕业设计项目案例定制
  • 【JavaEE】——内存可见性问题
  • 支付宝远程收款api之小荷包跳转码
  • 画两个数的平方和的曲线
  • ECharts图表图例3
  • 【记录】Excel|不允许的操作:合并或隐藏单元格出现的问题列表及解决方案
  • el-table给列加单位,表头加样式,加斑马纹
  • 【YashanDB知识库】如何dump数据文件,转换rowid, 查询对应内容
  • 9月27日,每日信息差
  • XSS基础
  • 蓝桥杯—STM32G431RBT6(TIM定时器输入捕获频率和占空比)
  • 北斗三号多模对讲机TD70:公专网融合、数模一体、音视频调度,推动应急通信效能升级
  • Xiaojie雷达之路---doa估计(dbf、capon、music算法)