【NLP251】命名实体实战(基于Transformer分类)
1. 查看数据集
json解析工具:JSON 在线解析 | 菜鸟工具
快速了解json文件内容分布
2.构建项目框架
project_root/
│
├── src/
│   └── medical_ner/
│       ├── datas/
│       │   ├── __init__.py
│       │   ├── bmeso_ner_labels.py
│       │   ├── json_ner_dataloader.py
│       │   └── utils.py
│       ├── losses/
│       │   ├── __init__.py
│       │   └── loss.py
│       ├── metrics/
│       │   ├── __init__.py
│       │   └── accuracy.py
│       ├── models/
│       │   ├── __init__.py
│       │   └── ner_bert_softmax.py
│       └── ner_softmax/
│           ├── __init__.py
│           ├── bert_training.py
│           └── training.py
└── test/
    ├── datas/
    │   └── medical/
    │       ├── categories.json
    │       ├── test.json
    │       └── training.txt
    ├── output/
    └── training_softmax_ner_model.py
3. src代码详解
3.1 datas.bmeso_ner_labels.py
# -*- coding: utf-8 -*-
import json
"""
从单个文件中提取所有标签类型的函数。
"""
def extract_labels_per_file(in_file):
    """
    Collect every entity label type that occurs in one JSON-lines file.

    Each non-blank line is parsed as a JSON object whose ``entities`` list
    holds dicts carrying a ``label_type`` key.

    :param in_file: path of the UTF-8 encoded file, one JSON record per line
    :return: set with the distinct ``label_type`` values found in the file
    """
    labels = set()
    with open(in_file, "r", encoding="utf-8") as reader:
        for line in reader:
            line = line.strip()
            if not line:
                # Blank lines hold no record; json.loads("") would raise on them.
                continue
            record = json.loads(line)
            labels.update(entity['label_type'] for entity in record['entities'])
    return labels
"""
从多个数据文件中提取标签,并保存到输出文件中的函数。
"""
def extract_labels(data_paths, categories_out_file):
    """
    Build the label-to-id mapping from one or more data files and save it as JSON.

    Id 0 is reserved for 'O'. Every entity type found (in sorted order) then
    receives four consecutive ids for its 'B-', 'M-', 'E-' and 'S-' variants.

    :param data_paths: a single file path or an iterable of file paths
    :param categories_out_file: path of the JSON file the mapping is written to
    """
    paths = [data_paths] if isinstance(data_paths, str) else data_paths

    # Merge the label types found in every file.
    found = set()
    for path in paths:
        found |= extract_labels_per_file(path)
    labels = sorted(found)
    print(f"所有的标签:{labels}")

    categories = {'O': 0}
    for label in labels:
        for prefix in 'BMES':
            # The next free id is simply the current size of the mapping.
            categories[f'{prefix}-{label}'] = len(categories)
    print(categories)

    with open(categories_out_file, 'w', encoding='utf-8') as writer:
        json.dump(categories, writer, indent=2, ensure_ascii=False)
"""
从文件中加载标签类别的函数。
"""
def load_label_categories(file_path):
    """
    Read the label-to-id mapping back from a JSON file.

    :param file_path: path of the UTF-8 encoded JSON file
    :return: the object decoded from the file
    """
    with open(file_path, "r", encoding="utf-8") as reader:
        return json.load(reader)
3.2 datas.json_ner_dataloader.py
# -*- coding: utf-8 -*-
import copy
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from NamedEntityRecognition.medical_ner.src.medical_ner.datas.utils import special_token_processor
def split_chars(chars, labels, max_len):
    """
    Split a character sequence and its labels into chunks of at most ``max_len`` items.

    A chunk that is followed by more text is shortened until its last label is
    'O', so that the cut does not fall inside an entity. Two cases that
    previously never terminated are handled explicitly:

    * the remaining text fits into one chunk: it is emitted as is, even when it
      ends with an entity label;
    * a window contains no 'O' label at all: the window is cut at ``max_len``.

    :param chars: sequence of characters (tokens)
    :param labels: sequence of label strings, same length as ``chars``
    :param max_len: maximum chunk length, must be >= 1
    :return: generator of ``(sub_chars, sub_labels)`` pairs
    :raises ValueError: if ``max_len`` is smaller than 1
    """
    if max_len < 1:
        raise ValueError(f"max_len must be >= 1, got {max_len}")
    seq_len = len(chars)
    st = 0
    while st < seq_len:
        hard_et = min(st + max_len, seq_len)
        et = hard_et
        if et < seq_len:
            # Move the cut backwards until the chunk ends on an 'O' label.
            while et > st and labels[et - 1] != 'O':
                et -= 1
            if et == st:
                # No 'O' inside the window: fall back to a hard cut so we always advance.
                et = hard_et
        # TODO: a smarter split could additionally
        #   1. guarantee an entity is never divided between two samples
        #   2. prefer punctuation positions as split points
        yield chars[st: et], labels[st: et]
        st = et
# noinspection DuplicatedCode
class JsonNerDataset(Dataset):
    """
    Dataset over raw named-entity-recognition data stored as JSON lines.

    Every non-blank line of the file is one JSON record with ``originalText``
    and ``entities``; each entity has ``label_type``, ``start_pos`` (inclusive)
    and ``end_pos`` (exclusive). The text is tokenized character by character
    and labelled with the B/M/E/S/O scheme. Each stored sample is a tuple
    ``(token_ids, label_ids, num_tokens)``.
    """

    def __init__(self,
                 data_path, tokenizer, categories, target_padding_idx=-100,
                 add_special_token=False, first_special_token='[CLS]', last_special_token='[SEP]',
                 sentence_max_len=126
                 ):
        """
        :param data_path: path of the JSON-lines data file
        :param tokenizer: tokenizer providing ``convert_tokens_to_ids``
        :param categories: mapping from label string (e.g. 'O', 'B-xxx') to label id
        :param target_padding_idx: label id used to pad targets in ``collate``
        :param add_special_token: when True, records are split into chunks and each
            chunk is wrapped with the first/last special token
        :param first_special_token: token inserted at the start of each chunk
        :param last_special_token: token appended at the end of each chunk
        :param sentence_max_len: maximum number of characters per chunk (special
            tokens not counted); only used when ``add_special_token`` is True
        """
        super(JsonNerDataset, self).__init__()
        self.tokenizer: BertTokenizer = tokenizer
        self.sentence_max_len = sentence_max_len
        self.categories = categories
        self.token_padding_idx = self.tokenizer.convert_tokens_to_ids('[PAD]')
        self.target_padding_idx = target_padding_idx
        self.add_special_token = add_special_token
        self.first_token_id = self.tokenizer.convert_tokens_to_ids(first_special_token)
        self.last_token_id = self.tokenizer.convert_tokens_to_ids(last_special_token)
        self.records = self.load_records(data_path)

    @staticmethod
    def _build_labels(entities, num_chars, record):
        """Return the per-character B/M/E/S/O label list for one record."""
        labels = ['O'] * num_chars
        for entity in entities:
            label_type = entity['label_type']
            start_pos = entity['start_pos']  # inclusive
            end_pos = entity['end_pos']  # exclusive
            span = end_pos - start_pos
            if span == 1:
                # Single-character entity.
                labels[start_pos] = f'S-{label_type}'
            elif span > 1:
                # Multi-character entity: begin, middle..., end.
                labels[start_pos] = f'B-{label_type}'
                labels[end_pos - 1] = f'E-{label_type}'
                for i in range(start_pos + 1, end_pos - 1):
                    labels[i] = f'M-{label_type}'
            else:
                raise ValueError(f"数据异常:{record}")
        return labels

    def _encode(self, chars, labels, record, wrap):
        """Convert characters/labels to ids, optionally wrapped with the special tokens."""
        x = self.tokenizer.convert_tokens_to_ids(chars)  # one id per character
        assert len(chars) == len(x), "bert进行token id转换后,更改了列表长度."
        y = [self.categories[c] for c in labels]
        if wrap:
            # The special tokens are labelled 'O'.
            x.insert(0, self.first_token_id)
            x.append(self.last_token_id)
            y.insert(0, self.categories['O'])
            y.append(self.categories['O'])
        assert len(x) == len(y), f"输入token的长度必须和标签的长度一致,当前长度为:{len(x)} - {len(y)} - {record}"
        return x, y, len(x)

    def load_records(self, data_path) -> list:
        """
        Parse the data file into a list of ``(token_ids, label_ids, num_tokens)`` samples.

        Blank lines are skipped. When ``add_special_token`` is True a record may
        produce several samples (one per chunk returned by ``split_chars``).

        :param data_path: path of the JSON-lines data file
        :return: list of samples
        :raises ValueError: if an entity has ``end_pos <= start_pos``
        """
        records = []
        with open(data_path, "r", encoding="utf-8") as reader:
            for line in reader:
                line = line.strip()
                if not line:
                    # Blank lines hold no record; json.loads("") would raise on them.
                    continue
                # 1. raw record
                record = json.loads(line)
                # 2. text -> characters (one token per character)
                entities = record['entities']
                text = special_token_processor(record['originalText'])
                chars = list(text)
                # 3. entity spans -> per-character labels
                labels = self._build_labels(entities, len(chars), record)
                if self.add_special_token:
                    # Split so that no chunk exceeds sentence_max_len characters.
                    for sub_chars, sub_labels in split_chars(chars, labels, self.sentence_max_len):
                        records.append(self._encode(sub_chars, sub_labels, record, wrap=True))
                else:
                    records.append(self._encode(chars, labels, record, wrap=False))
        return records

    def __getitem__(self, index):
        """
        Return a copy of the sample at ``index``.

        :param index: sample index
        :return: (token_ids, label_ids, num_tokens)
        """
        x, y, num_tokens = self.records[index]
        return copy.deepcopy(x), copy.deepcopy(y), num_tokens

    def __len__(self):
        """Return the number of samples."""
        return len(self.records)

    def collate(self, batch):
        """
        Pad a list of samples to the longest one and convert them to tensors.

        :param batch: list of (token_ids, label_ids, num_tokens) tuples
        :return: (token_ids, token_masks, target_ids) long tensors of shape
            [len(batch), max_length]; the mask is 1 for real tokens and 0 for padding
        """
        max_length = max(t[2] for t in batch)
        x, y, mask = [], [], []
        for sample in batch:
            # Copy so that padding never mutates the caller's lists.
            _x, _y, _len_current_record = copy.deepcopy(sample)
            pad = max_length - _len_current_record
            _mask = [1] * _len_current_record
            if pad > 0:
                _x.extend([self.token_padding_idx] * pad)
                _y.extend([self.target_padding_idx] * pad)
                _mask.extend([0] * pad)
            x.append(_x)
            y.append(_y)
            mask.append(_mask)
        token_ids = torch.tensor(x, dtype=torch.long)
        # 1 marks a real token, 0 marks a padded position
        token_masks = torch.tensor(mask, dtype=torch.long)
        target_ids = torch.tensor(y, dtype=torch.long)
        return token_ids, token_masks, target_ids
def create_dataloader(
        data_path, tokenizer, label_categories, batch_size, shuffle,
        num_workers=0, prefetch_factor=2, target_padding_idx=-100,
        add_special_token=False, first_special_token='[CLS]', last_special_token='[SEP]'
):
    """
    Build a ``JsonNerDataset`` for ``data_path`` and wrap it in a ``DataLoader``.

    :param data_path: path of the JSON-lines data file
    :param tokenizer: tokenizer forwarded to the dataset
    :param label_categories: label-string -> label-id mapping
    :param batch_size: number of samples per batch
    :param shuffle: whether batches are drawn in random order
    :param num_workers: number of loader worker processes (0 = load in the main process)
    :param prefetch_factor: batches prefetched per worker; only forwarded when
        ``num_workers > 0``
    :param target_padding_idx: label id used to pad targets
    :param add_special_token: forwarded to the dataset
    :param first_special_token: forwarded to the dataset
    :param last_special_token: forwarded to the dataset
    :return: the configured ``DataLoader``
    """
    dataset = JsonNerDataset(
        data_path=data_path, tokenizer=tokenizer,
        categories=label_categories,
        target_padding_idx=target_padding_idx,
        add_special_token=add_special_token,
        first_special_token=first_special_token,
        last_special_token=last_special_token
    )
    print(f"当前dataset的总样本数目为:{data_path} - {len(dataset)}")

    # The dataset yields single samples; the DataLoader groups them into batches
    # using the dataset's own collate function.
    loader_kwargs = dict(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=dataset.collate,
    )
    if num_workers > 0:
        # prefetch_factor is counted in batches per worker and is only valid with
        # worker processes; recent PyTorch versions raise if it is given together
        # with num_workers=0, so it is omitted in that case.
        loader_kwargs['prefetch_factor'] = prefetch_factor
    return DataLoader(**loader_kwargs)
def create_train_dataloader(
        data_path, tokenizer, label_categories, batch_size, target_padding_idx=-100,
        add_special_token=False, first_special_token='[CLS]', last_special_token='[SEP]'
):
    """
    Create the training data loader: shuffled, loaded in the main process.

    All arguments are forwarded unchanged to ``create_dataloader``.
    """
    return create_dataloader(
        data_path=data_path,
        tokenizer=tokenizer,
        label_categories=label_categories,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        prefetch_factor=2,
        target_padding_idx=target_padding_idx,
        add_special_token=add_special_token,
        first_special_token=first_special_token,
        last_special_token=last_special_token,
    )
def create_eval_dataloader(
        data_path, tokenizer, label_categories, batch_size, target_padding_idx=-100,
        add_special_token=False, first_special_token='[CLS]', last_special_token='[SEP]'
):
    """
    Create the evaluation data loader: fixed order, loaded in the main process.

    All arguments are forwarded unchanged to ``create_dataloader``.
    """
    return create_dataloader(
        data_path=data_path,
        tokenizer=tokenizer,
        label_categories=label_categories,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        prefetch_factor=2,
        target_padding_idx=target_padding_idx,
        add_special_token=add_special_token,
        first_special_token=first_special_token,
        last_special_token=last_special_token,
    )
3.3 utils.py
# -*- coding: utf-8 -*-
def special_token_processor(text):
    """
    Normalize typographic quotes, the backtick and the long dash to plain ASCII.

    Every replacement maps exactly one character to one character, so the
    length of the text (and therefore every entity offset) is preserved.

    :param text: input string
    :return: string with the listed characters replaced
    """
    replacements = {
        '”': '"',
        "“": '"',
        "’": "'",
        "‘": "'",
        "`": "'",
        '—': '-',
    }
    return text.translate(str.maketrans(replacements))
3.4 loss.py
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
class CustomCrossEntropyLoss(nn.Module):
    """
    Cross entropy that averages entity tokens and 'O' tokens separately.

    Tokens with target id > 0 are "positive" (entity) tokens and tokens with
    target id 0 are "negative" tokens. The final loss is
    ``mean(positive losses) + negative_alpha * mean(negative losses)``, with
    positions equal to ``ignore_index`` excluded from both means.
    """

    def __init__(self, ignore_index=-100, negative_alpha=0.2, summary_writer=None):
        """
        :param ignore_index: target id marking padded positions
        :param negative_alpha: weight applied to the mean loss of the negative tokens
        :param summary_writer: optional writer exposing ``add_scalars``; when given,
            the three loss values are logged on every forward call
        """
        super(CustomCrossEntropyLoss, self).__init__()
        self.target_padding_idx = ignore_index
        self.negative_alpha = negative_alpha
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='none')
        self.summary_writer = summary_writer
        # Running step so each logged point gets its own x position.
        self._summary_step = 0

    @torch.no_grad()
    def add_summary(self, positive_loss, negative_loss, loss):
        """Log the total/positive/negative loss values if a writer was configured."""
        if self.summary_writer is None:
            return
        self.summary_writer.add_scalars('loss', {
            'total': loss.item(),
            'positive': positive_loss.item(),
            'negative': negative_loss.item()
        }, global_step=self._summary_step)
        self._summary_step += 1

    def forward(self, score, target):
        """
        :param score: class scores laid out as expected by ``nn.CrossEntropyLoss``
            (class dimension second, e.g. [N, num_class, T])
        :param target: target ids, e.g. [N, T]
        :return: scalar loss tensor
        """
        loss = self.loss_fn(score, target)  # per-token loss, 0 at ignored positions
        valid_mask = (target != self.target_padding_idx).to(dtype=loss.dtype)
        # Entity tokens; multiplied by valid_mask so a positive ignore_index is never counted.
        positive_mask = (target > 0).to(dtype=loss.dtype) * valid_mask
        negative_mask = valid_mask - positive_mask

        # clamp(min=1) avoids 0/0 = NaN when a batch has no token of one kind;
        # the numerator is 0 in that case, so the term contributes 0.
        positive_loss = (loss * positive_mask).sum() / positive_mask.sum().clamp(min=1.0)
        negative_loss = (loss * negative_mask).sum() / negative_mask.sum().clamp(min=1.0)
        negative_loss = negative_loss * self.negative_alpha

        loss = positive_loss + negative_loss
        self.add_summary(positive_loss, negative_loss, loss)
        return loss
3.5 accuracy.py
# -*- coding: utf-8 -*-
import torch
def div(x, y):
    """Return ``x / y``, or 0.0 when the divisor ``y`` equals zero."""
    return 0.0 if y == 0 else x / y
# noinspection PyUnresolvedReferences
def calc_token_accuracy_number(score, target_labels, target_masks):
    """
    Count correctly predicted real tokens and the total number of real tokens.

    :param score: [N,T,num_class] model scores
    :param target_labels: [N,T] gold label ids
    :param target_masks: [N,T] 1 for real tokens, 0 for padded positions
    :return: (numerator, denominator) as Python floats
    """
    # 1. predicted label id per token: [N,T,num_class] -> [N,T]
    pred_labels = torch.argmax(score, dim=-1)
    pred_labels = pred_labels.to(dtype=target_labels.dtype, device=target_labels.device)
    # 2. element-wise comparison with the gold labels
    is_equal = (pred_labels == target_labels).to(dtype=torch.float)
    # 3. only real tokens may count as hits; without the mask a padded position
    #    whose padding label equals a valid class id would be counted as correct
    valid = target_masks.to(dtype=is_equal.dtype, device=is_equal.device)
    numerator = torch.sum(is_equal * valid).cpu().item()
    denominator = torch.sum(valid).cpu().item()
    return numerator, denominator
def token_accuracy(score, target_labels, target_masks):
    """
    Token-level accuracy.

    :param score: [N,T,num_class] model scores
    :param target_labels: [N,T] gold label ids
    :param target_masks: [N,T] padding mask, 1 for real tokens and 0 for padding
    :return: float accuracy; 0.0 when there is no token to evaluate
    """
    correct, total = calc_token_accuracy_number(score, target_labels, target_masks)
    return 0.0 if total <= 0.0 else correct / total
# noinspection PyUnresolvedReferences,DuplicatedCode
def calc_token_accuracy_number_v2(score, target_labels, target_masks, negative_target_id=0):
    """
    Count hits and totals separately for negative ('O') and positive (entity) tokens.

    :param score: [N,T,num_class] model scores
    :param target_labels: [N,T] gold label ids
    :param target_masks: [N,T] 1 for real tokens, 0 for padded positions
    :param negative_target_id: label id treated as the negative class
    :return: (negative_numerator, negative_denominator,
              positive_numerator, positive_denominator) as Python floats
    """
    # 1. predicted label id per token: [N,T,num_class] -> [N,T]
    pred_labels = torch.argmax(score, dim=-1)
    pred_labels = pred_labels.to(dtype=target_labels.dtype, device=target_labels.device)
    # 2. element-wise comparison with the gold labels
    is_equal = (pred_labels == target_labels).to(dtype=torch.float)
    # 3. numerators and denominators per token group
    target_masks = target_masks.to(dtype=is_equal.dtype, device=is_equal.device)
    # Real tokens whose gold label is the negative class. The padding mask is
    # applied so padded positions never count, whatever label id they carry.
    negative_mask = (target_labels == negative_target_id).to(dtype=is_equal.dtype) * target_masks
    negative_numerator = torch.sum(is_equal * negative_mask).cpu().item()
    negative_denominator = torch.sum(negative_mask).cpu().item()
    # Real tokens whose gold label is any other class.
    positive_mask = (1.0 - negative_mask) * target_masks
    positive_numerator = torch.sum(is_equal * positive_mask).cpu().item()
    positive_denominator = torch.sum(positive_mask).cpu().item()
    return negative_numerator, negative_denominator, positive_numerator, positive_denominator
def token_accuracy_v2(score, target_labels, target_masks):
    """
    Token-level accuracy split into negative, positive and overall values.

    :param score: [N,T,num_class] model scores
    :param target_labels: [N,T] gold label ids
    :param target_masks: [N,T] padding mask, 1 for real tokens and 0 for padding
    :return: (negative_acc, positive_acc, acc) floats
    """
    neg_hit, neg_total, pos_hit, pos_total = calc_token_accuracy_number_v2(
        score, target_labels, target_masks
    )
    negative_acc = div(neg_hit, neg_total)
    positive_acc = div(pos_hit, pos_total)
    # Overall accuracy pools both groups.
    acc = div(neg_hit + pos_hit, neg_total + pos_total)
    return negative_acc, positive_acc, acc
3.6 ner_bert_softmax.py
# -*- coding: utf-8 -*-
import copy
import torch
import torch.nn as nn
from transformers import BertModel
class TransformerEncoderSoftmaxNerModel(nn.Module):
    """
    Token classifier: embedding -> Transformer encoder -> MLP head.

    For every token the model outputs ``num_class`` scores (logits).
    """

    def __init__(self, vocab_size, hidden_size, encoder_num_head, encoder_num_layers, num_class):
        """
        :param vocab_size: number of rows of the embedding table
        :param hidden_size: embedding / encoder width; must be divisible by ``encoder_num_head``
        :param encoder_num_head: number of attention heads per encoder layer
        :param encoder_num_layers: number of stacked encoder layers
        :param num_class: number of output classes per token
        """
        super(TransformerEncoderSoftmaxNerModel, self).__init__()
        # hidden_size has to be divisible by the number of heads
        assert hidden_size % encoder_num_head == 0, "参数中的hidden_size必须能够整除encoder_num_head"
        # Token embedding
        # NOTE(review): no positional encoding is added to the embeddings - confirm this is intended.
        self.emb_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=encoder_num_head,
            dim_feedforward=hidden_size * 4,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer, num_layers=encoder_num_layers
        )
        # Classification head (multi-layer perceptron)
        self.classify = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(hidden_size * 4, hidden_size * 4),
            nn.ReLU(),
            nn.Linear(hidden_size * 4, num_class),
        )
        self.num_class = num_class
        self.hidden_size = hidden_size

    def forward(self, token_ids, token_masks=None):
        """
        Forward pass.

        :param token_ids: [N,T] token ids of N samples with T tokens each
        :param token_masks: [N,T] optional mask. A bool mask is used as is
            (True = padding, False = real token). An integer mask of any integer
            dtype, including uint8, is read as 1 = real token / 0 = padding and
            inverted here. Floating-point masks are passed through unchanged.
        :return: [N,T,num_class] scores of every token for every class
        """
        if token_masks is not None:
            is_integer_mask = (
                token_masks.dtype != torch.bool
                and not torch.is_floating_point(token_masks)
                and not torch.is_complex(token_masks)
            )
            if is_integer_mask:
                # 1 = real token  ->  True = padding, as expected by src_key_padding_mask
                token_masks = (1 - token_masks).to(dtype=torch.bool)
        # 1. static embedding per token: [N,T] -> [N,T,E]
        token_x = self.emb_layer(token_ids)
        # 2. contextualized representation per token: [N,T,E] -> [N,T,E]
        token_x = self.encoder(token_x, src_key_padding_mask=token_masks)
        # 3. per-token classification: [N,T,E] -> [N,T,num_class]
        score = self.classify(token_x)
        return score
if __name__ == '__main__':
    # Smoke test: pad two hard-coded samples into one batch, run the model once
    # and print the per-token cross-entropy loss.
    token_padding_idx = 0
    target_padding_idx = -100
    def get_batch_record():
        # Returns (token_ids, token_masks, target_ids) for the two samples below.
        # The mask built here is boolean with True marking padded positions.
        batch = [
            (
                [122, 116, 2399, 1184, 2642, 5442, 3187, 3209, 3227, 6430, 1728, 6237, 7946, 5682, 2768, 2501, 1920,
                 912,
                 8024,
                 7030, 5276, 123, 126, 121, 149, 120, 3613, 8024, 845, 1928, 3210, 510, 726, 1213, 510, 4649, 5502,
                 7945,
                 5606,
                 5721, 4635, 8024, 3187, 1445, 6117, 510, 5592, 4578, 510, 5592, 5515, 510, 5592, 3811, 8024, 3187,
                 1353,
                 7000,
                 510, 1641, 3698, 510, 4173, 2552, 8024, 3187, 1366, 2397, 510, 7582, 7481, 5501, 860, 3717, 5514, 510,
                 5592,
                 1741, 1872, 1920, 511, 754, 2769, 7368, 3389, 107, 100, 144, 124, 123, 149, 120, 100, 107, 8024, 6402,
                 3171,
                 711, 107, 122, 510, 3867, 1265, 6887, 1139, 6117, 1333, 1728, 2521, 3389, 100, 123, 510, 7028, 2428,
                 6577,
                 6117, 124, 510, 5498, 4801, 1265, 125, 510, 2714, 2595, 688, 1798, 5498, 4142, 107, 8024, 2867, 5318,
                 6121,
                 5517, 7262, 3466, 3389, 8024, 5314, 750, 3632, 6117, 510, 5272, 3633, 6577, 6117, 510, 2829, 7000,
                 5023,
                 2190,
                 4568, 3780, 4545, 8024, 3313, 6237, 7946, 912, 1400, 5632, 1220, 1139, 7368, 511, 4567, 4923, 704,
                 1914,
                 3613,
                 1139, 4385, 7946, 912, 1315, 3341, 2769, 4906, 857, 7368, 3780, 4545, 8024, 3295, 754, 123, 121, 122,
                 126,
                 118,
                 121, 127, 118, 122, 128, 6121, 107, 3187, 4578, 5517, 7262, 4850, 5517, 2419, 7474, 5549, 3289, 2476,
                 8020,
                 122, 3340, 8021, 8024, 100, 122, 8024, 5273, 5682, 2519, 7346, 2595, 8039, 5517, 860, 7945, 5606, 7384,
                 6629,
                 8024, 679, 7370, 1912, 7474, 5549, 3289, 2476, 8039, 1282, 753, 2900, 5499, 4413, 4142, 107, 8024,
                 5440,
                 5991,
                 678, 3867, 1265, 6887, 1139, 6117, 1333, 1728, 679, 5543, 3209, 4802, 8024, 5314, 750, 107, 3632, 6117,
                 510,
                 5272, 3633, 6577, 6117, 510, 2829, 7000, 107, 5023, 2190, 4568, 1905, 4415, 1400, 4568, 4307, 1962,
                 6760,
                 1139,
                 7368, 8024, 3313, 1086, 3633, 6226, 3466, 3389, 1350, 3780, 4545, 511, 1057, 7368, 1288, 1921, 1184,
                 8024,
                 2642, 5442, 1086, 3613, 6237, 7946, 1920, 912, 122, 3613, 8024, 7946, 5682, 2768, 2501, 1920, 912,
                 8024,
                 7030,
                 5276, 126, 121, 149, 8024, 3187, 7945, 3890, 5555, 6117, 8024, 3187, 2626, 2552, 510, 1445, 1402, 510,
                 1445,
                 6117, 8024, 3187, 3238, 1335, 510, 7946, 3310, 8024, 3187, 1139, 1107, 3731, 510, 2552, 2654, 6134,
                 4385,
                 511,
                 754, 2769, 7368, 7305, 6402, 3389, 107, 1920, 912, 2382, 6226, 8038, 7946, 5682, 100, 6763, 912, 8024,
                 7391,
                 6117, 6407, 7741, 100, 7345, 2595, 8020, 116, 8021, 511, 107, 711, 6822, 671, 3635, 2218, 6402, 8024,
                 6876,
                 754, 2769, 7368, 2218, 6402, 8024, 2877, 107, 3867, 1265, 6887, 1139, 6117, 107, 3119, 1057, 857, 7368,
                 511,
                 4567, 4923, 704, 5125, 4868, 510, 4717, 4697, 3612, 881, 8024, 7650, 7608, 2213, 1377, 8024, 1920, 912,
                 1963,
                 678, 6835, 8024, 2207, 912, 3633, 2382, 511, 860, 7028, 3187, 3209, 3227, 1121, 6768, 511],  # x (token ids) of sample 1
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
                 0,
                 0, 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 24, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21,
                 23,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14,
                 14,
                 14,
                 14, 14, 15, 0, 0, 0, 13, 14, 14, 15, 0, 0, 13, 14, 15, 0, 0, 13, 14, 14, 14, 14, 15, 0, 0, 0, 0, 0, 0,
                 0,
                 0, 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14,
                 14,
                 15,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 23, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 15, 0, 13, 14,
                 14,
                 14,
                 14, 15, 0, 0, 0, 0, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 15,
                 0,
                 0, 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2,
                 3,
                 0, 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14, 15, 0, 0, 0, 0, 0, 0,
                 0,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                # y (label ids) of sample 1
            ),  # sample 1
            (
                [2642, 5442, 1057, 7368, 1184, 123, 2399, 3187, 3209, 3227, 6430, 1728, 3466, 3389, 3198, 1355, 4385,
                 139,
                 107,
                 5511, 6956, 5310, 5688, 139, 107, 8024, 2496, 3198, 2642, 5442, 3187, 3209, 3227, 1495, 1644, 510,
                 5541,
                 4578,
                 5023, 679, 6844, 8024, 2456, 6379, 2642, 5442, 2137, 3309, 7390, 6393, 2772, 6822, 671, 3635, 3466,
                 3389,
                 8024,
                 2642, 5442, 3313, 6905, 1278, 1671, 511, 1057, 7368, 1184, 122, 3299, 2642, 5442, 2697, 1495, 1644,
                 8024,
                 1914,
                 809, 2397, 1495, 711, 712, 8024, 981, 1377, 1495, 1139, 2208, 6387, 4635, 5682, 3796, 3773, 4588, 8024,
                 2400,
                 4588, 704, 2372, 6117, 8024, 3833, 1220, 1400, 2697, 3698, 1596, 8024, 3187, 4519, 2170, 1355, 4178,
                 8024,
                 3187, 5541, 4578, 8024, 3187, 4060, 4178, 4668, 3731, 5023, 679, 6844, 8024, 754, 2769, 7368, 1908,
                 3389,
                 5541,
                 6956, 100, 100, 1355, 4385, 5511, 6956, 5310, 5688, 680, 123, 121, 122, 126, 2399, 2190, 3683, 3209,
                 3227,
                 1872, 1920, 8024, 5440, 5991, 107, 2626, 2595, 5514, 4606, 107, 8024, 754, 129, 1921, 1184, 2642, 5442,
                 754,
                 115, 115, 115, 115, 115, 6121, 100, 100, 100, 120, 100, 100, 8024, 6402, 3171, 107, 2340, 5511, 677,
                 1383,
                 6818, 5511, 7305, 1905, 1350, 2340, 5511, 677, 1383, 1400, 1825, 2419, 3667, 1304, 855, 2595, 4567,
                 1359,
                 8024,
                 807, 6468, 3833, 2595, 1872, 7770, 8024, 5440, 5991, 2626, 2595, 5514, 4606, 1377, 5543, 107, 8024,
                 3466,
                 3389,
                 2496, 3241, 2642, 5442, 4960, 1355, 5541, 4578, 8024, 809, 1187, 4960, 677, 1196, 4164, 4563, 4578,
                 8024,
                 2400,
                 5541, 7315, 510, 1920, 3731, 3900, 4027, 8024, 754, 115, 115, 115, 115, 115, 2593, 6402, 2218, 6402,
                 8024,
                 5440, 5991, 107, 2593, 2595, 677, 1880, 510, 1400, 1880, 510, 2340, 2147, 2552, 5491, 3453, 3647, 107,
                 8024,
                 2593, 6402, 6121, 1355, 4385, 2340, 1094, 6818, 704, 3667, 2130, 1059, 7308, 1853, 8024, 3490, 1057,
                 3118,
                 3373, 122, 3361, 8024, 3318, 1400, 2642, 5442, 5541, 4578, 1962, 6760, 8024, 5326, 5330, 1094, 2552,
                 4567,
                 753,
                 5277, 7564, 7344, 2487, 1265, 3780, 4545, 8024, 3634, 3613, 857, 7368, 6435, 6421, 7368, 1461, 1429,
                 1079,
                 4906, 833, 6402, 8024, 5440, 5991, 107, 3300, 1158, 3466, 3389, 107, 7599, 7372, 6772, 7770, 8024,
                 3313,
                 2456,
                 6379, 6822, 671, 3635, 3466, 3389, 1350, 4294, 3654, 3780, 4545, 8024, 1057, 7368, 1184, 122, 1921,
                 2642,
                 5442,
                 5632, 6421, 7368, 2552, 1079, 4906, 1139, 7368, 8024, 852, 2642, 5442, 793, 6230, 1495, 1644, 8024,
                 1492,
                 4588,
                 8024, 4588, 704, 3187, 1139, 6117, 8024, 791, 3189, 1168, 2769, 7368, 2218, 6402, 8024, 711, 6822, 671,
                 3635,
                 3780, 4545, 3119, 1057, 2769, 4906, 857, 7368, 511, 511, 4567, 4923, 704, 2642, 5442, 5125, 4868, 510,
                 7650,
                 7608, 671, 5663, 8024, 1920, 2207, 912, 3633, 2382, 8024, 860, 7028, 3187, 3209, 3227, 677, 7360, 511],
                # x (token ids) of sample 2
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0,
                 0, 0,
                 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0,
                 0, 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0,
                 0,
                 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 7, 0, 0, 13, 14, 14, 15, 0, 0,
                 0,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7,
                 0, 5,
                 7,
                 0, 0, 0, 0, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
                 14,
                 14,
                 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 21, 22,
                 23,
                 0,
                 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14, 14,
                 14,
                 14,
                 14, 14, 14, 14, 14, 14, 15, 0, 0, 0, 0, 0, 0, 0, 21, 22, 22, 22, 23, 0, 0, 0, 0, 0, 9, 10, 10, 11, 0,
                 0, 0,
                 0,
                 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0,
                 0, 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0,
                 0,
                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                 0]
                # y (label ids) of sample 2
            )  # sample 2
        ]
        # Pad every sample up to the length of the longest token list in the batch.
        max_length = max([len(t[0]) for t in batch])
        x, y, mask = [], [], []
        for i in range(len(batch)):
            # Work on copies so the literal batch above is left untouched.
            _x, _y = copy.deepcopy(batch[i])
            len_current_record = len(_x)
            # False for the sample's own tokens, True for the padding appended below.
            _mask = [False] * len_current_record
            if len_current_record < max_length:
                _x.extend([token_padding_idx] * (max_length - len_current_record))
                _y.extend([target_padding_idx] * (max_length - len_current_record))
                _mask.extend([True] * (max_length - len_current_record))
            x.append(_x)
            y.append(_y)
            mask.append(_mask)
        token_ids = torch.tensor(x)
        token_masks = torch.tensor(mask)
        target_ids = torch.tensor(y)
        return token_ids, token_masks, target_ids
    net = TransformerEncoderSoftmaxNerModel(
        vocab_size=21128,
        hidden_size=128,
        encoder_num_head=4,
        encoder_num_layers=2,
        num_class=25
    )
    # reduction='none' keeps one loss value per token; positions equal to -100 are ignored.
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
    token_ids, token_masks, target_ids = get_batch_record()
    print(token_ids)
    print(target_ids)
    print(token_masks)
    score = net(token_ids, token_masks)
    print(score.shape)
    # target_ids: [N,T]
    # score: [N,T,num_class] -> [N,num_class,T] (class dimension moved to position 1 for the loss)
    loss = loss_fn(torch.transpose(score, dim0=2, dim1=1), target_ids)
    print(loss)
3.7 training.py
# -*- coding: utf-8 -*-
import os
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from NamedEntityRecognition.medical_ner.src.medical_ner.datas.bmeso_ner_labels import load_label_categories
from NamedEntityRecognition.medical_ner.src.medical_ner.datas.json_ner_dataloader import create_train_dataloader, create_eval_dataloader
from NamedEntityRecognition.medical_ner.src.medical_ner.losses.loss import CustomCrossEntropyLoss
from NamedEntityRecognition.medical_ner.src.medical_ner.metrics.accuracy import token_accuracy, token_accuracy_v2, \
calc_token_accuracy_number_v2, div
from NamedEntityRecognition.medical_ner.src.medical_ner.models.ner_bert_softmax import TransformerEncoderSoftmaxNerModel
from transformers import BertTokenizer
def train(
        train_data_path, eval_data_path, bert_tokenizer_path, label_path,
        model_save_path, summary_log_dir,
        batch_size=8, total_epoch=10,
        hidden_size=128, encoder_num_head=4, encoder_num_layers=2,
        target_padding_idx=-100,
        lr=0.01
):
    """
    Train a Transformer-encoder based named-entity-recognition (NER) model.

    After every epoch the model is evaluated; the weights are always written to
    ``last.pkl`` and additionally to ``best.pkl`` whenever the overall evaluation
    token accuracy improves.

    :param train_data_path: path of the training data file
    :param eval_data_path: path of the evaluation data file
    :param bert_tokenizer_path: path of the pretrained BERT tokenizer
    :param label_path: path of the label-category mapping file
    :param model_save_path: directory the model files are written to
    :param summary_log_dir: directory the TensorBoard logs are written to
    :param batch_size: samples per training batch (evaluation uses twice as many), default 8
    :param total_epoch: number of training epochs, default 10
    :param hidden_size: hidden width of the model, default 128
    :param encoder_num_head: number of attention heads, default 4
    :param encoder_num_layers: number of encoder layers, default 2
    :param target_padding_idx: label id used to pad targets, default -100
    :param lr: learning rate of the SGD optimizer, default 0.01
    """
    # Make sure the output directories exist.
    os.makedirs(model_save_path, exist_ok=True)
    os.makedirs(summary_log_dir, exist_ok=True)

    # Dependencies: tokenizer, label mapping and the two checkpoint paths.
    tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_path)
    label_categories = load_label_categories(label_path)
    best_dump_path = os.path.join(model_save_path, "best.pkl")
    last_dump_path = os.path.join(model_save_path, "last.pkl")

    # 1. Data loading
    train_dataloader = create_train_dataloader(
        train_data_path, tokenizer, label_categories, batch_size,
        target_padding_idx=target_padding_idx
    )
    eval_dataloader = create_eval_dataloader(
        eval_data_path, tokenizer, label_categories, batch_size * 2,
        target_padding_idx=target_padding_idx
    )

    # 2. Model, loss and optimizer
    net = TransformerEncoderSoftmaxNerModel(
        vocab_size=tokenizer.vocab_size,
        hidden_size=hidden_size,
        encoder_num_head=encoder_num_head,
        encoder_num_layers=encoder_num_layers,
        num_class=len(label_categories)
    )

    # Requires the tensorboard package (pip install tensorboard==2.12.3).
    # View the logs with:  tensorboard --logdir <summary_log_dir>
    writer = SummaryWriter(log_dir=summary_log_dir)
    try:
        loss_fn = CustomCrossEntropyLoss(ignore_index=target_padding_idx, summary_writer=writer)
        train_fn = optim.SGD(params=net.parameters(), lr=lr)

        # 3/4/5. Train, evaluate and persist once per epoch
        best_acc = float('-inf')
        # Running batch counter: gives every training scalar its own x position.
        global_step = 0
        for epoch in range(total_epoch):
            # Training for the current epoch
            net.train()
            train_fn.zero_grad()
            for batch, (x, mask, y) in enumerate(train_dataloader):
                # a. forward pass
                score = net(token_ids=x, token_masks=mask)  # score: [N,T,num_class]
                loss = loss_fn(torch.permute(score, dims=(0, 2, 1)), y)
                acc1 = token_accuracy(score, y, target_masks=mask)
                negative_acc, positive_acc, acc2 = token_accuracy_v2(score, y, target_masks=mask)

                # b. backward pass and parameter update
                loss.backward()
                train_fn.step()
                train_fn.zero_grad()

                # c. logging
                writer.add_scalar('train_loss', loss.item(), global_step=global_step)
                writer.add_scalars('train_acc', {
                    'acc': acc2,
                    'positive': positive_acc,
                    'negative': negative_acc
                }, global_step=global_step)
                global_step += 1
                print(f"Epoch {epoch}/{total_epoch} Batch {batch} Loss {loss.item():.5f} "
                      f"Token Accuracy {acc1:.3f} - {negative_acc:.3f} - {positive_acc:.3f} - {acc2:.3f}")

            # Evaluation for the current epoch
            net.eval()
            with torch.no_grad():
                # [negative hits, negative total, positive hits, positive total]
                eval_acc_number = [0.0, 0.0, 0.0, 0.0]
                for x, mask, y in eval_dataloader:
                    score = net(token_ids=x, token_masks=mask)  # score: [N,?,num_class]
                    acc_number = calc_token_accuracy_number_v2(
                        score, y, mask
                    )
                    for i in range(4):
                        eval_acc_number[i] += acc_number[i]
                eval_negative_acc = div(eval_acc_number[0], eval_acc_number[1])
                eval_positive_acc = div(eval_acc_number[2], eval_acc_number[3])
                eval_acc = div(eval_acc_number[0] + eval_acc_number[2], eval_acc_number[1] + eval_acc_number[3])
                print(f"Epoch {epoch}/{total_epoch} "
                      f"Eval Token Accuracy {eval_negative_acc:.3f} - {eval_positive_acc:.3f} - {eval_acc:.3f}")
                writer.add_scalars('eval_acc', {
                    'acc': eval_acc,
                    'positive': eval_positive_acc,
                    'negative': eval_negative_acc
                }, global_step=epoch)

            # Persistence
            save_obj = {
                'net': net.state_dict(),  # model parameters
                'epoch': epoch,  # epoch these parameters belong to
                'best_accuracy': eval_acc  # evaluation accuracy of this epoch
            }
            if eval_acc > best_acc:
                print(f"保存最优模型为 {epoch} {eval_acc} {best_dump_path}")
                torch.save(save_obj, best_dump_path)
                best_acc = eval_acc
            # The latest state is always stored, best or not.
            torch.save(save_obj, last_dump_path)
    finally:
        # Flush and release the log writer even if training is interrupted.
        writer.close()
3.8 training_softmax_ner_model.py (运行)
# -*- coding: utf-8 -*-
import sys
print(sys.path)
# from medical_ner.ner_softmax.training import train
#from NamedEntityRecognition.medical_ner.src.medical_ner.ner_softmax.bert_training import train, export
from NamedEntityRecognition.medical_ner.src.medical_ner.ner_softmax.training import train
if __name__ == '__main__':
    # Entry point: train the softmax NER model on the medical data set.
    # Data and output paths are relative to the current working directory;
    # the tokenizer path points at a local copy of bert-base-chinese.
    train(
        train_data_path="./datas/medical/training.txt",
        eval_data_path="./datas/medical/test.json",
        bert_tokenizer_path=r"C:\Users\.cache\huggingface\hub\models--bert-base-chinese\snapshots\c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f",
        label_path="./datas/medical/categories.json",
        model_save_path="./output/medical/ner_softmax/models",
        summary_log_dir="./output/medical/ner_softmax/logs",
    )
运行结果:生成output日志
tensorboard 使用
TensorBoard 是一款可视化工具,可以与 PyTorch 结合使用,帮助开发者和研究人员直观地分析和理解机器学习模型的训练过程。它提供了丰富的可视化功能,包括标量(Scalars)、图像(Images)、图形(Graphs)、直方图(Histograms)、嵌入(Embeddings)等。
需要先安装 tensorboard 库(一般情况下,安装 tensorflow 时会自动安装这个库):`pip install tensorboard==2.12.3`。要查看可视化页面,在命令行执行以下命令:`tensorboard --logdir log_dir`,其中 log_dir 为如图所示日志目录的绝对路径。
复制本地地址到浏览器