A Name Nationality Classifier Based on BiLSTM and Other RNN Sequence Models
Kaggle link to the dataset:
NameNationalLanguage | Kaggle
Dataset layout:
The first column is the person's name, the second column is the country label.
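A quick way to verify this layout is to peek at the file with pandas (a minimal sketch, assuming the same name_classfication.txt file that the training code below reads):

import pandas as pd

# First column: name, second column: country label (tab-separated)
data = pd.read_csv('name_classfication.txt', sep='\t', names=['name', 'country'])
print(data.head())
print(data['country'].value_counts())  # distribution over the 18 country labels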
Open-source code
Kaggle link:
https://www.kaggle.com/code/houjijin/name-nationality-classification
Gitee link:
Name Nation classification: using BILSTM to predict an individual's nationality from their name
GitHub link:
GitHub - Foxbabe1q/Name-Nation-classification: Use BILSTM to do the classification of individuals by their names
Defining the RNN sequence model classes
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use Apple's MPS backend when available, otherwise fall back to the CPU
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.output_size = 18  # 18 country classes in the dataset
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, hidden):
        output, hidden = self.rnn(x, hidden)
        output = output[:, -1, :]  # keep only the last time step
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)
        return hidden

class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.output_size = 18
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, hidden, c):
        output, (hidden, c) = self.rnn(x, (hidden, c))
        output = output[:, -1, :]
        output = self.fc(output)
        return output, hidden, c

    def init_hidden(self, batch_size):
        # The LSTM needs both a hidden state and a cell state
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)
        return hidden, c0

class SimpleBILSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SimpleBILSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.output_size = 18
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        # Bidirectional: the forward and backward outputs are concatenated
        self.fc = nn.Linear(self.hidden_size * 2, self.output_size)

    def forward(self, x, hidden, c):
        output, (hidden, c) = self.rnn(x, (hidden, c))
        output = output[:, -1, :]
        output = self.fc(output)
        return output, hidden, c

    def init_hidden(self, batch_size):
        # Two directions double the layer dimension of the states
        hidden = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size, device=device)
        c0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size, device=device)
        return hidden, c0

class SimpleGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SimpleGRU, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.output_size = 18
        self.rnn = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, hidden):
        output, hidden = self.rnn(x, hidden)
        output = output[:, -1, :]
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)
        return hidden
Note that in the BILSTM class, a bidirectional LSTM runs two LSTMs, one over the forward sequence and one over the reversed sequence. The hidden state and the cell state must therefore be initialized with num_layers * 2 layers, and the input of the final linear layer grows to hidden_size * 2, since the outputs of the two directions are concatenated.
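A quick shape check of this point, using nn.LSTM directly (a standalone sketch; 57 matches len(letters) below and 128 is the hidden size used for training):

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=57, hidden_size=128, num_layers=2, batch_first=True, bidirectional=True)
x = torch.zeros(4, 10, 57)       # (batch, seq_len, input_size)
h0 = torch.zeros(2 * 2, 4, 128)  # (num_layers * 2, batch, hidden_size)
c0 = torch.zeros(2 * 2, 4, 128)
output, (hn, cn) = rnn(x, (h0, c0))
print(output.shape)  # torch.Size([4, 10, 256]): features doubled to hidden_size * 2
print(hn.shape)      # torch.Size([4, 4, 128]): num_layers * 2 state layers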
Imports
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from RNN_Series1 import SimpleRNN, SimpleLSTM, SimpleGRU, SimpleBILSTM
from torch.utils.data import Dataset, DataLoader
import string
from sklearn.preprocessing import LabelEncoder
import time
Character vocabulary and device definition
letters = string.ascii_letters + " .,;'"  # 52 ASCII letters plus " .,;'": 57 symbols in total
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
Loading the data and encoding the label column
def load_data():
    # Tab-separated file: one name and one country label per line
    data = pd.read_csv('name_classfication.txt', sep='\t', names=['name', 'country'])
    X = data[['name']]
    lb = LabelEncoder()
    y = lb.fit_transform(data['country'])
    # Stratified split preserves the class distribution in both sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    return X_train, X_test, y_train, y_test
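One caveat: load_data fits the LabelEncoder but does not return it, so predicted class indices cannot be mapped back to country names afterwards. A variant that also returns the encoder might look like this (load_data_with_encoder is my own sketch, not code from the repository):

def load_data_with_encoder():
    data = pd.read_csv('name_classfication.txt', sep='\t', names=['name', 'country'])
    lb = LabelEncoder()
    y = lb.fit_transform(data['country'])
    X_train, X_test, y_train, y_test = train_test_split(
        data[['name']], y, test_size=0.2, random_state=42, stratify=y)
    return X_train, X_test, y_train, y_test, lb

# lb.inverse_transform([idx]) then recovers the country name for a predicted index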
Dataset definition
class create_dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.length = len(self.X)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # One-hot encode up to the first 10 characters of the name
        data = torch.zeros(10, len(letters), dtype=torch.float, device=device)
        for i, letter in enumerate(self.X.iloc[idx, 0]):
            if i == 10:
                break
            if letter in letters:  # skip characters outside the vocabulary
                data[i, letters.index(letter)] = 1
        label = torch.tensor(self.y[idx], dtype=torch.long, device=device)
        return data, label
Each character is one-hot encoded over the vocabulary above. Since names vary in length, the length distribution was examined and 10 was chosen as the truncation length; shorter names leave the remaining rows as zeros.
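The cutoff of 10 can be sanity-checked against the actual length distribution (a small sketch over the same file):

import pandas as pd

data = pd.read_csv('name_classfication.txt', sep='\t', names=['name', 'country'])
lengths = data['name'].str.len()
print(lengths.describe())      # summary statistics of name lengths
print((lengths <= 10).mean())  # fraction of names that fit without truncation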
Training with the RNN
def train_rnn():
    X_train, X_test, y_train, y_test = load_data()
    # Summed loss, normalized by the sample count below
    criterion = nn.CrossEntropyLoss(reduction='sum')
    loss_list = []
    acc_list = []
    val_acc_list = []
    val_loss_list = []
    epochs = 10
    my_dataset = create_dataset(X_train, y_train)
    val_dataset = create_dataset(X_test, y_test)
    my_dataloader = DataLoader(my_dataset, batch_size=64, shuffle=True)
    # One full-size validation batch; shuffling is unnecessary here
    val_dataloader = DataLoader(val_dataset, batch_size=len(y_test), shuffle=False)
    my_rnn = SimpleRNN(len(letters), 128, 2)
    my_rnn.to(device)
    optimizer = torch.optim.Adam(my_rnn.parameters(), lr=0.001)
    start_time = time.time()
    for epoch in range(epochs):
        my_rnn.train()
        total_loss = 0
        total_acc = 0
        total_sample = 0
        for i, (X, y) in enumerate(my_dataloader):
            # Fresh zero hidden state for every batch
            output, hidden = my_rnn(X, my_rnn.init_hidden(batch_size=len(y)))
            total_sample += len(y)
            loss = criterion(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            prediction = output.argmax(dim=1)
            acc_num = torch.sum(prediction == y).item()
            total_acc += acc_num
        loss_list.append(total_loss / total_sample)
        acc_list.append(total_acc / total_sample)
        my_rnn.eval()
        with torch.no_grad():
            for i, (X_val, y_val) in enumerate(val_dataloader):
                output, hidden = my_rnn(X_val, my_rnn.init_hidden(batch_size=len(y_val)))
                loss = criterion(output, y_val)
                prediction = output.argmax(dim=1)
                acc_num = torch.sum(prediction == y_val).item()
                val_acc_list.append(acc_num / len(y_val))
                val_loss_list.append(loss.item() / len(y_val))
                print(f'epoch: {epoch+1}, train_loss: {total_loss/total_sample:.2f}, train_acc: {total_acc/total_sample:.2f}, val_loss: {loss.item()/len(y_val):.2f}, val_acc: {acc_num/len(y_val):.2f}, time: {time.time() - start_time:.2f}')
    torch.save(my_rnn.state_dict(), 'rnn.pt')
    plt.plot(np.arange(1, epochs + 1), loss_list, label='Training Loss')
    plt.plot(np.arange(1, epochs + 1), val_loss_list, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.xticks(np.arange(1, epochs + 1))
    plt.title('Loss')
    plt.legend()
    plt.savefig('loss.png')
    plt.show()
    plt.plot(np.arange(1, epochs + 1), acc_list, label='Training Accuracy')
    plt.plot(np.arange(1, epochs + 1), val_acc_list, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.xticks(np.arange(1, epochs + 1))
    plt.title('Accuracy')
    plt.legend()
    plt.savefig('accuracy.png')
    plt.show()
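Once rnn.pt is saved, single-name inference is straightforward. Below is a minimal sketch (predict_name is a hypothetical helper of mine, reusing the one-hot scheme from create_dataset; mapping the returned index back to a country name requires the fitted LabelEncoder):

def predict_name(name, model):
    # One-hot encode up to the first 10 characters, same as create_dataset
    x = torch.zeros(1, 10, len(letters), dtype=torch.float, device=device)
    for i, ch in enumerate(name[:10]):
        if ch in letters:
            x[0, i, letters.index(ch)] = 1
    with torch.no_grad():
        output, _ = model(x, model.init_hidden(batch_size=1))
    return output.argmax(dim=1).item()  # predicted class index

model = SimpleRNN(len(letters), 128, 2).to(device)
model.load_state_dict(torch.load('rnn.pt', map_location=device))
model.eval()
print(predict_name('Nakamura', model))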
Training with the BiLSTM
def train_bilstm():
    X_train, X_test, y_train, y_test = load_data()
    criterion = nn.CrossEntropyLoss(reduction='sum')
    loss_list = []
    acc_list = []
    val_acc_list = []
    val_loss_list = []
    epochs = 10
    my_dataset = create_dataset(X_train, y_train)
    val_dataset = create_dataset(X_test, y_test)
    my_dataloader = DataLoader(my_dataset, batch_size=64, shuffle=True)
    # One full-size validation batch; shuffling is unnecessary here
    val_dataloader = DataLoader(val_dataset, batch_size=len(y_test), shuffle=False)
    my_rnn = SimpleBILSTM(len(letters), 128, 2)
    my_rnn.to(device)
    optimizer = torch.optim.Adam(my_rnn.parameters(), lr=0.001)
    start_time = time.time()
    for epoch in range(epochs):
        my_rnn.train()
        total_loss = 0
        total_acc = 0
        total_sample = 0
        for i, (X, y) in enumerate(my_dataloader):
            # Fresh zero hidden and cell states for every batch
            hidden, c0 = my_rnn.init_hidden(batch_size=len(y))
            output, hidden, c = my_rnn(X, hidden, c0)
            total_sample += len(y)
            loss = criterion(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            prediction = output.argmax(dim=1)
            acc_num = torch.sum(prediction == y).item()
            total_acc += acc_num
        loss_list.append(total_loss / total_sample)
        acc_list.append(total_acc / total_sample)
        my_rnn.eval()
        with torch.no_grad():
            for i, (X_val, y_val) in enumerate(val_dataloader):
                hidden, c0 = my_rnn.init_hidden(batch_size=len(y_val))
                output, hidden, c = my_rnn(X_val, hidden, c0)
                loss = criterion(output, y_val)
                prediction = output.argmax(dim=1)
                acc_num = torch.sum(prediction == y_val).item()
                val_acc_list.append(acc_num / len(y_val))
                val_loss_list.append(loss.item() / len(y_val))
                print(f'epoch: {epoch+1}, train_loss: {total_loss/total_sample:.2f}, train_acc: {total_acc/total_sample:.2f}, val_loss: {loss.item()/len(y_val):.2f}, val_acc: {acc_num/len(y_val):.2f}, time: {time.time() - start_time:.2f}')
    torch.save(my_rnn.state_dict(), 'bilstm.pt')
    plt.plot(np.arange(1, epochs + 1), loss_list, label='Training Loss')
    plt.plot(np.arange(1, epochs + 1), val_loss_list, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.xticks(np.arange(1, epochs + 1))
    plt.title('Loss')
    plt.legend()
    plt.savefig('loss.png')
    plt.show()
    plt.plot(np.arange(1, epochs + 1), acc_list, label='Training Accuracy')
    plt.plot(np.arange(1, epochs + 1), val_acc_list, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.xticks(np.arange(1, epochs + 1))
    plt.title('Accuracy')
    plt.legend()
    plt.savefig('accuracy.png')
    plt.show()
The hyperparameters here are epochs: 10, lr: 1e-3, with the Adam optimizer. Training output:
epoch: 1, train_loss: 1.70, train_acc: 0.51, val_loss: 1.50, val_acc: 0.56, time: 11.83
epoch: 2, train_loss: 1.36, train_acc: 0.60, val_loss: 1.25, val_acc: 0.64, time: 22.84
epoch: 3, train_loss: 1.19, train_acc: 0.65, val_loss: 1.10, val_acc: 0.69, time: 33.76
epoch: 4, train_loss: 1.05, train_acc: 0.69, val_loss: 0.97, val_acc: 0.72, time: 44.63
epoch: 5, train_loss: 0.93, train_acc: 0.73, val_loss: 0.91, val_acc: 0.74, time: 55.49
epoch: 6, train_loss: 0.85, train_acc: 0.75, val_loss: 0.85, val_acc: 0.75, time: 66.38
epoch: 7, train_loss: 0.78, train_acc: 0.77, val_loss: 0.78, val_acc: 0.77, time: 77.38
epoch: 8, train_loss: 0.73, train_acc: 0.78, val_loss: 0.75, val_acc: 0.77, time: 88.27
epoch: 9, train_loss: 0.68, train_acc: 0.79, val_loss: 0.71, val_acc: 0.78, time: 99.44
epoch: 10, train_loss: 0.64, train_acc: 0.80, val_loss: 0.72, val_acc: 0.78, time: 110.43
The complete code is available at the Kaggle, Gitee, and GitHub links above. The Gitee and GitHub repositories also include trained model weights, which can be loaded directly after instantiating a model.
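For example, the BiLSTM weights can be loaded like this (a sketch assuming bilstm.pt has been downloaded into the working directory):

# Instantiate with the same sizes used for training, then load the weights
model = SimpleBILSTM(len(letters), 128, 2).to(device)
model.load_state_dict(torch.load('bilstm.pt', map_location=device))
model.eval()  # switch to inference mode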
To train with the other RNN sequence models, such as the LSTM and GRU, simply instantiate the corresponding classes defined above and train them in the same way.
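Note that the forward signatures differ: SimpleLSTM, like SimpleBILSTM, also takes and returns a cell state, while SimpleGRU follows the same call pattern as SimpleRNN. A quick smoke test with a dummy batch (assuming the classes, letters, and device defined above):

x = torch.zeros(4, 10, len(letters), device=device)  # dummy batch of 4 names

my_lstm = SimpleLSTM(len(letters), 128, 2).to(device)
h0, c0 = my_lstm.init_hidden(batch_size=4)
output, h, c = my_lstm(x, h0, c0)  # LSTM carries a cell state
print(output.shape)                # torch.Size([4, 18])

my_gru = SimpleGRU(len(letters), 128, 2).to(device)
h0 = my_gru.init_hidden(batch_size=4)
output, h = my_gru(x, h0)          # GRU uses a hidden state only
print(output.shape)                # torch.Size([4, 18])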