Chapter 6 -Fine-tuning for classification
- 引入不同的LLM微调方法
- 准备用于文本分类的数据集
- 修改预训练的 LLM 进行微调
- 微调 LLM 以识别垃圾邮件
- 评估微调LLM分类器的准确性
- 使用微调的 LLM 对新数据进行分类
用于分类的微调(步骤 8)
和用于遵循指令的微调(步骤 9)。
6.1-Different categories of fine-tuning
微调语言模型的最常见方式是指令微调(instruction-finetuning)和分类微调(classification finetuning)
关键点在于,经过分类微调的模型仅限于预测其在训练过程中遇到的类别。例如,它可以判断某条文本是“垃圾邮件”还是“非垃圾邮件”,但无法对输入文本做出其他判断。下图展示了一个使用大语言模型(LLM)进行文本分类的场景:经过垃圾邮件分类微调的模型不需要在输入之外提供额外指令;与指令微调模型不同,它只能响应“垃圾邮件”或“非垃圾邮件”。
6.2-Preparing the dataset
如下图所示,分类微调 LLM 的三阶段过程
import urllib.request
import zipfile
import os
from pathlib import Path

# NOTE(review): `url` and `zip_path` are empty strings in this copy of the
# notes -- presumably stripped during export. Fill in the dataset archive URL
# and a local zip file name before running; confirm against the original
# chapter code.
url = ""
zip_path = ""
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"


def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the zipped dataset, extract it, and add a .tsv extension.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``Path`` of the final ``.tsv`` file. If it already
            exists, the function prints a message and returns early.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")


download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
import pandas as pd download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) df = pd.read_csv(data_file_path, sep='\t', header=None, names=["Label", "Text"]) df
当我们检查类分布时,我们看到数据包含“ham”(即“not spam”)的频率比“spam”高得多
print(df["Label"].value_counts()) """输出""" Label ham 4825 spam 747 Name: count, dtype: int64
def create_balanced_dataset(df):
    """Undersample "ham" rows so both classes have the same number of rows.

    Args:
        df: DataFrame with a "Label" column containing "ham" / "spam".

    Returns:
        A new DataFrame (fresh 0..n-1 index) holding a random "ham" sample of
        the same size as the "spam" subset, followed by all "spam" rows.
    """
    # Count the instances of "spam"
    num_spam = df[df["Label"] == "spam"].shape[0]

    # Randomly sample "ham" instances to match the number of "spam" instances
    # (fixed random_state keeps the sample reproducible)
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)

    # Combine the "ham" subset with all "spam" rows
    balanced_df = pd.concat(
        [ham_subset, df[df["Label"] == "spam"]],
        ignore_index=True,
    )
    return balanced_df


balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

# Output:
# Label
# ham     747
# spam    747
# Name: count, dtype: int64
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1}) balanced_df
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and split it into train / validation / test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training split.
        validation_frac: Fraction of rows assigned to the validation split.

    Returns:
        ``(train_df, validation_df, test_df)``; the test split receives all
        rows that remain after the first two splits.
    """
    # Shuffle the entire DataFrame (fixed seed for reproducibility)
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df


train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder

train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)
6.3-Creating data loaders
- 将所有消息截断为数据集或批处理中最短消息的长度
- 将所有消息填充到数据集或批处理中最长消息的长度
tokenizer = tiktoken.get_encoding("gpt2") print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})) """输出""" [50256]
import torch
from torch.utils.data import Dataset


class SpamDataset(Dataset):
    """Dataset of tokenized text messages, truncated/padded to one length.

    The CSV file must provide a "Text" column (the message) and a "Label"
    column (the integer class label).
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        """Read ``csv_file``, tokenize every text, then truncate and pad.

        Args:
            csv_file: Path of the CSV file to load with ``pd.read_csv``.
            tokenizer: Object with an ``encode(text)`` method returning a
                list of token IDs.
            max_length: Target sequence length. ``None`` means "length of the
                longest encoded text in this file".
            pad_token_id: Token ID appended to sequences shorter than
                ``max_length``.
        """
        self.data = pd.read_csv(csv_file)

        # Pre-tokenize texts
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as long tensors."""
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text (0 if there are none)."""
        return max((len(encoded_text) for encoded_text in self.encoded_texts), default=0)
train_dataset = SpamDataset( csv_file="train.csv", max_length=None, tokenizer=tokenizer ) print(train_dataset.max_length) """输出""" 120
。val_dataset = SpamDataset( csv_file="validation.csv", max_length=train_dataset.max_length, tokenizer=tokenizer ) test_dataset = SpamDataset( csv_file="test.csv", max_length=train_dataset.max_length, tokenizer=tokenizer )
接下来,我们使用数据集实例化数据加载器,如下图,单个训练批次由表示为令牌 ID 的八个文本消息组成。每个文本消息由 120 个令牌 ID 组成。
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

# Seed before building the loaders so the training shuffle is reproducible
torch.manual_seed(123)

# Training batches are shuffled; the last incomplete batch is dropped
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

# Validation and test loaders keep every example (no shuffling, no dropping)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
print("Train loader:") for input_batch, target_batch in train_loader: pass print("Input batch dimensions:", input_batch.shape) print("Label batch dimensions", target_batch.shape) """输出""" Train loader: Input batch dimensions: torch.Size([8, 120]) Label batch dimensions torch.Size([8])
print(f"{len(train_loader)} training batches") print(f"{len(val_loader)} validation batches") print(f"{len(test_loader)} test batches") """输出""" 130 training batches 19 validation batches 38 test batches
6.4-Initializing a model with pretrained weights
完成第 1 阶段(准备数据集)后,我们现在必须初始化 LLM,然后我们将对其进行微调以对垃圾邮件进行分类。
CHOOSE_MODEL = "gpt2-small (124M)" INPUT_PROMPT = "Every effort moves" BASE_CONFIG = { "vocab_size": 50257, # Vocabulary size "context_length": 1024, # Context length "drop_rate": 0.0, # Dropout rate "qkv_bias": True # Query-key-value bias } model_configs = { "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12}, "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16}, "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20}, "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25}, } BASE_CONFIG.update(model_configs[CHOOSE_MODEL]) assert train_dataset.max_length <= BASE_CONFIG["context_length"], ( f"Dataset length {train_dataset.max_length} exceeds model's context " f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with " f"`max_length={BASE_CONFIG['context_length']}`" )
from gpt_download import download_and_load_gpt2 from previous_chapters import GPTModel, load_weights_into_gpt model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")") settings, params = download_and_load_gpt2(model_size=model_size, models_dir="E:\\LLM\\gpt2\\") # models_dir为gpt2模型下载的存放路径 model = GPTModel(BASE_CONFIG) load_weights_into_gpt(model, params) model.eval();
from previous_chapters import ( generate_text_simple, text_to_token_ids, token_ids_to_text ) text_1 = "Every effort moves you" token_ids = generate_text_simple( model=model, idx=text_to_token_ids(text_1, tokenizer), max_new_tokens=15, context_size=BASE_CONFIG["context_length"] ) print(token_ids_to_text(token_ids, tokenizer)) """输出""" Every effort moves you forward. The first step is to understand the importance of your work
text_2 = ( "Is the following text 'spam'? Answer with 'yes' or 'no':" " 'You are a winner you have been specially" " selected to receive $1000 cash or a $2000 award.'" ) token_ids = generate_text_simple( model=model, idx=text_to_token_ids(text_2, tokenizer), max_new_tokens=23, context_size=BASE_CONFIG["context_length"] ) print(token_ids_to_text(token_ids, tokenizer)) """输出""" Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.' The following text 'spam'? Answer with 'yes' or 'no': 'You are a winner
6.5 Adding a classification head
为进行分类微调,须修改预训练的大语言模型(LLM)。我们将原本把隐藏表征映射到含50,257个词的词表的输出层,替换为一个更小、仅映射到 “0(非垃圾邮件)” 和 “1(垃圾邮件)” 这两个类别的输出层如下图所示。除输出层外,模型其余部分保持不变 。
我们的目标是替换和微调输出层, 为了实现这一点,我们首先冻结模型,这意味着我们使所有层都不可训练。
for param in model.parameters(): param.requires_grad = False
torch.manual_seed(123) num_classes = 2 model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)
新添加的 `model.out_head` 输出层的 `requires_grad` 属性默认为 `True`,这意味着它是模型中唯一一个在训练期间更新的层。从技术上讲,仅训练我们刚刚添加的输出层就足够了。然而,正如我在实验中发现的那样,微调附加层可以显著提高模型的预测性能(更多详细信息请参阅附录 B)。此外,我们还将最后一个 Transformer 块以及将该块连接到输出层的最终 LayerNorm 配置为可训练。
GPT 模型包含 12 个重复的 Transformer 块。在输出层附近,我们将最后一个 LayerNormalization 层和最后一个 Transformer 块设置为可训练(trainable),而其余 11 个 Transformer 块和嵌入层保持不变,设置为不可训练(non-trainable)。为了使最后一个 LayerNormalization 层和最后一个 Transformer 块可训练,我们将它们各自的 `requires_grad` 设置为 `True`
。for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in model.final_norm.parameters():
    param.requires_grad = True
inputs = tokenizer.encode("Do you have time") inputs = torch.tensor(inputs).unsqueeze(0) print("Inputs:", inputs) print("Inputs dimensions:", inputs.shape) # shape: (batch_size, num_tokens) """输出""" Inputs: tensor([[5211, 345, 423, 640]]) Inputs dimensions: torch.Size([1, 4])
with torch.no_grad(): outputs = model(inputs) print("Outputs:\n", outputs) print("Outputs dimensions:", outputs.shape) # shape: (batch_size, num_tokens, num_classes) """输出""" Outputs: tensor([[[-1.5854, 0.9904], [-3.7235, 7.4548], [-2.2661, 6.6049], [-3.5983, 3.9902]]]) Outputs dimensions: torch.Size([1, 4, 2])
第3章讨论了将各输入令牌相互连接的注意力机制,还介绍类GPT模型中使用的因果注意掩码(使当前令牌仅关注当前及先前令牌位置),基于此因果注意机制,最后一个令牌包含信息最多,故将对其进行微调以用于垃圾邮件分类任务 。
print("Last output token:", outputs[:, -1, :]) """输出""" Last output token: tensor([[-3.5983, 3.9902]])
6.6-Calculating the classification loss and accuracy
probas = torch.softmax(outputs[:, -1, :], dim=-1) label = torch.argmax(probas) print("Class label:", label.item()) """输出""" Class label: 1
简化代码,不使用 softmax(因为最大的输出值直接对应于最高的概率得分)。
logits = outputs[:, -1, :] label = torch.argmax(logits) print("Class label:", label.item()) """输出""" Class label: 1
这个概念可用于计算分类精度,即衡量整个数据集中正确预测的百分比。为了确定分类精度,我们将基于 argmax 的预测代码应用于数据集中的所有示例,并通过定义 `calc_accuracy_loader` 函数来计算正确预测的比例。
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Return the fraction of correctly classified examples.

    Args:
        data_loader: Sized iterable of ``(input_batch, target_batch)`` tensors.
        model: Model whose output is indexed as ``[:, -1, :]`` to get the
            class logits of the last token.
        device: Device the batches are moved to before the forward pass.
        num_batches: Evaluate at most this many batches; ``None`` evaluates
            all of them.

    Returns:
        Correct predictions divided by evaluated examples, or ``nan`` when no
        example was evaluated (mirrors ``calc_loss_loader`` on an empty loader).
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    # zip with range() stops after num_batches batches
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)

        with torch.no_grad():
            logits = model(input_batch)[:, -1, :]  # Logits of last output token
        predicted_labels = torch.argmax(logits, dim=-1)

        num_examples += predicted_labels.shape[0]
        correct_predictions += (predicted_labels == target_batch).sum().item()

    if num_examples == 0:
        # Nothing evaluated: avoid ZeroDivisionError
        return float("nan")
    return correct_predictions / num_examples
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note:
# Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# As of this writing, in PyTorch 2.4, the results obtained via CPU and MPS were identical.
# However, in earlier versions of PyTorch, you may observe different results when using MPS.

#if torch.cuda.is_available():
#    device = torch.device("cuda")
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
#else:
#    device = torch.device("cpu")
#print(f"Running on {device} device.")

# The batches are moved to `device` inside calc_accuracy_loader, so the model
# has to live on the same device.
model.to(device)  # no assignment model = model.to(device) necessary for nn.Module classes

torch.manual_seed(123)  # For reproducibility due to the shuffling in the training data loader

train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=10)
val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=10)
test_accuracy = calc_accuracy_loader(test_loader, model, device, num_batches=10)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

# Output:
# Training accuracy: 46.25%
# Validation accuracy: 45.00%
# Test accuracy: 48.75%
`calc_loss_batch` 函数仅优化最后一个令牌的输出(`model(input_batch)[:, -1, :]`)
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of one batch, using last-token logits.

    Args:
        input_batch: Tensor of token IDs passed to ``model``.
        target_batch: Tensor of integer class labels, one per example.
        model: Model whose output is indexed as ``[:, -1, :]``.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        The loss tensor from ``torch.nn.functional.cross_entropy``.
    """
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)[:, -1, :]  # Logits of last output token
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss
calc_loss_loader 和第五章一模一样
# Same as in chapter 5
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean per-batch loss over (part of) a data loader.

    Args:
        data_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Average over at most this many batches; ``None`` uses
            every batch.

    Returns:
        The summed batch losses divided by the number of batches used, or
        ``nan`` if the loader is empty.
    """
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in
        # the data loader if num_batches exceeds it
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches
使用 `calc_loss_loader`,我们在开始训练之前计算初始训练、验证和测试集损失(初始的损失值)
with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet train_loss = calc_loss_loader(train_loader, model, device, num_batches=5) val_loss = calc_loss_loader(val_loader, model, device, num_batches=5) test_loss = calc_loss_loader(test_loader, model, device, num_batches=5) print(f"Training loss: {train_loss:.3f}") print(f"Validation loss: {val_loss:.3f}") print(f"Test loss: {test_loss:.3f}") """输出""" Training loss: 2.453 Validation loss: 2.583 Test loss: 2.322
6.7-Finetuning the model on supervised data
唯一的两个区别是我们现在- 跟踪看到的训练示例的数量(‘examples_seen’),而不是看到的token数量
- 计算每个epoch后的准确性,而不是在每个epoch后打印示例文本
PyTorch 中训练深度神经网络的典型训练循环需在多轮次中遍历训练集批次,通过计算批次损失确定梯度以更新模型权重来降低损失。实现相关概念的训练函数与预训练所用的 `train_model_simple`
函数相似,区别在于跟踪训练样本数而非标记数,且每轮次后计算准确率而非打印示例文本 。 -
# Overall the same as `train_model_simple` in chapter 5
def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
                            num_epochs, eval_freq, eval_iter):
    """Fine-tune ``model`` for classification and track loss and accuracy.

    Args:
        model: Model to train; switched to training mode at each epoch start.
        train_loader: Iterable of ``(input_batch, target_batch)`` training pairs.
        val_loader: Iterable of validation pairs.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        device: Device forwarded to the loss and accuracy helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Losses are evaluated whenever ``global_step % eval_freq == 0``.
        eval_iter: Number of batches used for each loss / accuracy estimate.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``:
        losses recorded at every evaluation step, accuracies recorded after
        every epoch, and the total number of training examples processed.
    """
    # Initialize lists to track losses and examples seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            examples_seen += input_batch.shape[0]  # New: track examples instead of tokens
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Calculate accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen
# The evaluate_model function is the same as the one we used in chapter 5.
# Same as chapter 5
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate training and validation loss over ``eval_iter`` batches each.

    The model is put into evaluation mode, both losses are computed with
    gradient tracking disabled, and the model is switched back to training
    mode before returning.

    Returns:
        ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss
函数启动训练。import time start_time = time.time() torch.manual_seed(123) optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) num_epochs = 5 train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( model, train_loader, val_loader, optimizer, device, num_epochs=num_epochs, eval_freq=50, eval_iter=5, ) end_time = time.time() execution_time_minutes = (end_time - start_time) / 60 print(f"Training completed in {execution_time_minutes:.2f} minutes.") """输出""" Ep 1 (Step 000000): Train loss 2.153, Val loss 2.392 Ep 1 (Step 000050): Train loss 0.617, Val loss 0.637 Ep 1 (Step 000100): Train loss 0.523, Val loss 0.557 Training accuracy: 70.00% | Validation accuracy: 72.50% Ep 2 (Step 000150): Train loss 0.561, Val loss 0.489 Ep 2 (Step 000200): Train loss 0.419, Val loss 0.397 Ep 2 (Step 000250): Train loss 0.409, Val loss 0.353 Training accuracy: 82.50% | Validation accuracy: 85.00% Ep 3 (Step 000300): Train loss 0.333, Val loss 0.320 Ep 3 (Step 000350): Train loss 0.340, Val loss 0.306 Training accuracy: 90.00% | Validation accuracy: 90.00% Ep 4 (Step 000400): Train loss 0.136, Val loss 0.200 Ep 4 (Step 000450): Train loss 0.153, Val loss 0.132 Ep 4 (Step 000500): Train loss 0.222, Val loss 0.137 Training accuracy: 100.00% | Validation accuracy: 97.50% Ep 5 (Step 000550): Train loss 0.207, Val loss 0.143 Ep 5 (Step 000600): Train loss 0.083, Val loss 0.074 Training accuracy: 100.00% | Validation accuracy: 97.50% Training completed in 0.67 minutes.
在M3 MacBook Air笔记本电脑上训练大约需要6分钟,在V100或A100 GPU上训练不到半分钟,我的电脑是windows,显卡3060 12G,训练总时长 40s (0.67minutes)。
import matplotlib.pyplot as plt


def plot_values(epochs_seen, examples_seen, train_values, val_values, label="loss"):
    """Plot training/validation curves and save them as ``{label}-plot.pdf``.

    Args:
        epochs_seen: x positions for the lower axis ("Epochs").
        examples_seen: x positions for the upper axis ("Examples seen").
        train_values: Training metric values.
        val_values: Validation metric values.
        label: Metric name used in the legend, the y-axis label, and the
            output file name.
    """
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Plot training and validation values against epochs
    ax1.plot(epochs_seen, train_values, label=f"Training {label}")
    ax1.plot(epochs_seen, val_values, linestyle="-.", label=f"Validation {label}")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel(label.capitalize())
    ax1.legend()

    # Create a second x-axis for examples seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(examples_seen, train_values, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Examples seen")

    fig.tight_layout()  # Adjust layout to make room
    plt.savefig(f"{label}-plot.pdf")
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses)) examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses)) plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)
早些时候,当我们启动训练时,我们将训练的轮数(epochs)设置为 5。轮数的选择取决于数据集和任务的难度,虽然没有通用的解决方案或推荐值,但 5 轮通常是一个不错的起点。如果在前几轮训练后,模型出现过拟合现象,则可能需要减少轮数。相反,如果趋势线表明验证损失可能随着进一步训练而改善,则应增加轮数。在本例中,5 轮是一个合理的数值,因为没有出现早期过拟合的迹象,且验证损失接近 0。
epochs_tensor = torch.linspace(0, num_epochs, len(train_accs)) examples_seen_tensor = torch.linspace(0, examples_seen, len(train_accs)) plot_values(epochs_tensor, examples_seen_tensor, train_accs, val_accs, label="accuracy")
基于上面的准确度图,我们可以看到模型在第 4 和第 5 轮(epoch)之后实现了相对较高的训练和验证准确度。但是,我们必须记住,我们之前在训练函数中指定了 `eval_iter=5`,这意味着我们只是基于 5 个批次估计了训练集和验证集的性能。我们可以计算完整数据集上的训练、验证和测试集性能,如下所示。
train_accuracy = calc_accuracy_loader(train_loader, model, device) val_accuracy = calc_accuracy_loader(val_loader, model, device) test_accuracy = calc_accuracy_loader(test_loader, model, device) print(f"Training accuracy: {train_accuracy*100:.2f}%") print(f"Validation accuracy: {val_accuracy*100:.2f}%") print(f"Test accuracy: {test_accuracy*100:.2f}%") """输出""" Training accuracy: 97.21% Validation accuracy: 97.32% Test accuracy: 95.67%
训练集和测试集的性能几乎相同。训练集和测试集准确率之间的细微差异表明训练数据的过拟合程度极低。通常情况下,验证集的准确率会比测试集的准确率略高一些,这是因为模型开发过程中常常需要调整超参数,以使模型在验证集上表现良好,但这些调整后的超参数可能无法同样有效地泛化到测试集上。这种情况很常见,不过,可以通过调整模型设置来尽可能缩小这一差距,比如提高模型的丢弃率(drop_rate),或提高优化器配置中的权重衰减参数(weight_decay)。
6.8-Using the LLM as a spam classifier
对模型进行微调和评估后,我们现在可以对垃圾短信进行分类了(下图)。让我们使用基于 GPT 的微调垃圾邮件分类模型。
# (Notes, continued from the text above; the start of the sentence is missing
# in this copy:) "... the steps used in the [dataset] implementation. Then,
# after processing the text into token IDs, the function uses the model to
# predict an integer class label, similar to what we implemented in section
# 6.6, and returns the corresponding class name."
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify ``text`` as "spam" or "not spam" with the fine-tuned model.

    Args:
        text: Message to classify.
        model: Model exposing ``pos_emb.weight`` (its first dimension is used
            as the supported context length) and returning per-token logits.
        tokenizer: Object with an ``encode(text)`` method returning token IDs.
        device: Device the input tensor is created on.
        max_length: Length the token sequence is truncated/padded to. Values
            larger than the supported context length are capped to it.
            ``None`` truncates to the supported context length without padding.
        pad_token_id: Token ID used for padding.

    Returns:
        ``"spam"`` if the predicted label is 1, otherwise ``"not spam"``.
    """
    model.eval()

    # Prepare inputs to the model
    input_ids = tokenizer.encode(text)
    supported_context_length = model.pos_emb.weight.shape[0]
    # Note: In the book, this was originally written as pos_emb.weight.shape[1] by mistake
    # It didn't break the code but would have caused unnecessary truncation (to 768 instead of 1024)

    if max_length is None:
        # No target length given: only enforce the model's context limit.
        # (Previously min(None, ...) raised a TypeError here.)
        target_length = min(len(input_ids), supported_context_length)
    else:
        # Never pad beyond what the positional embedding supports
        target_length = min(max_length, supported_context_length)

    # Truncate sequences if they are too long
    input_ids = input_ids[:target_length]

    # Pad sequences to the target length
    input_ids += [pad_token_id] * (target_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0)  # add batch dimension

    # Model inference
    with torch.no_grad():
        logits = model(input_tensor)[:, -1, :]  # Logits of the last output token
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classified result
    return "spam" if predicted_label == 1 else "not spam"
text_1 = ( "You are a winner you have been specially" " selected to receive $1000 cash or a $2000 award." ) print(classify_review( text_1, model, tokenizer, device, max_length=train_dataset.max_length )) """输出""" spam
text_2 = ( "Hey, just wanted to check if we're still on" " for dinner tonight? Let me know!" ) print(classify_review( text_2, model, tokenizer, device, max_length=train_dataset.max_length )) """输出""" not spam
最后,让我们保存模型,以防以后我们想重用模型而不必再次训练它:
torch.save(model.state_dict(), "review_classifier.pth")
model_state_dict = torch.load("review_classifier.pth", map_location=device, weights_only=True) model.load_state_dict(model_state_dict)
6.9-Summary and takeaways
- There are different strategies for fine-tuning LLMs, including classification fine-tuning and instruction fine-tuning.
- Classification fine-tuning involves replacing the output layer of an LLM with a small classification layer.
- In the case of classifying text messages as “spam” or “not spam,” the new classification layer consists of only two output nodes. Previously, we used the number of output nodes equal to the number of unique tokens in the vocabulary (i.e., 50,257).
- Instead of predicting the next token in the text as in pretraining, classification fine-tuning trains the model to output a correct class label—for example, “spam” or “not spam.”
- The model input for fine-tuning is text converted into token IDs, similar to pretraining.
- Before fine-tuning an LLM, we load the pretrained model as a base model.
- Evaluating a classification model involves calculating the classification accuracy (the fraction or percentage of correct predictions).
- Fine-tuning a classification model uses the same cross entropy loss function as when pretraining the LLM
- ./gpt_class_finetune.py为用于分类微调的脚本
- appendix E 中有lora参数高效训练的介绍
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   -
# Code:

# This is a summary file containing the main takeaways from chapter 6.

import urllib.request
import zipfile
import os
from pathlib import Path
import time

import matplotlib.pyplot as plt
import pandas as pd
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt


def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=False):
    """Download the zipped dataset, extract it, and add a .tsv extension.

    Returns early if ``data_file_path`` already exists. With ``test_mode``
    enabled the download is retried up to five times; if every attempt fails,
    a message is printed and the function returns without extracting anything.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    if test_mode:  # Try multiple times since CI sometimes has connectivity issues
        max_retries = 5
        delay = 5  # delay between retries in seconds
        for attempt in range(max_retries):
            try:
                # Downloading the file
                with urllib.request.urlopen(url, timeout=10) as response:
                    with open(zip_path, "wb") as out_file:
                        out_file.write(response.read())
                break  # if download is successful, break out of the loop
            except urllib.error.URLError as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(delay)  # wait before retrying
                else:
                    print("Failed to download file after several attempts.")
                    return  # exit if all retries fail

    else:  # Code as it appears in the chapter
        # Downloading the file
        with urllib.request.urlopen(url) as response:
            with open(zip_path, "wb") as out_file:
                out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")


def create_balanced_dataset(df):
    """Undersample "ham" rows so both classes have the same number of rows."""
    # Count the instances of "spam"
    num_spam = df[df["Label"] == "spam"].shape[0]

    # Randomly sample "ham" instances to match the number of "spam" instances
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)

    # Combine ham "subset" with "spam"
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])

    return balanced_df


def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and return ``(train_df, validation_df, test_df)``.

    The test split receives the rows left over after the first two splits.
    """
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df


class SpamDataset(Dataset):
    """Dataset of tokenized texts from a CSV file, truncated/padded to one length.

    The CSV file must provide "Text" and "Label" columns.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        # Pre-tokenize texts
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text (0 if there are none)."""
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length


def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Return the fraction of correct last-token predictions over the loader.

    At most ``num_batches`` batches are evaluated (all if ``None``). Returns
    ``nan`` when no example was evaluated.
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)

            with torch.no_grad():
                logits = model(input_batch)[:, -1, :]  # Logits of last output token
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    if num_examples == 0:
        # Nothing evaluated: avoid ZeroDivisionError
        return float("nan")
    return correct_predictions / num_examples


def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of one batch, using last-token logits."""
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)[:, -1, :]  # Logits of last output token
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean per-batch loss over at most ``num_batches`` batches.

    Returns ``nan`` if the loader is empty.
    """
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` estimated over ``eval_iter`` batches.

    The model is put into evaluation mode for the estimate and switched back
    to training mode afterwards.
    """
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
                            num_epochs, eval_freq, eval_iter, tokenizer):
    """Fine-tune ``model`` for classification and track loss and accuracy.

    ``tokenizer`` is accepted but not used by this function.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``.
    """
    # Initialize lists to track losses and examples seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            examples_seen += input_batch.shape[0]  # New: track examples instead of tokens
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Calculate accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen


def plot_values(epochs_seen, examples_seen, train_values, val_values, label="loss"):
    """Plot training/validation curves and save them as ``{label}-plot.pdf``."""
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Plot training and validation values against epochs
    ax1.plot(epochs_seen, train_values, label=f"Training {label}")
    ax1.plot(epochs_seen, val_values, linestyle="-.", label=f"Validation {label}")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel(label.capitalize())
    ax1.legend()

    # Create a second x-axis for examples seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(examples_seen, train_values, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Examples seen")

    fig.tight_layout()  # Adjust layout to make room
    plt.savefig(f"{label}-plot.pdf")


if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser(
        description="Finetune a GPT model for classification"
    )
    parser.add_argument(
        "--test_mode",
        default=False,
        action="store_true",
        help=("This flag runs the model in test mode for internal testing purposes. "
              "Otherwise, it runs the model as it is used in the chapter (recommended).")
    )
    args = parser.parse_args()

    ########################################
    # Download and prepare dataset
    ########################################

    # NOTE(review): `url` and `zip_path` are empty strings in this copy --
    # presumably stripped during export. Fill in the dataset archive URL and a
    # local zip file name before running; confirm against the original script.
    url = ""
    zip_path = ""
    extracted_path = "sms_spam_collection"
    data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode)
    df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
    balanced_df = create_balanced_dataset(df)
    balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
    train_df.to_csv("train.csv", index=None)
    validation_df.to_csv("validation.csv", index=None)
    test_df.to_csv("test.csv", index=None)

    ########################################
    # Create data loaders
    ########################################
    tokenizer = tiktoken.get_encoding("gpt2")

    train_dataset = SpamDataset(
        csv_file="train.csv",
        max_length=None,
        tokenizer=tokenizer
    )

    val_dataset = SpamDataset(
        csv_file="validation.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )

    test_dataset = SpamDataset(
        csv_file="test.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    ########################################
    # Load pretrained model
    ########################################

    # Small GPT model for testing purposes
    if args.test_mode:
        BASE_CONFIG = {
            "vocab_size": 50257,
            "context_length": 120,
            "drop_rate": 0.0,
            "qkv_bias": False,
            "emb_dim": 12,
            "n_layers": 1,
            "n_heads": 2
        }
        model = GPTModel(BASE_CONFIG)
        model.eval()
        device = "cpu"

    # Code as it is used in the main chapter
    else:
        CHOOSE_MODEL = "gpt2-small (124M)"
        INPUT_PROMPT = "Every effort moves"

        BASE_CONFIG = {
            "vocab_size": 50257,     # Vocabulary size
            "context_length": 1024,  # Context length
            "drop_rate": 0.0,        # Dropout rate
            "qkv_bias": True         # Query-key-value bias
        }

        model_configs = {
            "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
            "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
            "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
            "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
        }

        BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

        assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
            f"Dataset length {train_dataset.max_length} exceeds model's context "
            f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
            f"`max_length={BASE_CONFIG['context_length']}`"
        )

        model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
        settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

        model = GPTModel(BASE_CONFIG)
        load_weights_into_gpt(model, params)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ########################################
    # Modify and pretrained model
    ########################################

    # Freeze everything, then replace the output layer with a 2-class head
    for param in model.parameters():
        param.requires_grad = False

    torch.manual_seed(123)

    num_classes = 2
    model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)
    # Batches are moved to `device` in calc_loss_batch / calc_accuracy_loader,
    # so the model (including the new head) must be on the same device.
    model.to(device)

    # Additionally train the last transformer block and the final norm layer
    for param in model.trf_blocks[-1].parameters():
        param.requires_grad = True

    for param in model.final_norm.parameters():
        param.requires_grad = True

    ########################################
    # Finetune modified model
    ########################################

    start_time = time.time()
    torch.manual_seed(123)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

    num_epochs = 5
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=50, eval_iter=5,
        tokenizer=tokenizer
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    ########################################
    # Plot results
    ########################################

    # loss plot
    epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
    examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses))
    plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)

    # accuracy plot
    epochs_tensor = torch.linspace(0, num_epochs, len(train_accs))
    examples_seen_tensor = torch.linspace(0, examples_seen, len(train_accs))
    plot_values(epochs_tensor, examples_seen_tensor, train_accs, val_accs, label="accuracy")