使用 Qwen2.5 作为基座模型,训练文本多分类模型
训练代码如下:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
# 1. Load the Qwen2.5-0.5B pretrained checkpoint and its tokenizer.
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Text label -> class id. Edit this mapping to match the labels in your dataset.
label_mapping = {"positive": 0, "negative": 1, "neutral": 2}
# Derive the head size from the mapping so the number of output logits always
# equals the number of labels the preprocessing step can produce.
num_labels = len(label_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# 2. Load and preprocess the data
def preprocess_function(examples):
    """Tokenize the ``text`` column and convert ``label`` strings to class ids.

    Supports both batched mapping (``Dataset.map(..., batched=True)``), where
    each column value is a list, and unbatched mapping, where
    ``examples["label"]`` is a single string.

    Args:
        examples: Mapping with a ``"text"`` entry (one string or a list of
            strings) and a ``"label"`` entry (one label string or a list of
            label strings).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an added
        ``"label"`` entry holding the numeric class id(s).

    Raises:
        KeyError: If a label is not a key of ``label_mapping``.
    """
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    labels = examples["label"]
    if isinstance(labels, str):
        # Unbatched call: a single example with a single label.
        tokenized["label"] = label_mapping[labels]
    else:
        # Batched call: a list is unhashable, so it cannot be used as a dict
        # key directly; translate every label in the batch individually.
        tokenized["label"] = [label_mapping[label] for label in labels]
    return tokenized
# Example dataset files (replace with the paths to your own data).
# Each CSV is loaded separately and read back from the "train" key of the
# result, then stored under the split name used by the rest of the script.
raw_datasets = DatasetDict(
    {
        split: load_dataset("csv", data_files=path)["train"]
        for split, path in (("train", "train.csv"), ("test", "test.csv"))
    }
)
# Tokenize in batches, then drop the raw text column once it has been encoded.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
).remove_columns(["text"])
# Have the datasets return torch tensors.
tokenized_datasets.set_format("torch")
# 3. Configure training
import inspect

# The per-epoch evaluation option is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# keyword the installed TrainingArguments accepts so the script runs on both.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluation and checkpointing both happen once per epoch; they must use
    # the same schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # After training, reload the checkpoint with the highest eval accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    **{_eval_strategy_arg: "epoch"},
)
# 4. 定义评价指标
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as handed over by the Trainer.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = torch.tensor(raw_scores).argmax(dim=-1)
    metrics = {"accuracy": accuracy_score(gold, predicted)}
    # The fourth element returned by sklearn (support) is not reported.
    weighted = precision_recall_fscore_support(gold, predicted, average="weighted")
    metrics["precision"] = weighted[0]
    metrics["recall"] = weighted[1]
    metrics["f1"] = weighted[2]
    return metrics
# 5. Build the Trainer
# Copy the tokenizer's padding token id into the model config before training.
model.config.pad_token_id = tokenizer.pad_token_id
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)
# 6. Run training
trainer.train()
# 7. Evaluate the model and print the resulting metrics
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
# 8. Save the model and then the tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./qwen2.5-multiclass-model")
注意:`model.config.pad_token_id = tokenizer.pad_token_id` 这一行必须加上。序列分类头需要通过 pad_token_id 找到每条序列最后一个非填充 token;如果不设置,当 batch_size > 1 时会报错。
训练数据的格式要求:
CSV 文件需包含 text 和 label 两列,且 label 的取值必须是代码中 label_mapping 的键(positive / negative / neutral),例如:
text,label
"这是一条正面评价","positive"
"产品质量很差","negative"
"服务态度一般","neutral"