Fine-tuning the Florence-2 model with LoRA
1 Environment
Kaggle notebook, single GPU
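A quick optional check that the notebook session actually has a GPU attached:
import torch
print(torch.cuda.is_available())        # should print True in a Kaggle GPU session
print(torch.cuda.get_device_name(0))    # exact GPU depends on the session (e.g. T4 or P100)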
2 Data
The image paths and their labels are stored in a JSON index file.
The input directory is organized as follows:
logo and PNG-SVG are folders of images, re.json is the index file, and florence2-weight holds the pretrained weights.
The JSON index has the following structure:
image is the path to an image and label is its caption.
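The file is a JSON array of such entries; a minimal made-up example (the paths and captions below are placeholders, not the real data):
[
  {"image": "/kaggle/input/PNG-SVG/0001.png", "label": "a blue shield-shaped logo with a white letter S"},
  {"image": "/kaggle/input/logo/0002.png", "label": "a red circular icon containing a camera symbol"}
]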
3 Fine-tuning code
3.1 Install the required packages
!pip install peft einops flash_attn
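If the install succeeded, the packages should import cleanly (optional check; exact versions depend on the Kaggle image):
import peft, transformers, flash_attn
print("peft", peft.__version__, "| transformers", transformers.__version__, "| flash_attn", flash_attn.__version__)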
3.2 Fine-tuning code
The LoRA strategy is used, so only a small set of adapter weights is trained while the base model stays frozen.
import os
import json
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms
from PIL import Image
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoProcessor, AutoModelForCausalLM
from tqdm import tqdm
import time
from torch.cuda.amp import autocast, GradScaler
import warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence the HuggingFace tokenizers fork warning
warnings.filterwarnings("ignore", category=UserWarning, module="PIL.Image")  # ignore PIL transparency-related warnings

# Dataset class
class IconLogoDataset(Dataset):
    def __init__(self, json_path, transform=None):
        with open(json_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        while True:  # loop until a valid sample can be returned
            try:
                item = self.data[idx]
                image_path = item['image']
                label = item['label']
                image = Image.open(image_path).convert('RGB')  # try to open the image
                if self.transform:
                    image = self.transform(image)
                return image, label  # valid sample
            except Exception as e:
                print(f"Failed to read image {self.data[idx]['image']}: {e}")
                idx = (idx + 1) % len(self.data)  # skip this sample and try the next one


def prepare_dataloader(json_path, batch_size, num_workers=0):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),  # converts to a tensor and rescales pixel values to [0, 1]
    ])
    dataset = IconLogoDataset(json_path, transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    print('Data preprocessing finished')
    return dataloader


# Configure LoRA
def configure_lora(model_name):
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"],
    )
    # Load the processor and the pretrained weights
    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained('/kaggle/input/florence2-weight/icon_caption_florence', torch_dtype=torch.float16, trust_remote_code=True)
    # Wrap the model with the LoRA adapters
    model = get_peft_model(model, lora_config)
    print('Model loaded')
    return model, processor


# Training function
def train(model, dataloader, processor, optimizer, device, epochs=5):
    print('Training started')
    start_time = time.time()  # record the training start time
    scaler = GradScaler()  # initialize the mixed-precision gradient scaler
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        # show a progress bar
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}")
        for images, labels in progress_bar:
            images = images.to(device, dtype=torch.float16)  # cast to float16
            # process the images and the text together
            inputs = processor(
                text=labels,
                images=images,
                return_tensors="pt",
                padding=True,
                do_rescale=False  # pixels were already scaled to [0, 1] by ToTensor, so skip rescaling
            ).to(device)
            optimizer.zero_grad()
            # automatic mixed-precision context
            with autocast(dtype=torch.float16):
                outputs = model(**inputs, labels=inputs["input_ids"])  # forward pass
                loss = outputs.loss
            # scale the loss with GradScaler before backprop
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            epoch_loss += loss.item()
            # show the running loss on the progress bar (total so far divided by batches per epoch)
            progress_bar.set_postfix(loss=epoch_loss / len(dataloader))
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")
    end_time = time.time()  # record the training end time
    print(f"Total training time: {end_time - start_time:.2f} s")


# Main program
def main():
    # select the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # data loading
    re_dataloader = prepare_dataloader(
        json_path="/kaggle/input/index-of-data/re.json",
        batch_size=16
    )
    # configure the LoRA model
    model_name = "microsoft/Florence-2-base"  # replace with your own model name
    model, processor = configure_lora(model_name)
    model.to(device)
    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    # start training
    train(model, re_dataloader, processor, optimizer, device, epochs=5)
    print('Training finished')
    # save the LoRA adapter and the processor
    model.save_pretrained("./lora_florence2")
    processor.save_pretrained("./lora_florence2")


if __name__ == "__main__":
    main()
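As an optional check right after get_peft_model, peft's built-in helper reports how small the trainable fraction actually is with r=8 on q_proj and v_proj (here `model` is the PEFT-wrapped model from configure_lora):
model.print_trainable_parameters()  # prints something like: trainable params: ... || all params: ... || trainable%: ...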
After training finishes, compress the output folder into a single file for download.
import shutil
# zip the output folder
shutil.make_archive("lora_florence2", 'zip', "./lora_florence2")
print("Compression finished: lora_florence2.zip was created")