"""Multi-GPU inference: shard a causal LM across several GPUs and generate text.

The checkpoint under ``MODEL_PATH`` is loaded once, with its layers placed
across the GPUs listed in ``MAX_MEMORY``, and then wrapped in a
``text-generation`` pipeline.
"""
from transformers import pipeline
from accelerate import init_empty_weights, infer_auto_device_map
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator

# Directory holding the model weights and the tokenizer files.
MODEL_PATH = "/app/model/"

# Per-GPU memory budget used when placing layers: GPUs 0-7, 16GB each.
MAX_MEMORY = {gpu_index: "16GB" for gpu_index in range(8)}

# Initialize the accelerator.
# NOTE(review): nothing below uses this object; it is kept only so the
# module-level name stays available — confirm whether it is still needed.
accelerator = Accelerator()

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the model a single time, letting transformers/accelerate compute the
# layer placement from MAX_MEMORY. This replaces the previous sequence of
# "load fully on CPU -> infer_auto_device_map -> load again", which read the
# checkpoint twice and computed the map without the model's no-split rules.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    max_memory=MAX_MEMORY,
)

# Expose the resolved placement under the original name for later use.
device_map = model.hf_device_map

# Build the pipeline. The model is already dispatched across the GPUs, so no
# device / device_map argument is passed here.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Run inference.
output = generator("Your input text", max_length=30)
print(output)