[AIGC] AIGC Programming Notes
I. Model-related
II. Program-related
1. GPU usage
Method 1: set the environment variable in Python:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"  # must run before CUDA initializes
Method 2:
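The original note leaves Method 2 blank. Two common alternatives (an assumption here, not from the source): set the variable in the shell before the interpreter starts, or pin the default device in PyTorch:

# Assumption: reconstructed option, since the note left Method 2 unspecified.
# From the shell, so the variable exists before Python/CUDA start:
#   CUDA_VISIBLE_DEVICES=3,4,5 python app.py
# Or pin the default device in code (the index counts within visible devices):
import torch
torch.cuda.set_device(0)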
2. How to load the model onto GPU
-- Load directly onto GPU:
import torch
from transformers import AutoModelForCausalLM

# device_map="auto" lets the accelerate-backed loader shard the weights
# across the visible GPUs at load time
model = AutoModelForCausalLM.from_pretrained(
    "baichuan-inc/Baichuan-13B-Chat",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
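To confirm where the shards landed, `hf_device_map` is populated by the loader whenever `device_map` is used (a quick check, not from the original note):

print(model.hf_device_map)  # module-name -> device mapping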
-- Load on CPU first, then move to GPU:
# Loads the fp16 weights on CPU (no device_map), then moves the whole model over
model = AutoModelForCausalLM.from_pretrained(
    "baichuan-inc/Baichuan-13B-Chat",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = model.cuda()
-- Quantization:
# quantize(4) is int4 quantization implemented by the model's remote code
# (not a standard transformers API); quantize on CPU, then move to GPU
model = model.quantize(4).cuda()
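Per the Baichuan-13B README, the same custom method also supports 8-bit, trading memory for quality:

model = model.quantize(8).cuda()  # int8: more memory than int4, usually better accuracy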
III. Service-related
1. Chat-style WebSocket service
from fastapi import FastAPI, WebSocket
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import uvicorn
import json
import torch
import os
import asyncio

""" Baichuan-13B example """

# Configure which GPUs are visible (must run before CUDA initializes)
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"

app = FastAPI(debug=False)

@app.websocket("/chat")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    messages = []
    while True:
        prompt = await websocket.receive_text()
        messages.append({"role": "user", "content": prompt})
        # stream=True yields the partial response text as it grows
        for response in model.chat(tokenizer, messages, stream=True):
            result = {
                "res": {
                    "text": response,
                    "end": False
                }
            }
            await websocket.send_text(json.dumps(result, ensure_ascii=False))
            # Yield control so the frame is actually flushed to the client
            await asyncio.sleep(0.001)
        # Final frame repeats the full response with the end flag set
        result = {
            "res": {
                "text": response,
                "end": True
            }
        }
        await websocket.send_text(json.dumps(result, ensure_ascii=False))
        await asyncio.sleep(0.01)
        messages.append({"role": "assistant", "content": response})

if __name__ == "__main__":
    # model and tokenizer become module-level globals before the server starts
    model = AutoModelForCausalLM.from_pretrained(
        "baichuan-inc/Baichuan-13B-Chat",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    model.generation_config = GenerationConfig.from_pretrained(
        "baichuan-inc/Baichuan-13B-Chat"
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "baichuan-inc/Baichuan-13B-Chat",
        use_fast=False,
        trust_remote_code=True,
    )
    model.eval()
    uvicorn.run(app, host="0.0.0.0", port=8000, ws_ping_timeout=5)
Note
If the websocket is slow to return results, add a short sleep after each send so the event loop can flush the frame, as follows:
await websocket.send_text(result)
await asyncio.sleep(0.001)
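For a quick end-to-end test, a minimal client sketch (assumes the third-party websockets package, pip install websockets; host and port as configured above):

import asyncio
import json
import websockets

async def main():
    # Connect to the /chat endpoint served by uvicorn above
    async with websockets.connect("ws://localhost:8000/chat") as ws:
        await ws.send("Hello!")
        while True:
            msg = json.loads(await ws.recv())
            print(msg["res"]["text"])  # cumulative partial response
            if msg["res"]["end"]:
                break

asyncio.run(main())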