[For the record] Learning notes on vLLM in a Linux environment
Creating a vLLM environment with conda
- OS: currently only Linux is supported
- Python: must be 3.8 – 3.12
- GPU: compute capability 7.0 or higher (e.g. V100, T4, RTX 20xx, A100, L4, H100)
- cards below compute capability 7.0 will not work
- with less than 16 GB of GPU memory, starting a large model with vLLM will fail; a quick way to check your card is shown below this list
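To check a card's compute capability up front, here is a minimal sketch using PyTorch (it assumes a CUDA-enabled torch build and a visible GPU):

import torch

# Query the default CUDA device; returns e.g. (7, 0) for V100, (8, 0) for A100
major, minor = torch.cuda.get_device_capability()
print(f"Compute capability: {major}.{minor}")  # vLLM needs >= 7.0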
Create the environment:
conda create -n vllm python=3.12 -y
Activate it:
conda activate vllm
Install vLLM:
pip install vllm
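A quick sanity check that the install succeeded:

import vllm
print(vllm.__version__)  # prints the installed vLLM version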
Note:
As of this writing, vLLM's binaries are compiled with CUDA 12.1 and the public PyTorch releases by default. Binaries compiled with CUDA 11.8 and the public PyTorch releases are also provided. The example below pins an older release and a cp310 wheel; match PYTHON_VERSION to the interpreter in your environment (312 for the Python 3.12 env created above) and check the project's releases page for which combinations actually exist:
# Install vLLM with CUDA 11.8.
export VLLM_VERSION=0.4.0
export PYTHON_VERSION=310
pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
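After installing, you can confirm which CUDA version the bundled PyTorch was built against (a quick sketch):

import torch
print(torch.version.cuda)         # e.g. "11.8"
print(torch.cuda.is_available())  # True if the GPU is visible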
Starting the server
Pass the model's absolute path:
vllm serve /mnt/workspace/llm/Qwen/Qwen2.5-0.5B-Instruct
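On success, the server exposes an OpenAI-compatible API on port 8000 by default. A quick way to confirm it is up and to see the registered model name (a sketch using the requests library; adjust the port if you started the server with --port):

import requests

# /v1/models lists the model name(s) the server registered at startup
resp = requests.get("http://localhost:8000/v1/models")
print(resp.json())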
If instead startup fails with an error similar to the following:
/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
warnings.warn(
Traceback (most recent call last):
File "/mnt/workspace/Anaconda3/envs/vllm/bin/vllm", line 5, in <module>
from vllm.entrypoints.cli.main import main
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/__init__.py", line 11, in <module>
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/engine/arg_utils.py", line 15, in <module>
from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/config.py", line 27, in <module>
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/model_executor/__init__.py", line 3, in <module>
from vllm.model_executor.parameter import (BasevLLMParameter,
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/model_executor/parameter.py", line 9, in <module>
from vllm.distributed import get_tensor_model_parallel_rank
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/distributed/__init__.py", line 3, in <module>
from .communication_op import *
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/distributed/communication_op.py", line 8, in <module>
from .parallel_state import get_tp_group
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/distributed/parallel_state.py", line 40, in <module>
import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/distributed/kv_transfer/kv_transfer_agent.py", line 16, in <module>
from vllm.distributed.kv_transfer.kv_connector.factory import (
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 6, in <module>
from .base import KVConnectorBase
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/distributed/kv_transfer/kv_connector/base.py", line 15, in <module>
from vllm.sequence import IntermediateTensors
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/sequence.py", line 17, in <module>
from vllm.inputs import SingletonInputs, SingletonInputsAdapter
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/inputs/__init__.py", line 9, in <module>
from .registry import (DummyData, InputContext, InputProcessingContext,
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/inputs/registry.py", line 15, in <module>
from vllm.transformers_utils.processor import cached_processor_from_config
File "/mnt/workspace/Anaconda3/envs/vllm/lib/python3.12/site-packages/vllm/transformers_utils/__init__.py", line 7, in <module>
The traceback above (truncated here) indicates broken dependencies in the environment. To fix it, first update the build tools:
pip install --upgrade setuptools wheel
Then force-reinstall vLLM:
pip install --force-reinstall vllm
Install ROCm (only if you are running on AMD GPUs; the NVIDIA cards listed above do not need this):
sudo apt update && sudo apt install rocm-dkms
Then start the server again, and the model can be called from client code. For example, a multi-turn chat client built on the openai package:
# Multi-turn chat
from openai import OpenAI

# Must match the model name vLLM registered at startup: by default this is
# the path passed to `vllm serve`, or the value of --served-model-name if set.
MODEL = "/mnt/workspace/llm/Qwen/Qwen2.5-0.5B-Instruct"

# Define the multi-turn chat loop
def run_chat_session():
    # Initialize the client (vLLM ignores the API key unless --api-key is set)
    client = OpenAI(base_url="http://localhost:8000/v1/", api_key="123456")
    # Initialize the conversation history
    chat_history = []
    # Start the conversation loop
    while True:
        # Read user input
        user_input = input("User: ")
        if user_input.lower() == "exit":
            print("Ending the conversation.")
            break
        # Update the history with the user turn
        chat_history.append({"role": "user", "content": user_input})
        # Ask the model for a reply
        try:
            chat_completion = client.chat.completions.create(
                messages=chat_history, model=MODEL)
            # Extract the latest reply
            model_response = chat_completion.choices[0]
            print("AI:", model_response.message.content)
            # Update the history with the assistant turn
            chat_history.append(
                {"role": "assistant", "content": model_response.message.content})
        except Exception as e:
            print("Error:", e)
            break

if __name__ == '__main__':
    run_chat_session()
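The same endpoint also supports streaming, so replies can be printed token by token instead of all at once. A minimal variant of the call above (a sketch reusing client, chat_history, and MODEL from the script):

# Streaming variant: print the reply as it is generated
stream = client.chat.completions.create(
    messages=chat_history, model=MODEL, stream=True)
reply = ""
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        reply += delta
        print(delta, end="", flush=True)
print()
chat_history.append({"role": "assistant", "content": reply})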