vLLM PD Disaggregated Deployment

PD (prefill/decode) disaggregation runs the compute-heavy prefill phase and the memory-bound decode phase on separate vLLM instances, transferring the KV cache from the prefill (producer) instance to the decode (consumer) instance.

First, start a vLLM container on two dedicated GPUs (GPUs 2 and 3 here) with Docker. With --network=host the container shares the host network stack directly, so no -p port mapping is needed:

docker run -td --runtime nvidia \
  --gpus '"device=2,3"' \
  --name vllm-pd \
  -v /root/model:/root/model \
  --ipc=host \
  --network=host \
  --entrypoint bash \
  vetought-cn-beijing.cr.volces.com/maas/vllm-openai:latest
The equivalent command under containerd with nerdctl (here all GPUs are exposed; restrict the set as needed):

nerdctl run -td \
  --gpus all \
  --name vllm-pd \
  -v /root/model:/root/model \
  --ipc=host \
  --network=host \
  --entrypoint bash \
  vetought-cn-beijing.cr.volces.com/maas/vllm-openai:latest
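
Both commands start the container with bash as the entrypoint instead of a server process, so the next step is to attach a shell and work inside the container (vllm-pd is the container name chosen above):

docker exec -it vllm-pd bash
# or, under containerd:
nerdctl exec -it vllm-pd bash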
# Minimal reproduction code (2-GPU setup).
# The producer and the consumer must run in separate processes, each pinned
# to its own GPU (e.g. via CUDA_VISIBLE_DEVICES); creating both LLM objects
# in a single process does not work, since PyNcclConnector expects two
# cooperating ranks.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# Producer (prefill) instance: KV rank 0 of a 2-rank transfer group
ktc_producer = KVTransferConfig.from_cli(
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
)
llm_producer = LLM(model="/root/model/Mistral-7B-Instruct-v0.3", kv_transfer_config=ktc_producer)

# Consumer (decode) instance: KV rank 1 of the same group
ktc_consumer = KVTransferConfig.from_cli(
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
)
llm_consumer = LLM(model="/root/model/Mistral-7B-Instruct-v0.3", kv_transfer_config=ktc_consumer)
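
To actually exercise the KV transfer, the two instances can be launched as separate processes, following the pattern of vLLM's offline disaggregated-prefill example: the producer runs prefill only (max_tokens=1), which hands its KV cache off to the consumer, and the consumer then decodes the full completion. A minimal sketch; the prompt and sampling settings below are illustrative assumptions:

import os
from multiprocessing import Process

def run_producer():
    # Pin the prefill instance to its own GPU before vLLM touches CUDA,
    # hence the env var is set before the vllm imports below.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    from vllm import LLM, SamplingParams
    from vllm.config import KVTransferConfig
    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
    )
    llm = LLM(model="/root/model/Mistral-7B-Instruct-v0.3", kv_transfer_config=ktc)
    # max_tokens=1 limits this side to prefill; the KV cache is handed off.
    llm.generate(["San Francisco is a"], SamplingParams(temperature=0, max_tokens=1))

def run_consumer():
    # The decode instance gets the second GPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from vllm import LLM, SamplingParams
    from vllm.config import KVTransferConfig
    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
    )
    llm = LLM(model="/root/model/Mistral-7B-Instruct-v0.3", kv_transfer_config=ktc)
    # Same prompt: prefill is served from the producer's transferred KV cache.
    outputs = llm.generate(["San Francisco is a"], SamplingParams(temperature=0, max_tokens=64))
    for o in outputs:
        print(o.outputs[0].text)

if __name__ == "__main__":
    producer = Process(target=run_producer)
    consumer = Process(target=run_consumer)
    producer.start()
    consumer.start()
    producer.join()
    consumer.join()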