51 commits
2a0cb78
add test py
panf2333 Jan 6, 2025
5d20f38
add vllm connect cmd
panf2333 Jan 6, 2025
905424e
add identity url headers
panf2333 Jan 6, 2025
bfde168
add /v1/completions stream support
panf2333 Jan 7, 2025
6e1fba8
1. connect_parser set --prefill-addr and --decode-addr are required
panf2333 Jan 8, 2025
b7ffb43
update disagg_connect test_request.py
panf2333 Jan 8, 2025
897db7b
Replace zmq.asyncio.Context().term() with zmq.asyncio.Context().destr…
panf2333 Jan 8, 2025
187f112
1. fix mypy issue
panf2333 Jan 8, 2025
2c31e4c
Run yapf and ruff
panf2333 Jan 8, 2025
7fbf70d
1. replace tpc:// with ipc:// \n 2. fix json response
panf2333 Jan 11, 2025
ee66073
create proxy sockets in the proxy function for thread safety
panf2333 Jan 11, 2025
27c1afe
fix ThreadProxy
panf2333 Jan 12, 2025
8fa9df7
run format.sh
panf2333 Jan 12, 2025
16ed827
add benchmark shell
panf2333 Jan 16, 2025
6c8fae8
run format
panf2333 Jan 16, 2025
298298f
remove invalid zmq benchmark code
panf2333 Jan 20, 2025
d6945ec
change disagg_prefill example to use zmq
panf2333 Jan 20, 2025
0cb2e05
change log level and fix some comments
panf2333 Feb 4, 2025
b9a7dbe
remove default socket address value
panf2333 Feb 4, 2025
4f13e89
fix SIM105
panf2333 Feb 4, 2025
912031c
refactor disagg
panf2333 Mar 8, 2025
d35dace
refactor zmq msg to object
panf2333 Mar 9, 2025
c0b1443
fix mypy
panf2333 Mar 9, 2025
8355358
add unlimited HWM
panf2333 Mar 11, 2025
b89d89f
fix rebase
panf2333 Mar 21, 2025
a8a621e
updated
Mar 22, 2025
9f7fb5e
updated
Mar 22, 2025
2ceb7bc
updated
Mar 22, 2025
120bbdf
updated
Mar 22, 2025
85687b4
updated
Mar 22, 2025
522279e
Stash
Mar 22, 2025
144162f
Merge branch 'main' into rob-fixes
Mar 22, 2025
47a3f26
updated
Mar 22, 2025
2fec6e0
working?
Mar 22, 2025
24cbbe4
updated
Mar 22, 2025
f6f008c
cleanup
Mar 22, 2025
5d57896
updated
Mar 22, 2025
2ba687d
updated
Mar 22, 2025
79e465f
fix pre-commit
Mar 23, 2025
f51f182
pre-commit
robertgshaw2-redhat Mar 23, 2025
cf64b0e
updated
robertgshaw2-redhat Mar 23, 2025
2f29ae3
added files
robertgshaw2-redhat Mar 23, 2025
28d0396
updated
robertgshaw2-redhat Mar 23, 2025
66349c3
updated
robertgshaw2-redhat Mar 23, 2025
d5b0db4
added __init__.py
robertgshaw2-redhat Mar 23, 2025
284d5df
added __init__.py
robertgshaw2-redhat Mar 23, 2025
a10da86
updated
robertgshaw2-redhat Mar 23, 2025
7954461
updated
robertgshaw2-redhat Mar 23, 2025
70e06dd
updated
robertgshaw2-redhat Mar 24, 2025
220d694
updated
robertgshaw2-redhat Mar 24, 2025
2767063
Merge pull request #2 from robertgshaw2-redhat/rob-pd-controller
panf2333 Mar 31, 2025
123 changes: 123 additions & 0 deletions examples/online_serving/disaggregated_prefill_zmq.sh
@@ -0,0 +1,123 @@
#!/bin/bash
# This file demonstrates example usage of disaggregated prefill with ZMQ.
# It launches two vLLM instances (one for prefill and one for decode)
# and transfers the KV cache between them.

set -xe

echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT

# Cleanup function
cleanup() {
echo "Caught Ctrl+C, cleaning up..."
# Cleanup commands
pgrep python | xargs kill -9
pkill -f python
echo "Cleanup complete. Exiting."
exit 0
}

export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

# Wait for the proxy/API server to start accepting requests on the given port
wait_for_server() {
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}


# Wait for a disaggregated worker to report readiness in its log file
wait_for_disagg_server() {
local log_file=$1
timeout 1200 bash -c "
until grep -q 'PDWorker is ready' $log_file; do
sleep 1
done" && return 0 || return 1
}


# You can also adjust --kv-ip and --kv-port for distributed inference.
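# A hedged sketch of how that maps onto --kv-transfer-config below: kv_ip and
# kv_port are KVTransferConfig fields; the address here is only a placeholder
# and is not needed for this single-node demo.
# '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,
#   "kv_parallel_size":2,"kv_ip":"192.168.1.10","kv_port":14579}'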
MODEL=meta-llama/Llama-3.1-8B-Instruct
CONTROLLER_ADDR=controller.ipc
PREFILL_WORKER_ADDR=prefill.ipc
DECODE_WORKER_ADDR=decode.ipc
PORT=8001

# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 python3 ../../vllm/entrypoints/disaggregated/worker.py \
--model $MODEL \
--controller-addr $CONTROLLER_ADDR \
--worker-addr $PREFILL_WORKER_ADDR \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' > vllm_disagg_prefill.log 2>&1 &

# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 python3 ../../vllm/entrypoints/disaggregated/worker.py \
--model $MODEL \
--controller-addr $CONTROLLER_ADDR \
--worker-addr $DECODE_WORKER_ADDR \
--max-model-len 100 \
--gpu-memory-utilization 0.8 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' > vllm_disagg_decode.log 2>&1 &

# launch a proxy server that serves requests on port $PORT (8001 here)
# the workflow of this proxy:
# - Send req to prefill instance, wait until complete.
# - Send req to decode instance, streaming tokens.
# (A Python streaming client sketch follows after this script.)
python3 ../../vllm/entrypoints/disaggregated/api_server.py \
--port $PORT \
--model $MODEL \
--controller-addr $CONTROLLER_ADDR \
--prefill-addr $PREFILL_WORKER_ADDR \
--decode-addr $DECODE_WORKER_ADDR &

# wait until prefill, decode instances and proxy are ready
wait_for_server $PORT
wait_for_disagg_server vllm_disagg_prefill.log
wait_for_disagg_server vllm_disagg_decode.log

# send two example requests through the proxy
output1=$(curl -X POST -s http://localhost:8001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-3.1-8B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}')

output2=$(curl -X POST -s http://localhost:8001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-3.1-8B-Instruct",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
}')


# Cleanup commands
pgrep python | xargs kill -9
pkill -f python

echo ""

sleep 1

# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"

echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""
Empty file added vllm/disaggregated/__init__.py
Empty file.