Commit a0b5061

Merge remote-tracking branch 'upstream/main' into yihua-cpu-offloading2
2 parents: 789b00e + 0920ab9

241 files changed: 9,783 additions and 3,464 deletions


.buildkite/release-pipeline.yaml

Lines changed: 16 additions & 0 deletions
@@ -39,3 +39,19 @@ steps:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+  - label: "Build and publish TPU release image"
+    depends_on: ~
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: tpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "docker push vllm/vllm-tpu:nightly"
+      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllm
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
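The new step publishes vllm/vllm-tpu:nightly plus a per-commit tag whenever the NIGHTLY env var is set. As a rough sketch of how the published image could be consumed, the run flags and the served model below are illustrative assumptions, not something this pipeline defines:

docker pull vllm/vllm-tpu:nightly
# Hypothetical smoke test on a TPU host; --privileged/--net host and the model choice are assumptions.
docker run --privileged --net host vllm/vllm-tpu:nightly \
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m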

.buildkite/test-pipeline.yaml

Lines changed: 30 additions & 17 deletions
@@ -181,14 +181,14 @@ steps:
   commands:
   - VLLM_USE_V1=1 pytest -v -s v1
 
-- label: Examples Test # 15min
+- label: Examples Test # 25min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/entrypoints
   - examples/
   commands:
-  - pip install awscli tensorizer # for llava example and tensorizer test
+  - pip install tensorizer # for tensorizer test
   - python3 offline_inference.py
   - python3 cpu_offload.py
   - python3 offline_inference_chat.py

@@ -198,10 +198,13 @@ steps:
   - python3 offline_inference_vision_language_multi_image.py
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py
+  - python3 offline_inference_classification.py
+  - python3 offline_inference_embedding.py
+  - python3 offline_inference_scoring.py
   - python3 offline_profile.py --model facebook/opt-125m
 
 - label: Prefix Caching Test # 9min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching

@@ -321,7 +324,7 @@ steps:
 
 ##### models test #####
 
-- label: Basic Models Test # 30min
+- label: Basic Models Test # 24min
   source_file_dependencies:
   - vllm/
   - tests/models

@@ -331,7 +334,7 @@ steps:
   - pytest -v -s models/test_registry.py
   - pytest -v -s models/test_initialization.py
 
-- label: Language Models Test (Standard) # 42min
+- label: Language Models Test (Standard) # 32min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/

@@ -342,7 +345,7 @@ steps:
   - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/language -m core_model
 
-- label: Language Models Test (Extended) # 50min
+- label: Language Models Test (Extended) # 1h10min
   optional: true
   source_file_dependencies:
   - vllm/

@@ -353,7 +356,7 @@ steps:
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 26min
+- label: Multi-Modal Models Test (Standard) # 28min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/

@@ -369,7 +372,7 @@ steps:
   - pytest -v -s models/encoder_decoder/language -m core_model
   - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-- label: Multi-Modal Models Test (Extended) # 1h15m
+- label: Multi-Modal Models Test (Extended) 1 # 1h16m
   optional: true
   source_file_dependencies:
   - vllm/

@@ -380,14 +383,24 @@ steps:
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
   # HACK - run phi3v tests separately to sidestep this transformers bug
   # https://github.com/huggingface/transformers/issues/34307
   - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/vision_language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
 
+- label: Multi-Modal Models Test (Extended) 2 # 38m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/vision_language
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
   optional: true

@@ -422,11 +435,11 @@ steps:
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
 
 - label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]

@@ -445,12 +458,12 @@ steps:
   commands:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py

@@ -540,7 +553,7 @@ steps:
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
   - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py
 
 - label: LM Eval Large Models # optional

.github/workflows/ (new workflow: Lint and Deploy Charts)

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+name: Lint and Deploy Charts
+
+on: pull_request
+
+jobs:
+  lint-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+        with:
+          version: v3.14.4
+
+      #Python is required because ct lint runs Yamale and yamllint which require Python.
+      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: '3.13'
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+        with:
+          version: v3.10.1
+
+      - name: Run chart-testing (lint)
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+
+      - name: Setup minio
+        run: |
+          docker network create vllm-net
+          docker run -d -p 9000:9000 --name minio --net vllm-net \
+            -e "MINIO_ACCESS_KEY=minioadmin" \
+            -e "MINIO_SECRET_KEY=minioadmin" \
+            -v /tmp/data:/data \
+            -v /tmp/config:/root/.minio \
+            minio/minio server /data
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          export AWS_EC2_METADATA_DISABLED=true
+          mkdir opt-125m
+          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
+          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
+          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
+
+      - name: Create kind cluster
+        uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+
+      - name: Build the Docker image vllm cpu
+        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+
+      - name: Configuration of docker images, network and namespace for the kind cluster
+        run: |
+          docker pull amazon/aws-cli:2.6.4
+          kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
+          kind load docker-image vllm-cpu-env:latest --name chart-testing
+          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
+          kubectl create ns ns-vllm
+
+      - name: Run chart-testing (install)
+        run: |
+          export AWS_ACCESS_KEY_ID=minioadmin
+          export AWS_SECRET_ACCESS_KEY=minioadmin
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+
+      - name: curl test
+        run: |
+          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
+          sleep 10
+          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
+            --header "Content-Type: application/json" \
+            --data '{
+              "model": "opt-125m",
+              "prompt": "San Francisco is a",
+              "max_tokens": 7,
+              "temperature": 0
+            }'):$CODE"
+          echo "$CODE"
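If the install or curl step fails, stock kubectl commands can be used to inspect the kind cluster. This is a hedged debugging sketch rather than part of the workflow; only the ns-vllm namespace and the test-vllm-service name come from the steps above, and the label selector in the last command is an assumption:

kubectl -n ns-vllm get pods
kubectl -n ns-vllm describe service test-vllm-service
kubectl -n ns-vllm logs -l app.kubernetes.io/instance=test-vllm    # label selector is an assumption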

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -196,6 +196,7 @@ set(VLLM_EXT_SRC
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/prepare_inputs/advance_step.cu"

@@ -300,7 +301,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
-  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
     "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})

Dockerfile.neuron

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
+# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
 
 FROM $BASE_IMAGE
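For context, the image still builds the usual way against the bumped SDK 2.20.2 base; a minimal sketch, where the output tag is an arbitrary choice:

DOCKER_BUILDKIT=1 docker build -f Dockerfile.neuron -t vllm-neuron:latest .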

README.md

Lines changed: 5 additions & 0 deletions
@@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone
 ---
 
 *Latest News* 🔥
+- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!

@@ -133,3 +134,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 * For coordinating contributions and development, please use Slack.
 * For security disclosures, please use Github's security advisory feature.
 * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+
+## Media Kit
+
+* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).

benchmarks/benchmark_serving.py

Lines changed: 12 additions & 0 deletions
@@ -781,6 +781,7 @@ def main(args: argparse.Namespace):
     backend = args.backend
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+    tokenizer_mode = args.tokenizer_mode
 
     if args.base_url is not None:
         api_url = f"{args.base_url}{args.endpoint}"

@@ -790,6 +791,7 @@ def main(args: argparse.Namespace):
         base_url = f"http://{args.host}:{args.port}"
 
     tokenizer = get_tokenizer(tokenizer_id,
+                              tokenizer_mode=tokenizer_mode,
                               trust_remote_code=args.trust_remote_code)
 
     if args.dataset is not None:

@@ -1210,5 +1212,15 @@
         "from the sampled HF dataset.",
     )
 
+    parser.add_argument(
+        '--tokenizer-mode',
+        type=str,
+        default="auto",
+        choices=['auto', 'slow', 'mistral'],
+        help='The tokenizer mode.\n\n* "auto" will use the '
+        'fast tokenizer if available.\n* "slow" will '
+        'always use the slow tokenizer. \n* '
+        '"mistral" will always use the `mistral_common` tokenizer.')
+
     args = parser.parse_args()
     main(args)
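With the new flag, the serving benchmark can load a mistral_common tokenizer instead of the default fast Hugging Face tokenizer. A minimal invocation sketch; the server URL and model name are illustrative assumptions, and the dataset-related flags the script also requires are omitted here:

python3 benchmarks/benchmark_serving.py \
    --base-url http://localhost:8000 \
    --model mistralai/Mistral-7B-Instruct-v0.3 \
    --tokenizer-mode mistral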
