From 1a7d82642b65a563c1f47e5097e5c0fb764e0c21 Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Wed, 7 Feb 2024 14:33:50 +0000 Subject: [PATCH 01/54] Upgrade to `torch==2.2.0` - `torch==2.1.1` -> `torch==2.2.0` - `xformers==0.0.23.post1` -> `xformers==0.0.24` - ROCM not updated because no `torch==2.2.0` containers have been published yet --- .github/workflows/publish.yml | 2 +- Dockerfile.rocm | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements-dev.txt | 1 + requirements.txt | 4 ++-- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5211dc180798..d88f646636ba 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. cuda-version: ['11.8', '12.1'] steps: diff --git a/Dockerfile.rocm b/Dockerfile.rocm index f49b321372ed..1ae9e9527820 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -65,7 +65,7 @@ RUN mkdir libs \ COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps +RUN python3 -m pip install xformers==0.0.24 --no-deps # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue diff --git a/pyproject.toml b/pyproject.toml index b197256f6ff5..e430f703154f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.1.2", + "torch == 2.2.0", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index 7e7e48a1313e..fdc6dcd503fb 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.1.2 +torch==2.2.0 wheel \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index f8126008d079..b764313487e7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,6 +15,7 @@ pytest-forked pytest-asyncio httpx einops # required for MPT +wheel # required for flash_attn flash_attn # required for HuggingFace's llama implementation openai requests diff --git a/requirements.txt b/requirements.txt index 5684b2c29634..dc5c939f2d36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.1.2 +torch == 2.2.0 transformers >= 4.37.0 # Required for Qwen2 -xformers == 0.0.23.post1 # Required for CUDA 12.1. +xformers == 0.0.24 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. 
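Patch 01 moves the `torch` and `xformers` pins together across the CI matrix, `pyproject.toml`, and both requirements files; the two pins must stay in lockstep because each xformers wheel is built against one specific PyTorch release. A minimal sketch (not part of the patches) for sanity-checking an installed environment against the new pins, assuming both packages expose `__version__`:

```python
# Illustrative check only: confirm the environment matches the pins from this
# patch (torch 2.2.0, xformers 0.0.24) before building or testing vLLM.
import torch
import xformers

assert torch.__version__.startswith("2.2.0"), torch.__version__
assert xformers.__version__.startswith("0.0.24"), xformers.__version__
print("torch", torch.__version__, "| xformers", xformers.__version__)
```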
From 7de363fdcae21891392a113ac16c797f66d862d8 Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Wed, 7 Feb 2024 15:28:19 +0000 Subject: [PATCH 02/54] Remove `wheel` from `requirements-dev.txt` --- requirements-dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index b764313487e7..f8126008d079 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,7 +15,6 @@ pytest-forked pytest-asyncio httpx einops # required for MPT -wheel # required for flash_attn flash_attn # required for HuggingFace's llama implementation openai requests From 9bc921d2cc6484297b262487ea42e1524a74e9ba Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Mon, 12 Feb 2024 12:49:47 +0000 Subject: [PATCH 03/54] Revert change to `Dockerfile.rocm` --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 1ae9e9527820..f49b321372ed 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -65,7 +65,7 @@ RUN mkdir libs \ COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.24 --no-deps +RUN python3 -m pip install xformers==0.0.23 --no-deps # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue From 76ab3e7274b4641a503fcfefdb04281b26736531 Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Thu, 15 Feb 2024 10:59:45 +0000 Subject: [PATCH 04/54] Kick CI From 922aa0c8f21c03b26fd058ac89f3d051e444da73 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 22 Feb 2024 08:46:52 +0000 Subject: [PATCH 05/54] Update requirements.txt Co-authored-by: Woosuk Kwon --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 264a63e85ec7..594d1d6fbb1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ sentencepiece # Required for LLaMA tokenizer. numpy torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for CUDA 12.1. +xformers == 0.0.24 # Required for PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From daca4e15560b743901d97a1dfffeb58134338a9f Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Mon, 4 Mar 2024 14:01:17 +0000 Subject: [PATCH 06/54] Update to 2.2.1 --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d88f646636ba..2db687a287ef 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. 
cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index f2d39237af0f..508c58f42898 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.0", + "torch == 2.2.1", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index fdc6dcd503fb..18cfb5894d18 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.0 +torch==2.2.1 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 60d58d1c3a19..03f63f2fe97b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.0 +torch == 2.2.1 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for PyTorch 2.2.0. +xformers == 0.0.24 # Required for PyTorch 2.2.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From 015b7d4dfde87d6a1cbf5652d0a90a3d5eb6826e Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Mon, 4 Mar 2024 15:21:04 +0000 Subject: [PATCH 07/54] Revert "Update to 2.2.1" This reverts commit daca4e15560b743901d97a1dfffeb58134338a9f. --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 2db687a287ef..d88f646636ba 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index 508c58f42898..f2d39237af0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.1", + "torch == 2.2.0", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index 18cfb5894d18..fdc6dcd503fb 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.1 +torch==2.2.0 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 03f63f2fe97b..60d58d1c3a19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.1 +torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for PyTorch 2.2.1. +xformers == 0.0.24 # Required for PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From 75f05de27b0a25df106b9850cc78075de1df4e42 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 15 Mar 2024 20:57:28 +0000 Subject: [PATCH 08/54] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6a30bdbbfc41..16f369365079 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ sentencepiece # Required for LLaMA tokenizer. 
numpy torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for PyTorch 2.2.0. +xformers == 0.0.24 # Requires PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From e82cf3a5a1147ecba9b4aeeedec7a58039747fa1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 15 Mar 2024 18:38:05 -0700 Subject: [PATCH 09/54] try to test one distributed at a time --- .buildkite/test-pipeline.yaml | 9 +++++++-- .../test_basic_distributed_correctness.py | 16 +++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8badc16d0cb7..8810dda8d826 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -22,8 +22,13 @@ steps: working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. -- label: Distributed Correctness Test - command: pytest -v -s --forked test_basic_distributed_correctness.py +- label: Distributed Correctness Test-facebook/opt-125m + command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py + working_dir: "/vllm-workspace/tests/distributed" + num_gpus: 2 # only support 1 or 2 for now. + +- label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf + command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 82075356fccb..75d6a84adfc7 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -1,13 +1,23 @@ """Compare the outputs of HF and distributed vLLM when using greedy sampling. -Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`. +vLLM will allocate all the available memory, so we need to run the tests one +by one. The solution is to pass arguments (model name) by environment +variables. +Run: + +```sh +TEST_DIST_MODEL=facebook/opt-125m pytest \ + test_basic_distributed_correctness.py +TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ + test_basic_distributed_correctness.py +``` """ +import os import pytest import torch MODELS = [ - "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + os.environ["TEST_DIST_MODEL"], ] From 4accd02f6f14fd8a09c387f0ff0a1f7c3c4df9e1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Mar 2024 00:54:11 -0700 Subject: [PATCH 10/54] try pytorch 2.2.1 --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d88f646636ba..2db687a287ef 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. 
cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index c0645fd09e7f..c812af904b89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.0", + "torch == 2.2.1", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index fdc6dcd503fb..18cfb5894d18 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.0 +torch==2.2.1 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 16f369365079..12062276d12e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.0 +torch == 2.2.1 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Requires PyTorch 2.2.0. +xformers == 0.0.25 # Requires PyTorch 2.2.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From a92346f0e7897e37f994822d1516e1c71a76a347 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Mar 2024 23:38:14 -0700 Subject: [PATCH 11/54] try to fix test --- .buildkite/test-pipeline.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8810dda8d826..e313226e08dd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,7 +28,11 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf - command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py + command: + - export TORCH_FILE=$(python -c "import torch; print(torch.__file__)") + - export NCCL_DIR=$(echo $TORCH_FILE | sed 's|torch/__init__.py|nvidia/nccl|') + - rm -rf $NCCL_DIR || true # remove the directory if it exists + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. From e7f215b08361a52bc7d960c5c65e7ac627ebd087 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Mar 2024 23:57:44 -0700 Subject: [PATCH 12/54] use pip install to resolve the problem --- .buildkite/test-pipeline.yaml | 6 +----- requirements.txt | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e313226e08dd..8810dda8d826 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,11 +28,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf - command: - - export TORCH_FILE=$(python -c "import torch; print(torch.__file__)") - - export NCCL_DIR=$(echo $TORCH_FILE | sed 's|torch/__init__.py|nvidia/nccl|') - - rm -rf $NCCL_DIR || true # remove the directory if it exists - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py + command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. 
diff --git a/requirements.txt b/requirements.txt index 12062276d12e..bb51db100e5c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. +nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file From f99fe2ac75d4b4c0a47966cb17566f5ca766b925 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 01:29:03 -0700 Subject: [PATCH 13/54] remove nccl version to test --- .buildkite/test-pipeline.yaml | 4 +++- requirements.txt | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8810dda8d826..eb4c702e9108 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -23,7 +23,9 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-facebook/opt-125m - command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py + command: + - pip install nvidia-nccl-cu12==2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph + - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. diff --git a/requirements.txt b/requirements.txt index bb51db100e5c..12062276d12e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,3 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. -nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file From 0f3181f434c262ee7934f3171bc8257b6cc47c00 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 07:16:30 -0700 Subject: [PATCH 14/54] move to Dockerfile --- .buildkite/test-pipeline.yaml | 4 +--- Dockerfile | 4 ++-- requirements.txt | 1 + 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index eb4c702e9108..8810dda8d826 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -23,9 +23,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-facebook/opt-125m - command: - - pip install nvidia-nccl-cu12==2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph - - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py + command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. 
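The back-and-forth in patches 12-16 is ultimately about which `libnccl` build gets loaded at runtime: pinning `nvidia-nccl-cu12` in `requirements.txt` or installing it in the test pipeline can be silently undone by pip's dependency resolution for `torch`. A small diagnostic sketch (not part of the patches, Linux-only) that could confirm the NCCL version actually in use inside a worker:

```python
# Illustrative diagnostic only: show the NCCL version torch was built against
# and which libnccl shared objects are mapped into this process.
import torch

print("torch reports NCCL:", torch.cuda.nccl.version())  # e.g. (2, 18, 3)

with open("/proc/self/maps") as f:  # Linux-only
    paths = {line.split()[-1] for line in f if "libnccl" in line}
for path in sorted(paths):
    print("mapped:", path)
```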
diff --git a/Dockerfile b/Dockerfile index 8be03b3567f0..ff1629f02a8d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt + pip install -r requirements.txt --force-reinstall # install development dependencies COPY requirements-dev.txt requirements-dev.txt @@ -106,7 +106,7 @@ RUN apt-get update -y \ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt + pip install -r requirements.txt --force-reinstall # Install flash attention (from pre-built wheel) RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ diff --git a/requirements.txt b/requirements.txt index 12062276d12e..e0d80463a8ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. +nvidia-nccl-cu12 == 2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file From 6ef38435c567fbba4570bcc8ae181c4a5f9304f5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 07:18:27 -0700 Subject: [PATCH 15/54] fix version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e0d80463a8ae..1e5c3f911e21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. 
-nvidia-nccl-cu12 == 2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file +nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph From 7db0e1b4a87deeef29861a090f42522099bb7575 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 07:29:51 -0700 Subject: [PATCH 16/54] use docerfile --- Dockerfile | 16 ++++++++++++++-- requirements.txt | 1 - 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index ff1629f02a8d..1ae333da5dab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,13 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt --force-reinstall + pip install -r requirements.txt +# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph +# so we use 2.18.3 +# and we are in a dependency hell with torch and cupy +# have to manually install to downgrade nccl version +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install nvidia-nccl-cu12==2.18.3 # install development dependencies COPY requirements-dev.txt requirements-dev.txt @@ -106,7 +112,13 @@ RUN apt-get update -y \ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt --force-reinstall + pip install -r requirements.txt +# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph +# so we use 2.18.3 +# and we are in a dependency hell with torch and cupy +# have to manually install to downgrade nccl version +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install nvidia-nccl-cu12==2.18.3 # Install flash attention (from pre-built wheel) RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ diff --git a/requirements.txt b/requirements.txt index 1e5c3f911e21..12062276d12e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,3 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. -nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph From 62650ae6fd02fa37a7130a7a700b082a3ba6f2aa Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 08:05:46 -0700 Subject: [PATCH 17/54] try 2.2.0 first --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 2db687a287ef..d88f646636ba 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. 
cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index c812af904b89..c0645fd09e7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.1", + "torch == 2.2.0", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index 18cfb5894d18..fdc6dcd503fb 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.1 +torch==2.2.0 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 12062276d12e..16f369365079 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.1 +torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.25 # Requires PyTorch 2.2.1. +xformers == 0.0.24 # Requires PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From 4ed16b9dbf1aca8c20c620d1bc801a5b7f3e2a7c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 09:20:20 -0700 Subject: [PATCH 18/54] place nccl install after vllm --- Dockerfile | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1ae333da5dab..179758ba0b71 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,12 +19,6 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt -# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph -# so we use 2.18.3 -# and we are in a dependency hell with torch and cupy -# have to manually install to downgrade nccl version -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install nvidia-nccl-cu12==2.18.3 # install development dependencies COPY requirements-dev.txt requirements-dev.txt @@ -96,6 +90,13 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose +# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph +# so we use 2.18.3 +# and we are in a dependency hell with torch and cupy +# have to manually install to downgrade nccl version +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install nvidia-nccl-cu12==2.18.3 + #################### TEST IMAGE #################### @@ -113,6 +114,10 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt + +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph # so we use 2.18.3 # and we are in a dependency hell with torch and cupy @@ -120,10 +125,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \ pip install nvidia-nccl-cu12==2.18.3 -# Install flash attention (from pre-built wheel) -RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ - pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir - #################### RUNTIME BASE IMAGE #################### From 2d215dfac817f9d3669daf09791e00210d00ac7a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 10:57:36 -0700 Subject: [PATCH 19/54] patchelf --- Dockerfile | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 179758ba0b71..f579e4fa0ae5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,13 +90,15 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph -# so we use 2.18.3 -# and we are in a dependency hell with torch and cupy -# have to manually install to downgrade nccl version -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install nvidia-nccl-cu12==2.18.3 - +# install nccl 2.16.2 +RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl +RUN apt install patchelf -y +# rename to 2.16 +RUN cp /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2 /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2.16 +# rename pytorch-dependent nccl to 2.19 +RUN cp /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2 /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2.19 +RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.19 /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so +RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so #################### TEST IMAGE #################### @@ -118,12 +120,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # Install flash attention (from pre-built wheel) RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir -# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph -# so we use 2.18.3 -# and we are in a dependency hell with torch and cupy -# have to manually install to downgrade nccl version -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install nvidia-nccl-cu12==2.18.3 #################### RUNTIME BASE IMAGE #################### From 0f6f24376158c9b4d1e3e93e11cc3e7ce8790a00 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 12:56:23 -0700 Subject: [PATCH 20/54] update rpath for cupy --- Dockerfile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index f579e4fa0ae5..cf688294371c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,15 +90,25 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# install nccl 2.16.2 -RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl + +# tricky part, manipulate so files to use the correct nccl version RUN apt install patchelf -y + +# cupy part +# nccl 2.16.2 that is good for cupy +RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl # rename to 2.16 RUN cp /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2 /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2.16 +# link to the correct nccl version +RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so +# set rpath to the correct nccl version +RUN patchelf --set-rpath /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/ /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so + +# pytorch part # rename pytorch-dependent nccl to 2.19 RUN cp /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2 /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2.19 +# link to a more specific nccl version RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.19 /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so -RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so #################### TEST IMAGE #################### From da1df5e24bef5b910b8132853109d30fe64bf378 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 15:47:10 -0700 Subject: [PATCH 21/54] try to write a custom pynccl --- Dockerfile | 25 +- .../parallel_utils/cupy_utils.py | 65 ++++-- vllm/model_executor/parallel_utils/pynccl.py | 217 ++++++++++++++++++ 3 files changed, 272 insertions(+), 35 deletions(-) create mode 100644 vllm/model_executor/parallel_utils/pynccl.py diff --git a/Dockerfile b/Dockerfile index cf688294371c..9d8e357a781e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,24 +91,13 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# tricky part, manipulate so files to use the correct nccl version -RUN apt install patchelf -y - -# cupy part -# nccl 2.16.2 that is good for cupy -RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl -# rename to 2.16 -RUN cp /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2 /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2.16 -# link to the correct nccl version -RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so -# set rpath to the correct nccl version -RUN patchelf --set-rpath /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/ /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so - -# pytorch part -# rename pytorch-dependent nccl to 2.19 -RUN cp /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2 /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2.19 -# link to a more specific nccl version -RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.19 /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so +# tricky part, only this version nccl is good +RUN wget https://developer.download.nvidia.com/compute/redist/nccl/v2.16.2/nccl_2.16.2-1+cuda12.0_x86_64.txz +RUN tar -xvf nccl_2.16.2-1+cuda12.0_x86_64.txz +RUN cp ./nccl_2.16.2-1+cuda12.0_x86_64/lib/libnccl.so.2.16.2 ./libnccl.so.2.16.2 +RUN rm -rf ./nccl_2.16.2-1+cuda12.0_x86_64 +RUN rm nccl_2.16.2-1+cuda12.0_x86_64.txz + #################### TEST IMAGE #################### diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index f8cffc01e3c3..f58ce6c4bdba 100644 --- a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -12,10 +12,15 @@ import torch from torch.distributed import ReduceOp + try: - import cupy - from cupy.cuda import nccl - from cupyx.distributed import NCCLBackend + # import cupy + # from cupy.cuda import nccl + # from cupyx.distributed import NCCLBackend + from .pynccl import NCCLCommunicator, ncclGetVersion + print(f"nccl version {ncclGetVersion()}") + comm: NCCLCommunicator = None + except ImportError as e: cupy = e nccl = None @@ -32,7 +37,7 @@ class NCCLBackend: } -class NCCLBackendWithBFloat16(NCCLBackend): +class NCCLBackendWithBFloat16: # This is enough to add bfloat16 support for most operations, # but broadcast will fail (will require changes in compiled # cupy code). @@ -55,12 +60,18 @@ def barrier(self) -> None: def is_initialized() -> bool: """Returns whether the NCCL backend is initialized.""" - return _NCCL_BACKEND is not None + return comm is not None @contextlib.contextmanager def set_cupy_stream(stream: torch.cuda.Stream): """Set the cuda stream for communication""" + try: + comm.stream = stream + yield + finally: + pass + return cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream, stream.device_index) with cupy_stream: @@ -74,6 +85,9 @@ def init_process_group(world_size: int, rank: int, host: str, # TODO: handle NCCL timeouts. 
""" assert not is_initialized() + global comm + comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", world_size=world_size, rank=rank) + return if isinstance(cupy, Exception): raise ImportError( @@ -99,21 +113,36 @@ def init_process_group(world_size: int, rank: int, host: str, def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: """All-reduces the input tensor across the process group.""" assert input_.is_cuda, f"{input_} should be a cuda tensor" - # Hack to support bfloat16 - torch_dtype = input_.dtype - if torch_dtype is torch.bfloat16: - # We need to view as float16, otherwise - # cupy will fail. This will not change - # the underlying data. - input_ = input_.view(torch.float16) - cupy_input = cupy.asarray(input_) - cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access - _NCCL_BACKEND.all_reduce(in_array=cupy_input, - out_array=cupy_input, - op=_OP_MAPPING[op]) + free_bytes = torch.cuda.mem_get_info()[0] + # # Hack to support bfloat16 + # torch_dtype = input_.dtype + # if torch_dtype is torch.bfloat16: + # # We need to view as float16, otherwise + # # cupy will fail. This will not change + # # the underlying data. + # input_ = input_.view(torch.float16) + # cupy_input = cupy.asarray(input_) + # cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access + # _NCCL_BACKEND.all_reduce(in_array=cupy_input, + # out_array=cupy_input, + # op=_OP_MAPPING[op]) + comm.all_reduce(input_, op) + + import os + env_name = os.environ['CONDA_DEFAULT_ENV'] + dir_name = f"/home/gcpuser/vllm/{env_name}-process-{os.getpid()}" + with open(f"{dir_name}.txt", "a") as f: + f.write(f"{free_bytes=} before allreduce\n") + free_bytes_after = torch.cuda.mem_get_info()[0] + f.write(f"{free_bytes_after=} after allreduce\n") + f.write(f"memory cost during allreduce: {(free_bytes - free_bytes_after) / 1024 / 1024} MiB\n") def destroy_process_group() -> None: + global comm + del comm + comm = None + return """Destroys the NCCL backend.""" global _NCCL_BACKEND global _WORLD_SIZE @@ -123,8 +152,10 @@ def destroy_process_group() -> None: def get_world_size() -> int: """Returns the world size.""" + return comm.world_size return _WORLD_SIZE def get_nccl_backend(): + return comm return _NCCL_BACKEND diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py new file mode 100644 index 000000000000..c6f8e6e6840c --- /dev/null +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -0,0 +1,217 @@ +# ===================== pynccl.py ===================== +# This file is a pure Python wrapper for the NCCL library. +# Copyright (c) 2024 vLLM team +# Author: Kaichao You +# Email: youkaichao@gmail.com +# All rights reserved. 
+# ==================================================== + + +# ===================== import region ===================== +import torch +import ctypes +import torch.distributed as dist +from torch.distributed import ReduceOp +import datetime + +# manually load the nccl library +nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") +# use `pip install nvidia-nccl-cu12==2.16.2` to install from pypi +# then you can use the following line to load the library +# and they cause increased memory overhead +# nccl = ctypes.CDLL("/opt/conda/envs/${CONDA_ENV}/lib/python3.9/site-packages/nvidia/nccl/lib/libnccl.so.2") + +# ===================== declare types and functions ===================== + +ncclResult_t = ctypes.c_int + +# equivalent to c declaration: +# ncclResult_t ncclGetVersion(int *version); +_c_ncclGetVersion = nccl.ncclGetVersion +_c_ncclGetVersion.restype = ctypes.c_int +_c_ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)] + +def ncclGetVersion() -> int: + version = ctypes.c_int() + result = _c_ncclGetVersion(ctypes.byref(version)) + assert result == 0 + # something like 21903 --> "2.19.3" + version_str = str(version.value) + major = version_str[0].lstrip("0") + minor = version_str[1:3].lstrip("0") + patch = version_str[3:].lstrip("0") + return f"{major}.{minor}.{patch}" + +class NcclUniqueId(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + +# equivalent to c declaration: +# ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +_c_ncclGetUniqueId = nccl.ncclGetUniqueId +_c_ncclGetUniqueId.restype = ctypes.c_int +_c_ncclGetUniqueId.argtypes = [ctypes.POINTER(NcclUniqueId)] +def ncclGetUniqueId() -> NcclUniqueId: + unique_id = NcclUniqueId() + result = _c_ncclGetUniqueId(ctypes.byref(unique_id)) + assert result == 0 + return unique_id + +def test_ncclGetUniqueId(): + unique_id = ncclGetUniqueId() + # print something like: + # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + print(list(unique_id.internal)) + + +# equivalent to c declaration: +# ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +# note that ncclComm_t is a pointer type, so the first argument is a pointer to a pointer +_c_ncclCommInitRank = nccl.ncclCommInitRank +_c_ncclCommInitRank.restype = ctypes.c_int +_c_ncclCommInitRank.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int] + +# enums +class ncclDataType_t(ctypes.c_int): + ncclInt8 = 0 + ncclChar = 0 + ncclUint8 = 1 + ncclInt32 = 2 + ncclInt = 2 + ncclUint32 = 3 + ncclInt64 = 4 + ncclUint64 = 5 + ncclFloat16 = 6 + ncclHalf = 6 + ncclFloat32 = 7 + ncclFloat = 7 + ncclFloat64 = 8 + ncclDouble = 8 + ncclBfloat16 = 9 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined + ncclNumTypes = 10 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined + + @classmethod + def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t': + if dtype == torch.int8: + return cls.ncclInt8 + if dtype == torch.uint8: + return cls.ncclUint8 + if dtype == torch.int32: + return cls.ncclInt32 + if dtype == torch.int64: + return cls.ncclInt64 + if dtype == torch.float16: + return cls.ncclFloat16 + if dtype == torch.float32: + return cls.ncclFloat32 + if 
dtype == torch.float64: + return cls.ncclFloat64 + if dtype == torch.bfloat16: + return cls.ncclBfloat16 + raise ValueError(f"Unsupported dtype: {dtype}") + +class ncclRedOp_t(ctypes.c_int): + ncclSum = 0 + ncclProd = 1 + ncclMax = 2 + ncclMin = 3 + ncclAvg = 4 + ncclNumOps = 5 + # ncclMaxRedOp value is based on enum size and int size, here simplified + ncclMaxRedOp = 0x7fffffff + + @classmethod + def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': + if op == ReduceOp.SUM: + return cls.ncclSum + if op == ReduceOp.PRODUCT: + return cls.ncclProd + if op == ReduceOp.MAX: + return cls.ncclMax + if op == ReduceOp.MIN: + return cls.ncclMin + if op == ReduceOp.AVG: + return cls.ncclAvg + raise ValueError(f"Unsupported op: {op}") + + +# equivalent to c declaration: +# ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +# note that cudaStream_t is a pointer type, so the last argument is a pointer +_c_ncclAllReduce = nccl.ncclAllReduce +_c_ncclAllReduce.restype = ctypes.c_int +_c_ncclAllReduce.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t, ncclRedOp_t, ctypes.c_void_p, ctypes.c_void_p] + + +# equivalent to c declaration: +# ncclResult_t ncclCommDestroy(ncclComm_t comm); +_c_ncclCommDestroy = nccl.ncclCommDestroy +_c_ncclCommDestroy.restype = ctypes.c_int +_c_ncclCommDestroy.argtypes = [ctypes.c_void_p] + +class NCCLCommunicator: + def __init__( + self, + backend = None, + init_method = None, + timeout = datetime.timedelta(seconds=10), + world_size: int = -1, + rank: int = -1, + store = None, + group_name: str = "", + pg_options = None, + ): + if not dist.is_initialized(): + backend = backend or "nccl" + assert backend == 'nccl', "only use gloo backend for starting the NCCL communicator" + dist.init_process_group( + backend=backend, + init_method=init_method, + timeout=timeout, + world_size=world_size, + rank=rank, + store=store, + group_name=group_name, + pg_options=pg_options + ) + self.world_size = dist.get_world_size() + self.rank = dist.get_rank() + torch.cuda.set_device(self.rank) + if self.rank == 0: + self.unique_id = ncclGetUniqueId() + else: + self.unique_id = NcclUniqueId() + tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda(self.rank) + dist.broadcast(tensor, src=0) + byte_list = tensor.cpu().tolist() + self.unique_id = NcclUniqueId() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + self.comm = ctypes.c_void_p() + result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size, self.unique_id, self.rank) + assert result == 0 + self.stream = torch.cuda.Stream(device=f"cuda:{self.rank}") + + def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None): + if stream is None: + stream = self.stream + result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()), ctypes.c_void_p(tensor.data_ptr()), tensor.numel(), ncclDataType_t.from_torch(tensor.dtype), ncclRedOp_t.from_torch(op), self.comm, ctypes.c_void_p(stream.cuda_stream)) + assert result == 0 + + def __del__(self): + dist.destroy_process_group() + _c_ncclCommDestroy(self.comm) + +def test_NCCLCommunicator(): + # use `torchrun` to launch the script + # e.g. 
`torchrun --nproc_per_node=2 pynccl.py` + comm = NCCLCommunicator() + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) + comm.all_reduce(tensor) + result = tensor.mean().cpu().item() + assert result == comm.world_size + print(result) + +if __name__ == "__main__": + test_ncclGetUniqueId() + test_NCCLCommunicator() +# ===================== pynccl.py ===================== From b4085a16865dac77c246f3896ec79c1ab1680bea Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 16:05:20 -0700 Subject: [PATCH 22/54] add wget --- .github/ISSUE_TEMPLATE/300-usage.yml | 1 + Dockerfile | 1 + vllm/worker/worker.py | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index 88227b4b2e7b..b5638bca0158 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -12,6 +12,7 @@ body: attributes: label: Your current environment description: | + If you want your issue to be answered quickly, it is **very important** to provide enough information about your environment. Please run the following and paste the output below. ```sh wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py diff --git a/Dockerfile b/Dockerfile index 9d8e357a781e..a8c4c3a8286a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,6 +92,7 @@ RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose # tricky part, only this version nccl is good +RUN apt install -y wget RUN wget https://developer.download.nvidia.com/compute/redist/nccl/v2.16.2/nccl_2.16.2-1+cuda12.0_x86_64.txz RUN tar -xvf nccl_2.16.2-1+cuda12.0_x86_64.txz RUN cp ./nccl_2.16.2-1+cuda12.0_x86_64/lib/libnccl.so.2.16.2 ./libnccl.so.2.16.2 diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 0dcd4018afa5..be7baa009cfb 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -142,6 +142,12 @@ def profile_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() + print( + f"{free_gpu_memory=}, {total_gpu_memory=}, {self.init_gpu_memory=}, {peak_memory=}" + ) + print( + f"{block_size=}, {cache_block_size=}, {num_gpu_blocks=}, {num_cpu_blocks=}" + ) return num_gpu_blocks, num_cpu_blocks def init_cache_engine(self, cache_config: CacheConfig) -> None: From f77c9ae19eba798d9dc8683c1152f2a71357816e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 16:18:00 -0700 Subject: [PATCH 23/54] delete logging code --- .../parallel_utils/cupy_utils.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index f58ce6c4bdba..94e72bf62314 100644 --- a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -113,30 +113,8 @@ def init_process_group(world_size: int, rank: int, host: str, def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: """All-reduces the input tensor across the process group.""" assert input_.is_cuda, f"{input_} should be a cuda tensor" - free_bytes = torch.cuda.mem_get_info()[0] - # # Hack to support bfloat16 - # torch_dtype = input_.dtype - # if torch_dtype is torch.bfloat16: - # # We need to view as float16, otherwise - # # cupy will fail. This will not change - # # the underlying data. 
- # input_ = input_.view(torch.float16) - # cupy_input = cupy.asarray(input_) - # cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access - # _NCCL_BACKEND.all_reduce(in_array=cupy_input, - # out_array=cupy_input, - # op=_OP_MAPPING[op]) comm.all_reduce(input_, op) - import os - env_name = os.environ['CONDA_DEFAULT_ENV'] - dir_name = f"/home/gcpuser/vllm/{env_name}-process-{os.getpid()}" - with open(f"{dir_name}.txt", "a") as f: - f.write(f"{free_bytes=} before allreduce\n") - free_bytes_after = torch.cuda.mem_get_info()[0] - f.write(f"{free_bytes_after=} after allreduce\n") - f.write(f"memory cost during allreduce: {(free_bytes - free_bytes_after) / 1024 / 1024} MiB\n") - def destroy_process_group() -> None: global comm From 2766418b5d57d5d944db8aa02f4d7dcb8e720da1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 16:57:39 -0700 Subject: [PATCH 24/54] remove some debugging print --- .github/ISSUE_TEMPLATE/300-usage.yml | 1 - vllm/worker/worker.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index b5638bca0158..88227b4b2e7b 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -12,7 +12,6 @@ body: attributes: label: Your current environment description: | - If you want your issue to be answered quickly, it is **very important** to provide enough information about your environment. Please run the following and paste the output below. ```sh wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index be7baa009cfb..0dcd4018afa5 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -142,12 +142,6 @@ def profile_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() - print( - f"{free_gpu_memory=}, {total_gpu_memory=}, {self.init_gpu_memory=}, {peak_memory=}" - ) - print( - f"{block_size=}, {cache_block_size=}, {num_gpu_blocks=}, {num_cpu_blocks=}" - ) return num_gpu_blocks, num_cpu_blocks def init_cache_engine(self, cache_config: CacheConfig) -> None: From 0e18aed831218f2acb72f05d3c9028a00ec64a6f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 17:10:51 -0700 Subject: [PATCH 25/54] use nccl 2.18.3 --- Dockerfile | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index a8c4c3a8286a..fb1bef248ae5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,13 +91,16 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# tricky part, only this version nccl is good -RUN apt install -y wget -RUN wget https://developer.download.nvidia.com/compute/redist/nccl/v2.16.2/nccl_2.16.2-1+cuda12.0_x86_64.txz -RUN tar -xvf nccl_2.16.2-1+cuda12.0_x86_64.txz -RUN cp ./nccl_2.16.2-1+cuda12.0_x86_64/lib/libnccl.so.2.16.2 ./libnccl.so.2.16.2 -RUN rm -rf ./nccl_2.16.2-1+cuda12.0_x86_64 -RUN rm nccl_2.16.2-1+cuda12.0_x86_64.txz +# tricky part, nccl 2.19 has a bug that increased memory overhead of cudagraph +# however, pytorch has binary dependencies on nccl 2.19 +# simply using `pip install nvidia-nccl-cu12==2.18.3` will break pytorch +# so we have to manually download nccl 2.18 and keep the library to a secrect place +RUN apt install -y wget unzip +RUN wget https://files.pythonhosted.org/packages/44/6e/3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl +RUN unzip nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl +RUN cp ./nvidia/nccl/lib/libnccl.so.2 ./libnccl.so.2 +RUN rm -rf ./nvidia +RUN rm nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl #################### TEST IMAGE #################### From 7c531b037038fa9d1a6c8d02fe7b1b5849728325 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:31:51 -0700 Subject: [PATCH 26/54] add test for pynccl --- tests/distributed/test_pynccl.py | 48 ++++++++++++++++++ vllm/model_executor/parallel_utils/pynccl.py | 51 ++++++-------------- 2 files changed, 64 insertions(+), 35 deletions(-) create mode 100644 tests/distributed/test_pynccl.py diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py new file mode 100644 index 000000000000..1dd9ec8dada5 --- /dev/null +++ b/tests/distributed/test_pynccl.py @@ -0,0 +1,48 @@ +# this script is not run with `pytest`. +# It is run with `torchrun`. 
+import os +import multiprocessing +import pytest +import torch +from vllm.model_executor.parallel_utils.pynccl import ( + NCCLCommunicator, + ncclGetUniqueId, + ncclCommInitRank, + ncclCommDestroy, +) + +def worker_fn(env): + import os + os.environ.update(env) + + # when environments are properly set, the usage is simple + comm = NCCLCommunicator() + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) + comm.all_reduce(tensor) + result = tensor.mean().cpu().item() + assert result == comm.world_size + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl(): + number_of_processes = 2 + processes = [] + for i in range(number_of_processes): + env = os.environ.copy() + env['RANK'] = str(i) + env['WORLD_SIZE'] = str(number_of_processes) + env['MASTER_ADDR'] = 'localhost' + env['MASTER_PORT'] = '12345' + p = multiprocessing.Process(target=worker_fn, args=(env,)) + processes.append(p) + p.start() + + for p in processes: + p.join() + + +def test_ncclGetUniqueId(): + unique_id = ncclGetUniqueId() + # `list(unique_id.internal)` is something like this: + # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + # as long as the function doesn't raise an exception, we're good diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index c6f8e6e6840c..8827aab25675 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -1,9 +1,13 @@ -# ===================== pynccl.py ===================== +# ===================== pynccl.py ================================== # This file is a pure Python wrapper for the NCCL library. -# Copyright (c) 2024 vLLM team -# Author: Kaichao You -# Email: youkaichao@gmail.com -# All rights reserved. +# The main purpose is to use NCCL combined with CUDA graph. +# Before writing this script, we tried the following approach: +# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself +# often gets stuck when initializing the NCCL communicator. +# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce` +# contains many other potential cuda APIs, that are not allowed during +# capturing the CUDA graph. 
For further details, please check +# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/199366 # ==================================================== @@ -15,13 +19,12 @@ import datetime # manually load the nccl library +# TODO: find the path programmatically nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") -# use `pip install nvidia-nccl-cu12==2.16.2` to install from pypi -# then you can use the following line to load the library -# and they cause increased memory overhead -# nccl = ctypes.CDLL("/opt/conda/envs/${CONDA_ENV}/lib/python3.9/site-packages/nvidia/nccl/lib/libnccl.so.2") -# ===================== declare types and functions ===================== +# ===================== export types and functions from nccl to Python ===================== +# for the original nccl definition, please check +# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in ncclResult_t = ctypes.c_int @@ -56,13 +59,6 @@ def ncclGetUniqueId() -> NcclUniqueId: assert result == 0 return unique_id -def test_ncclGetUniqueId(): - unique_id = ncclGetUniqueId() - # print something like: - # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - print(list(unique_id.internal)) - - # equivalent to c declaration: # ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); # note that ncclComm_t is a pointer type, so the first argument is a pointer to a pointer @@ -86,8 +82,8 @@ class ncclDataType_t(ctypes.c_int): ncclFloat = 7 ncclFloat64 = 8 ncclDouble = 8 - ncclBfloat16 = 9 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined - ncclNumTypes = 10 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined + ncclBfloat16 = 9 + ncclNumTypes = 10 @classmethod def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t': @@ -116,8 +112,6 @@ class ncclRedOp_t(ctypes.c_int): ncclMin = 3 ncclAvg = 4 ncclNumOps = 5 - # ncclMaxRedOp value is based on enum size and int size, here simplified - ncclMaxRedOp = 0x7fffffff @classmethod def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': @@ -162,7 +156,7 @@ def __init__( ): if not dist.is_initialized(): backend = backend or "nccl" - assert backend == 'nccl', "only use gloo backend for starting the NCCL communicator" + assert backend == 'nccl', "only use nccl backend for starting the NCCL communicator" dist.init_process_group( backend=backend, init_method=init_method, @@ -201,17 +195,4 @@ def __del__(self): dist.destroy_process_group() _c_ncclCommDestroy(self.comm) -def test_NCCLCommunicator(): - # use `torchrun` to launch the script - # e.g. 
`torchrun --nproc_per_node=2 pynccl.py` - comm = NCCLCommunicator() - tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) - comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == comm.world_size - print(result) - -if __name__ == "__main__": - test_ncclGetUniqueId() - test_NCCLCommunicator() # ===================== pynccl.py ===================== From 1abf38e76dcd438241c2512b27fb6b0546b89ad2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:39:33 -0700 Subject: [PATCH 27/54] fix linter --- tests/distributed/test_pynccl.py | 14 +++- .../parallel_utils/cupy_utils.py | 6 +- vllm/model_executor/parallel_utils/pynccl.py | 84 ++++++++++++------- 3 files changed, 69 insertions(+), 35 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 1dd9ec8dada5..06f90452c42b 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -7,10 +7,9 @@ from vllm.model_executor.parallel_utils.pynccl import ( NCCLCommunicator, ncclGetUniqueId, - ncclCommInitRank, - ncclCommDestroy, ) + def worker_fn(env): import os os.environ.update(env) @@ -22,6 +21,7 @@ def worker_fn(env): result = tensor.mean().cpu().item() assert result == comm.world_size + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl(): @@ -33,7 +33,7 @@ def test_pynccl(): env['WORLD_SIZE'] = str(number_of_processes) env['MASTER_ADDR'] = 'localhost' env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=worker_fn, args=(env,)) + p = multiprocessing.Process(target=worker_fn, args=(env, )) processes.append(p) p.start() @@ -44,5 +44,11 @@ def test_pynccl(): def test_ncclGetUniqueId(): unique_id = ncclGetUniqueId() # `list(unique_id.internal)` is something like this: - # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # as long as the function doesn't raise an exception, we're good + assert unique_id is not None diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index 94e72bf62314..cb8d7385f31f 100644 --- a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -12,7 +12,6 @@ import torch from torch.distributed import ReduceOp - try: # import cupy # from cupy.cuda import nccl @@ -86,7 +85,9 @@ def init_process_group(world_size: int, rank: int, host: str, """ assert not is_initialized() global comm - comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", world_size=world_size, rank=rank) + comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", + world_size=world_size, + rank=rank) return if isinstance(cupy, Exception): @@ -118,7 +119,6 @@ def 
all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: def destroy_process_group() -> None: global comm - del comm comm = None return """Destroys the NCCL backend.""" diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 8827aab25675..539f350d4a2a 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -10,7 +10,6 @@ # https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/199366 # ==================================================== - # ===================== import region ===================== import torch import ctypes @@ -22,7 +21,7 @@ # TODO: find the path programmatically nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") -# ===================== export types and functions from nccl to Python ===================== +# === export types and functions from nccl to Python === # for the original nccl definition, please check # https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in @@ -34,6 +33,7 @@ _c_ncclGetVersion.restype = ctypes.c_int _c_ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)] + def ncclGetVersion() -> int: version = ctypes.c_int() result = _c_ncclGetVersion(ctypes.byref(version)) @@ -45,26 +45,36 @@ def ncclGetVersion() -> int: patch = version_str[3:].lstrip("0") return f"{major}.{minor}.{patch}" + class NcclUniqueId(ctypes.Structure): _fields_ = [("internal", ctypes.c_byte * 128)] + # equivalent to c declaration: # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); _c_ncclGetUniqueId = nccl.ncclGetUniqueId _c_ncclGetUniqueId.restype = ctypes.c_int _c_ncclGetUniqueId.argtypes = [ctypes.POINTER(NcclUniqueId)] + + def ncclGetUniqueId() -> NcclUniqueId: unique_id = NcclUniqueId() result = _c_ncclGetUniqueId(ctypes.byref(unique_id)) assert result == 0 return unique_id + # equivalent to c declaration: -# ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); -# note that ncclComm_t is a pointer type, so the first argument is a pointer to a pointer +# ncclResult_t ncclCommInitRank( +# ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +# note that ncclComm_t is a pointer type, so the first argument +# is a pointer to a pointer _c_ncclCommInitRank = nccl.ncclCommInitRank _c_ncclCommInitRank.restype = ctypes.c_int -_c_ncclCommInitRank.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int] +_c_ncclCommInitRank.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int +] + # enums class ncclDataType_t(ctypes.c_int): @@ -105,6 +115,7 @@ def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t': return cls.ncclBfloat16 raise ValueError(f"Unsupported dtype: {dtype}") + class ncclRedOp_t(ctypes.c_int): ncclSum = 0 ncclProd = 1 @@ -129,12 +140,17 @@ def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': # equivalent to c declaration: -# ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +# ncclResult_t ncclAllReduce( +# const void* sendbuff, void* recvbuff, size_t count, +# ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, +# udaStream_t stream); # note that cudaStream_t is a pointer type, so the last argument is a pointer _c_ncclAllReduce = nccl.ncclAllReduce _c_ncclAllReduce.restype = ctypes.c_int -_c_ncclAllReduce.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t, ncclRedOp_t, 
ctypes.c_void_p, ctypes.c_void_p] - +_c_ncclAllReduce.argtypes = [ + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t, + ncclRedOp_t, ctypes.c_void_p, ctypes.c_void_p +] # equivalent to c declaration: # ncclResult_t ncclCommDestroy(ncclComm_t comm); @@ -142,31 +158,32 @@ def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': _c_ncclCommDestroy.restype = ctypes.c_int _c_ncclCommDestroy.argtypes = [ctypes.c_void_p] + class NCCLCommunicator: + def __init__( self, - backend = None, - init_method = None, - timeout = datetime.timedelta(seconds=10), + backend=None, + init_method=None, + timeout=datetime.timedelta(seconds=10), world_size: int = -1, rank: int = -1, - store = None, + store=None, group_name: str = "", - pg_options = None, + pg_options=None, ): if not dist.is_initialized(): backend = backend or "nccl" - assert backend == 'nccl', "only use nccl backend for starting the NCCL communicator" - dist.init_process_group( - backend=backend, - init_method=init_method, - timeout=timeout, - world_size=world_size, - rank=rank, - store=store, - group_name=group_name, - pg_options=pg_options - ) + assert backend == 'nccl', ( + "only use nccl backend for starting the NCCL communicator") + dist.init_process_group(backend=backend, + init_method=init_method, + timeout=timeout, + world_size=world_size, + rank=rank, + store=store, + group_name=group_name, + pg_options=pg_options) self.world_size = dist.get_world_size() self.rank = dist.get_rank() torch.cuda.set_device(self.rank) @@ -174,25 +191,36 @@ def __init__( self.unique_id = ncclGetUniqueId() else: self.unique_id = NcclUniqueId() - tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda(self.rank) + tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda( + self.rank) dist.broadcast(tensor, src=0) byte_list = tensor.cpu().tolist() self.unique_id = NcclUniqueId() for i, byte in enumerate(byte_list): self.unique_id.internal[i] = byte self.comm = ctypes.c_void_p() - result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size, self.unique_id, self.rank) + result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size, + self.unique_id, self.rank) assert result == 0 self.stream = torch.cuda.Stream(device=f"cuda:{self.rank}") - def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None): + def all_reduce(self, + tensor: torch.Tensor, + op: ReduceOp = ReduceOp.SUM, + stream=None): if stream is None: stream = self.stream - result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()), ctypes.c_void_p(tensor.data_ptr()), tensor.numel(), ncclDataType_t.from_torch(tensor.dtype), ncclRedOp_t.from_torch(op), self.comm, ctypes.c_void_p(stream.cuda_stream)) + result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()), + ctypes.c_void_p(tensor.data_ptr()), + tensor.numel(), + ncclDataType_t.from_torch(tensor.dtype), + ncclRedOp_t.from_torch(op), self.comm, + ctypes.c_void_p(stream.cuda_stream)) assert result == 0 def __del__(self): dist.destroy_process_group() _c_ncclCommDestroy(self.comm) + # ===================== pynccl.py ===================== From 5d661a679166dcffd44affb0bd6508e3f42b3e7c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:45:49 -0700 Subject: [PATCH 28/54] update cupy_utils to pynccl --- .../parallel_utils/cupy_utils.py | 95 ++----------------- 1 file changed, 7 insertions(+), 88 deletions(-) diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index cb8d7385f31f..5c06b40a3f56 100644 --- 
a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -1,60 +1,17 @@ -"""CuPy utilities for all-reduce. +from .pynccl import NCCLCommunicator, ncclGetVersion -We use CuPy all-reduce instead of torch.distributed.all_reduce when capturing -CUDA graphs, because torch.distributed.all_reduce causes errors when capturing -CUDA graphs. - -NOTE: We use CuPy 12.3 since CuPy 13.0 does not support Python 3.8. -TODO: Remove this file when torch.distributed.all_reduce is fixed. -""" import contextlib - +import logging import torch -from torch.distributed import ReduceOp - -try: - # import cupy - # from cupy.cuda import nccl - # from cupyx.distributed import NCCLBackend - from .pynccl import NCCLCommunicator, ncclGetVersion - print(f"nccl version {ncclGetVersion()}") - comm: NCCLCommunicator = None - -except ImportError as e: - cupy = e - nccl = None - - class NCCLBackend: - ... - - -_OP_MAPPING = { - ReduceOp.SUM: "sum", - ReduceOp.PRODUCT: "prod", - ReduceOp.MIN: "min", - ReduceOp.MAX: "max", -} +from typing import Optional +from torch.distributed import ReduceOp -class NCCLBackendWithBFloat16: - # This is enough to add bfloat16 support for most operations, - # but broadcast will fail (will require changes in compiled - # cupy code). - def _get_nccl_dtype_and_count(self, array, count=None): - nccl_dtype, count = super()._get_nccl_dtype_and_count(array, count) - torch_dtype = getattr(array, "_torch_dtype", None) - if torch_dtype is torch.bfloat16: - nccl_dtype = nccl.NCCL_BFLOAT16 - return nccl_dtype, count - - def barrier(self) -> None: - raise RuntimeError( - "Currently, CuPy NCCL barrier is not supported since the TCP " - "store is immediately stopped after the initialization.") +logger = logging.getLogger(__name__) +logger.info(f"vLLM is using nccl=={ncclGetVersion()}") -_NCCL_BACKEND = None -_WORLD_SIZE = 0 +comm: Optional[NCCLCommunicator] = None def is_initialized() -> bool: @@ -70,45 +27,15 @@ def set_cupy_stream(stream: torch.cuda.Stream): yield finally: pass - return - cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream, - stream.device_index) - with cupy_stream: - yield def init_process_group(world_size: int, rank: int, host: str, port: int) -> None: - """Initializes the CuPy NCCL backend. - - # TODO: handle NCCL timeouts. - """ assert not is_initialized() global comm comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", world_size=world_size, rank=rank) - return - - if isinstance(cupy, Exception): - raise ImportError( - "NCCLBackend is not available. Please install cupy.") from cupy - - # TODO(woosuk): Create TP and PP process groups for CuPy. - global _NCCL_BACKEND - global _WORLD_SIZE - assert world_size > 0, f"{world_size=} should be a positive integer" - assert 0 <= rank < world_size, ( - f"{rank=} should be a integer between [0, {world_size})") - - cupy.cuda.runtime.setDevice(torch.cuda.current_device()) - _NCCL_BACKEND = NCCLBackendWithBFloat16(world_size, rank, host, port) - _WORLD_SIZE = world_size - - # Stop the TCP store to prevent the deadlock issues at termination time. - # FIXME(woosuk): This is hacky. Find a more robust solution. 
- if rank == 0 and hasattr(_NCCL_BACKEND, "_store"): - _NCCL_BACKEND._store.stop() def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: @@ -120,20 +47,12 @@ def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: def destroy_process_group() -> None: global comm comm = None - return - """Destroys the NCCL backend.""" - global _NCCL_BACKEND - global _WORLD_SIZE - _NCCL_BACKEND = None - _WORLD_SIZE = 0 def get_world_size() -> int: """Returns the world size.""" return comm.world_size - return _WORLD_SIZE def get_nccl_backend(): return comm - return _NCCL_BACKEND From 99f96d7f7f21498309f55bf8cef658beeeed4b54 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:52:46 -0700 Subject: [PATCH 29/54] rename cupy_utils to pynccl_utils --- .../parallel_utils/communication_op.py | 4 ++-- vllm/model_executor/parallel_utils/parallel_state.py | 6 +++--- .../{cupy_utils.py => pynccl_utils.py} | 0 vllm/worker/model_runner.py | 6 +++--- vllm/worker/worker.py | 12 ++++++------ 5 files changed, 14 insertions(+), 14 deletions(-) rename vllm/model_executor/parallel_utils/{cupy_utils.py => pynccl_utils.py} (100%) diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 6f00fd001d95..01036ab5fba2 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -4,7 +4,7 @@ import torch from torch.distributed import ProcessGroup -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -35,7 +35,7 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: return out if is_cupy_nccl_enabled_for_all_reduce(): # TODO: support multiple parallel groups. - cupy_utils.all_reduce(input_) + pynccl_utils.all_reduce(input_) else: torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index c821936d06e4..8f3b1549d609 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -7,7 +7,7 @@ import torch -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils # Tensor model parallel group that the current rank belongs to. _TENSOR_MODEL_PARALLEL_GROUP = None @@ -211,7 +211,7 @@ def destroy_model_parallel(): _PIPELINE_GLOBAL_RANKS = None # Destroy the cupy states if any. - cupy_utils.destroy_process_group() + pynccl_utils.destroy_process_group() # Whether to use cupy for nccl all reduce. 
@@ -234,7 +234,7 @@ def with_cupy_nccl_for_all_reduce(): _ENABLE_CUPY_FOR_ALL_REDUCE = True stream = torch.cuda.current_stream() - with cupy_utils.set_cupy_stream(stream): + with pynccl_utils.set_cupy_stream(stream): yield _ENABLE_CUPY_FOR_ALL_REDUCE = old diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py similarity index 100% rename from vllm/model_executor/parallel_utils/cupy_utils.py rename to vllm/model_executor/parallel_utils/pynccl_utils.py diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b8eeb51379f4..b2d6c96165c4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -11,7 +11,7 @@ from vllm.logger import init_logger from vllm.model_executor import InputMetadata, SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) from vllm.model_executor.parallel_utils.parallel_state import ( @@ -720,7 +720,7 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: """ # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never # deleted before the CUDA graphs. - self.cupy_nccl_backend = cupy_utils.get_nccl_backend() + self.cupy_nccl_backend = pynccl_utils.get_nccl_backend() assert not self.model_config.enforce_eager logger.info("Capturing the model for CUDA graphs. This may lead to " @@ -900,7 +900,7 @@ def __call__(self, *args, **kwargs): @contextlib.contextmanager def _maybe_cupy_nccl(): - if cupy_utils.is_initialized() and not custom_all_reduce.is_initialized(): + if pynccl_utils.is_initialized() and not custom_all_reduce.is_initialized(): with with_cupy_nccl_for_all_reduce(): yield else: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d8999dc17212..c979effae048 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -9,7 +9,7 @@ from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.model_executor import set_random_seed -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar @@ -262,8 +262,8 @@ def init_distributed_environment( init_method=distributed_init_method, ) - if cupy_utils.is_initialized(): - cupy_world_size = cupy_utils.get_world_size() + if pynccl_utils.is_initialized(): + cupy_world_size = pynccl_utils.get_world_size() if cupy_world_size != parallel_config.world_size: raise RuntimeError( "cupy.distributed is already initialized but the cupy world " @@ -273,7 +273,7 @@ def init_distributed_environment( # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection. - cupy_utils.init_process_group( + pynccl_utils.init_process_group( world_size=parallel_config.world_size, rank=rank, host="localhost", @@ -282,8 +282,8 @@ def init_distributed_environment( # A small all_reduce for warmup. 
torch.distributed.all_reduce(torch.zeros(1).cuda()) - if cupy_utils.is_initialized(): - cupy_utils.all_reduce(torch.zeros(1).cuda()) + if pynccl_utils.is_initialized(): + pynccl_utils.all_reduce(torch.zeros(1).cuda()) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) From b567f0408202172fd8f6ec2df196948930197da6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:53:48 -0700 Subject: [PATCH 30/54] update import --- vllm/model_executor/parallel_utils/pynccl_utils.py | 6 ++++-- vllm/worker/model_runner.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 5c06b40a3f56..584c26bced9f 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -1,11 +1,13 @@ -from .pynccl import NCCLCommunicator, ncclGetVersion - import contextlib import logging import torch from typing import Optional from torch.distributed import ReduceOp +from vllm.model_executor.parallel_utils.pynccl import ( + NCCLCommunicator, + ncclGetVersion, +) logger = logging.getLogger(__name__) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b2d6c96165c4..0fa33a6debc5 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -900,7 +900,8 @@ def __call__(self, *args, **kwargs): @contextlib.contextmanager def _maybe_cupy_nccl(): - if pynccl_utils.is_initialized() and not custom_all_reduce.is_initialized(): + if pynccl_utils.is_initialized( + ) and not custom_all_reduce.is_initialized(): with with_cupy_nccl_for_all_reduce(): yield else: From 74fcf08b020a0e85728584f2775c8c5a0028f960 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:02:38 -0700 Subject: [PATCH 31/54] update pytorch in cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66842e6845ed..7c9ca497fc2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # requirements.txt files and should be kept consistent. 
The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2") +set(TORCH_SUPPORTED_VERSION_CUDA "2.2.0") set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") From 43da101f440095998ab2e4eee69599afd25703fd Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:16:05 -0700 Subject: [PATCH 32/54] add test with cudagraph --- tests/distributed/test_pynccl.py | 68 ++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 06f90452c42b..651e1471c93c 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -10,22 +10,8 @@ ) -def worker_fn(env): - import os - os.environ.update(env) - - # when environments are properly set, the usage is simple - comm = NCCLCommunicator() - tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) - comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == comm.world_size - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -def test_pynccl(): - number_of_processes = 2 +def distributed_run(fn, world_size): + number_of_processes = world_size processes = [] for i in range(number_of_processes): env = os.environ.copy() @@ -33,7 +19,7 @@ def test_pynccl(): env['WORLD_SIZE'] = str(number_of_processes) env['MASTER_ADDR'] = 'localhost' env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=worker_fn, args=(env, )) + p = multiprocessing.Process(target=fn, args=(env, )) processes.append(p) p.start() @@ -41,6 +27,54 @@ def test_pynccl(): p.join() +def update_env(fn): + + def wrapper(env): + import os + os.environ.update(env) + fn() + + return wrapper + + +@update_env +def worker_fn(): + comm = NCCLCommunicator() + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) + comm.all_reduce(tensor) + result = tensor.mean().cpu().item() + assert result == comm.world_size + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl(): + distributed_run(worker_fn, 2) + + +@update_env +def worker_fn_with_cudagraph(): + with torch.no_grad(): + graph = torch.cuda.CUDAGraph() + comm = NCCLCommunicator() + # run something in the default stream to initialize torch engine + a = torch.ones((4, 4), device=f'cuda:{comm.rank}') + torch.cuda.synchronize() + with torch.cuda.graph(graph, stream=comm.stream): + comm.all_reduce(a) + comm.stream.synchronize() + assert a.mean().cpu().item() == comm.world_size + graph.replay() + comm.stream.synchronize() + assert a.mean().cpu().item() == comm.world_size**2 + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl_with_cudagraph(): + distributed_run(worker_fn_with_cudagraph, 2) + + def test_ncclGetUniqueId(): unique_id = ncclGetUniqueId() # `list(unique_id.internal)` is something like this: From 37e7425674417378d567120bc54a3e3fe751e543 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:30:04 -0700 Subject: [PATCH 33/54] fix test; fix TORCH_CUDA_ARCH_LIST --- CMakeLists.txt | 3 +++ tests/distributed/test_pynccl.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c9ca497fc2b..29d11f9a04d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,9 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") # 
Supported NVIDIA architectures. set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +# used when building pytorch-related extensions +set(TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0;8.6;8.9;9.0") + # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 651e1471c93c..58376306c277 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -63,7 +63,7 @@ def worker_fn_with_cudagraph(): with torch.cuda.graph(graph, stream=comm.stream): comm.all_reduce(a) comm.stream.synchronize() - assert a.mean().cpu().item() == comm.world_size + assert a.mean().cpu().item() == comm.world_size**0 graph.replay() comm.stream.synchronize() assert a.mean().cpu().item() == comm.world_size**2 From 7e983f5e32302b44773c9252af3cce281f8344f8 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:51:30 -0700 Subject: [PATCH 34/54] fix amd tests --- .../parallel_utils/pynccl_utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 584c26bced9f..4f1e03c260d7 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -4,16 +4,21 @@ from typing import Optional from torch.distributed import ReduceOp -from vllm.model_executor.parallel_utils.pynccl import ( - NCCLCommunicator, - ncclGetVersion, -) logger = logging.getLogger(__name__) -logger.info(f"vLLM is using nccl=={ncclGetVersion()}") - -comm: Optional[NCCLCommunicator] = None +try: + from vllm.model_executor.parallel_utils.pynccl import ( + NCCLCommunicator, + ncclGetVersion, + ) + logger.info(f"vLLM is using nccl=={ncclGetVersion()}") +except Exception: + # in non-NVIDIA environments, we can't import the nccl module + # e.g. when running on machines with AMD GPUs + pass + +comm: Optional["NCCLCommunicator"] = None def is_initialized() -> bool: From e3f8d5f866f530d32bfe37bbf2676bbd0fce87c4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 20:14:20 -0700 Subject: [PATCH 35/54] add pynccl test --- .buildkite/test-pipeline.yaml | 5 +++++ vllm/model_executor/parallel_utils/pynccl_utils.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index cae2d9f0517f..0654fcfef0da 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -22,6 +22,11 @@ steps: working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. +- label: Distributed pynccl Test + command: pytest -v -s --forked test_pynccl.py + working_dir: "/vllm-workspace/tests/distributed" + num_gpus: 2 # only support 1 or 2 for now. + - label: Distributed Correctness Test-facebook/opt-125m command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 4f1e03c260d7..62b8b22c3143 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -13,9 +13,11 @@ ncclGetVersion, ) logger.info(f"vLLM is using nccl=={ncclGetVersion()}") -except Exception: +except Exception as e: # in non-NVIDIA environments, we can't import the nccl module # e.g. 
when running on machines with AMD GPUs + logger.info(f"Failed to import NCCL library: {e}") + logger.info("It is expected if you are not running on NVIDIA GPUs.") pass comm: Optional["NCCLCommunicator"] = None From 4e277ae3ba83308acdbda59dec66f029c826be17 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:22:26 -0700 Subject: [PATCH 36/54] pack up libnccl.so --- setup.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 47cac5996f81..5baefdec8af3 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,15 @@ from packaging.version import parse, Version from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext +from setuptools.command.install import install from shutil import which import torch from torch.utils.cpp_extension import CUDA_HOME +import zipfile +import shutil +import logging + +logger = logging.getLogger(__name__) ROOT_DIR = os.path.dirname(__file__) @@ -184,6 +190,56 @@ def _is_neuron() -> bool: return torch_neuronx_installed +class CustomInstallCommand(install): + + def run(self): + # Call the standard install process first + install.run(self) + + if not _is_cuda(): + return + + # Define the URL of the file and the directory to unzip to + file_url = ( + 'https://files.pythonhosted.org/packages/44/6e/' + '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' + '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') + + logger.info('Installing NVIDIA NCCL library...') + + # `self.install_lib` is something like /path/to/python/site-packages/ + target_dir = self.install_lib + "vllm/lib/" + # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. the + # temporary directory where the package is being built + temp_dir = self.root + local_zip_path = ( + f"{temp_dir}/" + "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") + # check if the target directory exists + if not os.path.exists(target_dir): + logger.info(f'Creating target directory {target_dir} ...') + os.makedirs(target_dir) + # Check if the file is already downloaded + if os.path.exists(target_dir + "nvidia"): + logger.info('library already exists.') + return + if not os.path.exists(local_zip_path): + # Download the file + logger.info('Downloading file...') + os.system(f"wget {file_url} -q -P {temp_dir}/") + # Unzip the file + logger.info('Unzipping file...') + with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") + os.remove(local_zip_path) + # Move the unzipped files to the target directory + logger.info('Moving files...') + os.system(f"mv {temp_dir}/nvidia {target_dir}") + so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" + os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) + + def _install_punica() -> bool: return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) @@ -362,6 +418,9 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, + cmdclass={ + "build_ext": cmake_build_ext if not _is_neuron() else build_ext, + "install": CustomInstallCommand, + }, package_data=package_data, ) From a20d802bba8a173122e100bfec183e6563220f6d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:29:08 -0700 Subject: [PATCH 37/54] add so in setup.py, and use programatical path in pynccl --- 
Dockerfile | 10 +--------- setup.py | 6 ++++++ vllm/model_executor/parallel_utils/pynccl.py | 7 +++++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4dbbc65a9b1b..6246a7e40773 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,16 +93,8 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose -# tricky part, nccl 2.19 has a bug that increased memory overhead of cudagraph -# however, pytorch has binary dependencies on nccl 2.19 -# simply using `pip install nvidia-nccl-cu12==2.18.3` will break pytorch -# so we have to manually download nccl 2.18 and keep the library to a secrect place +# used for downloading files RUN apt install -y wget unzip -RUN wget https://files.pythonhosted.org/packages/44/6e/3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl -RUN unzip nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl -RUN cp ./nvidia/nccl/lib/libnccl.so.2 ./libnccl.so.2 -RUN rm -rf ./nvidia -RUN rm nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl #################### TEST IMAGE #################### diff --git a/setup.py b/setup.py index 5baefdec8af3..7a4db9ac2e00 100644 --- a/setup.py +++ b/setup.py @@ -199,6 +199,12 @@ def run(self): if not _is_cuda(): return + # tricky part, nccl 2.19 has a bug that increased memory overhead + # of cudagraph. However, pytorch has binary dependencies on nccl 2.19, + # simply `pip install nvidia-nccl-cu12==2.18.3` will break pytorch, + # so we have to manually download nccl 2.18 and keep the library to + # a secrect place + # Define the URL of the file and the directory to unzip to file_url = ( 'https://files.pythonhosted.org/packages/44/6e/' diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 539f350d4a2a..679a9fa03d1a 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -16,10 +16,13 @@ import torch.distributed as dist from torch.distributed import ReduceOp import datetime +import os +import glob # manually load the nccl library -# TODO: find the path programmatically -nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") +_path = os.path.dirname(os.path.abspath(__file__)) +so_file = glob.glob(f"{_path}/../../lib/nvidia/lib/libnccl.so.*")[0] +nccl = ctypes.CDLL(so_file) # === export types and functions from nccl to Python === # for the original nccl definition, please check From dfc9d8237ec12d29c51124e9d9af58d20fc4d5bb Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:38:37 -0700 Subject: [PATCH 38/54] rename cupy --> pynccl --- .../parallel_utils/parallel_state.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index 8f3b1549d609..55c65e434d9c 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -210,36 +210,36 @@ def destroy_model_parallel(): global _PIPELINE_GLOBAL_RANKS _PIPELINE_GLOBAL_RANKS = None - # Destroy the cupy states if any. + # Destroy the pynccl states if any. pynccl_utils.destroy_process_group() -# Whether to use cupy for nccl all reduce. 
-# We use cupy for all reduce when using CUDA graph, because torch.distributed +# Whether to use pynccl for nccl all reduce. +# We use pynccl for all reduce when using CUDA graph, because torch.distributed # is not well supported by CUDA graph. -_ENABLE_CUPY_FOR_ALL_REDUCE = False +_ENABLE_PYNCCL_FOR_ALL_REDUCE = False @contextlib.contextmanager def with_cupy_nccl_for_all_reduce(): - """use CuPy nccl instead of torch.distributed for all reduce""" + """use pynccl instead of torch.distributed for all reduce""" tp_size = get_tensor_model_parallel_world_size() if tp_size == 1: # No-op. # NOTE(woosuk): We don't initialize CuPy when tp_size is 1. yield else: - global _ENABLE_CUPY_FOR_ALL_REDUCE - old = _ENABLE_CUPY_FOR_ALL_REDUCE - _ENABLE_CUPY_FOR_ALL_REDUCE = True + global _ENABLE_PYNCCL_FOR_ALL_REDUCE + old = _ENABLE_PYNCCL_FOR_ALL_REDUCE + _ENABLE_PYNCCL_FOR_ALL_REDUCE = True stream = torch.cuda.current_stream() with pynccl_utils.set_cupy_stream(stream): yield - _ENABLE_CUPY_FOR_ALL_REDUCE = old + _ENABLE_PYNCCL_FOR_ALL_REDUCE = old def is_cupy_nccl_enabled_for_all_reduce(): - """check if CuPy nccl is enabled for all reduce""" - global _ENABLE_CUPY_FOR_ALL_REDUCE - return _ENABLE_CUPY_FOR_ALL_REDUCE + """check if pynccl is enabled for all reduce""" + global _ENABLE_PYNCCL_FOR_ALL_REDUCE + return _ENABLE_PYNCCL_FOR_ALL_REDUCE From 8a5a011a6722caf45cbb61fc5dfc5ca013b40a0c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:41:20 -0700 Subject: [PATCH 39/54] rename cupy --> pynccl --- vllm/worker/model_runner.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0fa33a6debc5..374f519afc81 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -15,7 +15,7 @@ from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) from vllm.model_executor.parallel_utils.parallel_state import ( - with_cupy_nccl_for_all_reduce) + with_pynccl_for_all_reduce) from vllm.model_executor.parallel_utils import custom_all_reduce from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata @@ -834,7 +834,7 @@ def capture( # Run the model once without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). 
- with _maybe_cupy_nccl(): + with _maybe_pynccl(): self.model( input_ids, positions, @@ -848,7 +848,7 @@ def capture( # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement self.graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self.graph, pool=memory_pool): # noqa: SIM117 - with _maybe_cupy_nccl(): + with _maybe_pynccl(): hidden_states = self.model( input_ids, positions, @@ -899,10 +899,10 @@ def __call__(self, *args, **kwargs): @contextlib.contextmanager -def _maybe_cupy_nccl(): +def _maybe_pynccl(): if pynccl_utils.is_initialized( ) and not custom_all_reduce.is_initialized(): - with with_cupy_nccl_for_all_reduce(): + with with_pynccl_for_all_reduce(): yield else: yield From a009e311c5aa9db048ad60f139235856a4e96ae7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:41:59 -0700 Subject: [PATCH 40/54] rename cupy --> pynccl --- vllm/model_executor/parallel_utils/parallel_state.py | 4 ++-- vllm/model_executor/parallel_utils/pynccl_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index 55c65e434d9c..dbec519fa1db 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -221,7 +221,7 @@ def destroy_model_parallel(): @contextlib.contextmanager -def with_cupy_nccl_for_all_reduce(): +def with_pynccl_for_all_reduce(): """use pynccl instead of torch.distributed for all reduce""" tp_size = get_tensor_model_parallel_world_size() if tp_size == 1: @@ -234,7 +234,7 @@ def with_cupy_nccl_for_all_reduce(): _ENABLE_PYNCCL_FOR_ALL_REDUCE = True stream = torch.cuda.current_stream() - with pynccl_utils.set_cupy_stream(stream): + with pynccl_utils.set_pynccl_stream(stream): yield _ENABLE_PYNCCL_FOR_ALL_REDUCE = old diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 62b8b22c3143..e498526b71bb 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -29,7 +29,7 @@ def is_initialized() -> bool: @contextlib.contextmanager -def set_cupy_stream(stream: torch.cuda.Stream): +def set_pynccl_stream(stream: torch.cuda.Stream): """Set the cuda stream for communication""" try: comm.stream = stream From 68e4792739b418dbaa00117f8b04a17b379e66cd Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:42:45 -0700 Subject: [PATCH 41/54] rename cupy --> pynccl --- vllm/model_executor/parallel_utils/communication_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 01036ab5fba2..28433d31f56a 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -9,7 +9,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tensor_model_parallel_group, - is_cupy_nccl_enabled_for_all_reduce, + is_pynccl_enabled_for_all_reduce, ) from vllm.model_executor.parallel_utils.custom_all_reduce import ( custom_all_reduce) @@ -33,7 +33,7 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: out = custom_all_reduce(input_) if out is not None: return out - if is_cupy_nccl_enabled_for_all_reduce(): + if is_pynccl_enabled_for_all_reduce(): # TODO: support multiple parallel groups. 
pynccl_utils.all_reduce(input_) else: From 0a6fab18f01823413546f3d2e1a0dec06ae9e344 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 23:07:18 -0700 Subject: [PATCH 42/54] fix wget install order --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6246a7e40773..7139bff6d5b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,9 @@ RUN ldconfig /usr/local/cuda-12.1/compat/ WORKDIR /workspace +# used for downloading files +RUN apt install -y wget unzip + # install build and runtime dependencies COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ @@ -93,9 +96,6 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose -# used for downloading files -RUN apt install -y wget unzip - #################### TEST IMAGE #################### From a82a97618b6071cb7aeeb1dfba9ad2943ce4f3a5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 23:19:27 -0700 Subject: [PATCH 43/54] rename cupy --> pynccl --- vllm/model_executor/parallel_utils/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index dbec519fa1db..63890d9cd5bd 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -239,7 +239,7 @@ def with_pynccl_for_all_reduce(): _ENABLE_PYNCCL_FOR_ALL_REDUCE = old -def is_cupy_nccl_enabled_for_all_reduce(): +def is_pynccl_enabled_for_all_reduce(): """check if pynccl is enabled for all reduce""" global _ENABLE_PYNCCL_FOR_ALL_REDUCE return _ENABLE_PYNCCL_FOR_ALL_REDUCE From 1c6ec4836244ff6a71137f182fdf1c60af7da97d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 00:09:18 -0700 Subject: [PATCH 44/54] fix so filename and search path --- Dockerfile | 5 +++++ vllm/model_executor/parallel_utils/pynccl.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7139bff6d5b3..379b2581eaf1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -133,3 +133,8 @@ COPY vllm vllm ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER #################### + +# the package has been installed to /usr/local/lib/python3.10/dist-packages +# remove it so that test scripts will not import vllm from current directory +# (which does not have so files) +RUN rm -rf vllm diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 679a9fa03d1a..8232f231de4c 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -21,7 +21,7 @@ # manually load the nccl library _path = os.path.dirname(os.path.abspath(__file__)) -so_file = glob.glob(f"{_path}/../../lib/nvidia/lib/libnccl.so.*")[0] +so_file = glob.glob(f"{_path}/../../lib/nvidia/nccl/lib/libnccl.so.*")[0] nccl = ctypes.CDLL(so_file) # === export types and functions from nccl to Python === From 47ff82ab2e9c2c5e64e5a552da96fb5cbcbd3a38 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 00:38:20 -0700 Subject: [PATCH 45/54] fix dockerfile --- Dockerfile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 379b2581eaf1..1c6cd48f0dbb 100644 --- 
a/Dockerfile +++ b/Dockerfile @@ -95,7 +95,10 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose - +# the package has been installed to /usr/local/lib/python3.10/dist-packages +# remove it so that test scripts will not import vllm from current directory +# (which does not have so files) +RUN rm -rf vllm #################### TEST IMAGE #################### @@ -133,8 +136,3 @@ COPY vllm vllm ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER #################### - -# the package has been installed to /usr/local/lib/python3.10/dist-packages -# remove it so that test scripts will not import vllm from current directory -# (which does not have so files) -RUN rm -rf vllm From b0c15c2478800aa0e5c4a7afbb6be5eaac547151 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 00:58:15 -0700 Subject: [PATCH 46/54] fix dockerfile --- Dockerfile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1c6cd48f0dbb..01c3a19db618 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,11 +94,10 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml -RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose -# the package has been installed to /usr/local/lib/python3.10/dist-packages -# remove it so that test scripts will not import vllm from current directory -# (which does not have so files) -RUN rm -rf vllm +# the package will be installed to /usr/local/lib/python3.10/dist-packages +# *.so files will only be there, rather than in the source directory +# use `-e` to install in editable mode, so that directory links to the source +RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install -e . --verbose #################### TEST IMAGE #################### From 0b4f7dddbd26d4efbdf7e0b97de275b59a60bdf2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:18:44 -0700 Subject: [PATCH 47/54] download and use manifest in to force keeping .so file --- MANIFEST.in | 1 + setup.py | 103 ++++++++++++++++++++++++---------------------------- 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index aa16da6500e6..677fa19721fc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,3 +4,4 @@ include CMakeLists.txt recursive-include cmake * recursive-include csrc * +recursive-include vllm/lib * diff --git a/setup.py b/setup.py index 7a4db9ac2e00..79cfc107c084 100644 --- a/setup.py +++ b/setup.py @@ -190,64 +190,56 @@ def _is_neuron() -> bool: return torch_neuronx_installed -class CustomInstallCommand(install): +def _install_punica() -> bool: + return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) - def run(self): - # Call the standard install process first - install.run(self) - - if not _is_cuda(): - return - - # tricky part, nccl 2.19 has a bug that increased memory overhead - # of cudagraph. 
However, pytorch has binary dependencies on nccl 2.19, - # simply `pip install nvidia-nccl-cu12==2.18.3` will break pytorch, - # so we have to manually download nccl 2.18 and keep the library to - # a secrect place - - # Define the URL of the file and the directory to unzip to - file_url = ( - 'https://files.pythonhosted.org/packages/44/6e/' - '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' - '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') - - logger.info('Installing NVIDIA NCCL library...') - - # `self.install_lib` is something like /path/to/python/site-packages/ - target_dir = self.install_lib + "vllm/lib/" - # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. the - # temporary directory where the package is being built - temp_dir = self.root - local_zip_path = ( - f"{temp_dir}/" - "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") - # check if the target directory exists - if not os.path.exists(target_dir): - logger.info(f'Creating target directory {target_dir} ...') - os.makedirs(target_dir) - # Check if the file is already downloaded - if os.path.exists(target_dir + "nvidia"): - logger.info('library already exists.') - return - if not os.path.exists(local_zip_path): - # Download the file - logger.info('Downloading file...') - os.system(f"wget {file_url} -q -P {temp_dir}/") - # Unzip the file - logger.info('Unzipping file...') - with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") - os.remove(local_zip_path) - # Move the unzipped files to the target directory - logger.info('Moving files...') - os.system(f"mv {temp_dir}/nvidia {target_dir}") - so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" - os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) +if _is_cuda(): -def _install_punica() -> bool: - return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) + # tricky part, nccl 2.19 has a bug that increased memory overhead + # of cudagraph. However, pytorch has binary dependencies on nccl 2.19, + # simply `pip install nvidia-nccl-cu12==2.18.3` will break pytorch, + # so we have to manually download nccl 2.18 and keep the library to + # a secrect place + + # Define the URL of the file and the directory to unzip to + file_url = ( + 'https://files.pythonhosted.org/packages/44/6e/' + '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' + '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') + + logger.info('Installing NVIDIA NCCL library...') + + target_dir = os.path.dirname(os.path.abspath(__file__)) + "/vllm/lib/" + # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. 
the + # temporary directory where the package is being built + temp_dir = self.root + local_zip_path = ( + f"{temp_dir}/" + "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") + # check if the target directory exists + if not os.path.exists(target_dir): + logger.info(f'Creating target directory {target_dir} ...') + os.makedirs(target_dir) + # Check if the file is already downloaded + if os.path.exists(target_dir + "nvidia"): + logger.info('library already exists.') + return + if not os.path.exists(local_zip_path): + # Download the file + logger.info('Downloading file...') + os.system(f"wget {file_url} -q -P {temp_dir}/") + # Unzip the file + logger.info('Unzipping file...') + with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") + os.remove(local_zip_path) + # Move the unzipped files to the target directory + logger.info('Moving files...') + os.system(f"mv {temp_dir}/nvidia {target_dir}") + so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" + os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) def get_hipcc_rocm_version(): @@ -426,7 +418,6 @@ def get_requirements() -> List[str]: ext_modules=ext_modules, cmdclass={ "build_ext": cmake_build_ext if not _is_neuron() else build_ext, - "install": CustomInstallCommand, }, package_data=package_data, ) From 7942050a198361ee10d16353bd2fefe92a0776e2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:21:31 -0700 Subject: [PATCH 48/54] download and use manifest in to force keeping .so file --- setup.py | 64 ++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/setup.py b/setup.py index 79cfc107c084..06dfeee0d262 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,13 @@ from packaging.version import parse, Version from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext -from setuptools.command.install import install from shutil import which import torch from torch.utils.cpp_extension import CUDA_HOME import zipfile import shutil import logging +import tempfile logger = logging.getLogger(__name__) @@ -203,43 +203,39 @@ def _install_punica() -> bool: # a secrect place # Define the URL of the file and the directory to unzip to - file_url = ( - 'https://files.pythonhosted.org/packages/44/6e/' - '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' - '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') + file_url = ('https://files.pythonhosted.org/packages/44/6e/' + '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' + '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') logger.info('Installing NVIDIA NCCL library...') target_dir = os.path.dirname(os.path.abspath(__file__)) + "/vllm/lib/" - # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. 
the - # temporary directory where the package is being built - temp_dir = self.root - local_zip_path = ( - f"{temp_dir}/" - "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") - # check if the target directory exists - if not os.path.exists(target_dir): - logger.info(f'Creating target directory {target_dir} ...') - os.makedirs(target_dir) - # Check if the file is already downloaded - if os.path.exists(target_dir + "nvidia"): - logger.info('library already exists.') - return - if not os.path.exists(local_zip_path): - # Download the file - logger.info('Downloading file...') - os.system(f"wget {file_url} -q -P {temp_dir}/") - # Unzip the file - logger.info('Unzipping file...') - with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") - os.remove(local_zip_path) - # Move the unzipped files to the target directory - logger.info('Moving files...') - os.system(f"mv {temp_dir}/nvidia {target_dir}") - so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" - os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) + with tempfile.TemporaryDirectory() as temp_dir: + local_zip_path = ( + f"{temp_dir}/" + "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") + # check if the target directory exists + if not os.path.exists(target_dir): + logger.info(f'Creating target directory {target_dir} ...') + os.makedirs(target_dir) + # Check if the file is already downloaded + if os.path.exists(target_dir + "nvidia"): + logger.info('library already exists.') + else: + # Download the file + logger.info('Downloading file...') + os.system(f"wget {file_url} -q -P {temp_dir}/") + # Unzip the file + logger.info('Unzipping file...') + with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") + os.remove(local_zip_path) + # Move the unzipped files to the target directory + logger.info('Moving files...') + os.system(f"mv {temp_dir}/nvidia {target_dir}") + so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" + os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) def get_hipcc_rocm_version(): From 20a3ec487089b0fd984f8a30717fa3ceca2fc111 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:24:08 -0700 Subject: [PATCH 49/54] restore dockerfile --- Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 01c3a19db618..d78ddd25ccf7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,10 +94,7 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml -# the package will be installed to /usr/local/lib/python3.10/dist-packages -# *.so files will only be there, rather than in the source directory -# use `-e` to install in editable mode, so that directory links to the source -RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install -e . --verbose +RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose #################### TEST IMAGE #################### From 0ca27b7403a540949f7c0c8c8cb787bcaad97fe7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:29:37 -0700 Subject: [PATCH 50/54] add lib file to package data --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 06dfeee0d262..2fbf579915c1 100644 --- a/setup.py +++ b/setup.py @@ -380,7 +380,8 @@ def get_requirements() -> List[str]: ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { - "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] + "vllm": + ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "lib/*"] } if os.environ.get("VLLM_USE_PRECOMPILED"): package_data["vllm"].append("*.so") From a3c2340ae36ce8ee782691d30111377eaf7ae6ce Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:47:34 -0700 Subject: [PATCH 51/54] add libnccl.so.2.18.3 via hard-coding --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 2fbf579915c1..39338d30f139 100644 --- a/setup.py +++ b/setup.py @@ -380,8 +380,10 @@ def get_requirements() -> List[str]: ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { - "vllm": - ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "lib/*"] + "vllm": [ + "py.typed", "model_executor/layers/fused_moe/configs/*.json", + "lib/nvidia/nccl/lib/libnccl.so.2.18.3" + ] } if os.environ.get("VLLM_USE_PRECOMPILED"): package_data["vllm"].append("*.so") From 71e29762b860d84c27b7a3953bb7e90dd5bda3f5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Mar 2024 09:43:40 -0700 Subject: [PATCH 52/54] enable VLLM_NCCL_SO_PATH at runtime --- vllm/model_executor/parallel_utils/pynccl.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 8232f231de4c..9f0aaf5f9321 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -18,10 +18,20 @@ import datetime import os import glob +import logging + +logger = logging.getLogger(__name__) + +so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") # manually load the nccl library -_path = os.path.dirname(os.path.abspath(__file__)) -so_file = glob.glob(f"{_path}/../../lib/nvidia/nccl/lib/libnccl.so.*")[0] +if so_file: + logger.info( + f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}") +else: + _path = os.path.dirname(os.path.abspath(__file__)) + so_file = glob.glob(f"{_path}/../../lib/nvidia/nccl/lib/libnccl.so.*")[0] + logger.info(f"Loading nccl from vLLM builtin file {so_file}") nccl = ctypes.CDLL(so_file) # === export types and functions from nccl to Python === From 3d9332a938931522088aacca945b1063ad2424db Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Mar 2024 09:46:42 -0700 Subject: [PATCH 53/54] nit, os.makedirs(target_dir, exist_ok=True) --- setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 39338d30f139..07d08be5fbb2 100644 --- a/setup.py +++ b/setup.py @@ -214,10 +214,8 @@ def _install_punica() -> bool: local_zip_path = ( f"{temp_dir}/" "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") - # check if the target directory exists - if not os.path.exists(target_dir): - logger.info(f'Creating target directory {target_dir} ...') - os.makedirs(target_dir) + # make sure the target directory exists + 
os.makedirs(target_dir, exist_ok=True) # Check if the file is already downloaded if os.path.exists(target_dir + "nvidia"): logger.info('library already exists.') From 76f46f65401d6fb897ce388fe92e219fbfd6656b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Mar 2024 12:34:12 -0700 Subject: [PATCH 54/54] upgrade to pt 2.2.1 --- .github/workflows/publish.yml | 2 +- CMakeLists.txt | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d88f646636ba..2db687a287ef 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. cuda-version: ['11.8', '12.1'] steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index 29d11f9a04d6..be3dc520e43f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.2.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1") set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") diff --git a/pyproject.toml b/pyproject.toml index 05fbfbb89d27..509c2a630b4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.0", + "torch == 2.2.1", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index a0a6aef8bb16..2bc07fb152aa 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -3,5 +3,5 @@ cmake>=3.21 ninja packaging setuptools>=49.4.0 -torch==2.2.0 +torch==2.2.1 wheel diff --git a/requirements.txt b/requirements.txt index d8fd1986ecb8..57996f5cc231 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,8 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.0 -xformers == 0.0.24 # Requires PyTorch 2.2.0. +torch == 2.2.1 +xformers == 0.0.25 # Requires PyTorch 2.2.1. transformers >= 4.39.0 # Required for StarCoder2. fastapi uvicorn[standard]
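
A quick consistency check for the `torch==2.2.1` / `xformers==0.0.25` pins from PATCH 54/54, as a hedged sketch (it assumes both packages are already installed in the environment being checked; it is not part of the patch series):

    # Hypothetical post-install check, not part of the patches.
    import torch
    import xformers

    assert torch.__version__.startswith("2.2.1"), torch.__version__  # e.g. "2.2.1+cu121"
    assert xformers.__version__.startswith("0.0.25"), xformers.__version__

Keeping the two in lockstep matters because, per the pin comment in requirements.txt, xformers 0.0.25 requires PyTorch 2.2.1.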
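
A usage note on the `VLLM_NCCL_SO_PATH` override added in PATCH 52/54: pynccl.py now prefers the path given in that environment variable and only falls back to the `libnccl.so.*` that setup.py bundles under `vllm/lib/`. The sketch below is a hypothetical standalone check, not code from the series, that mirrors this lookup order and reports which library would be picked up; the `ncclGetVersion` call and the example override path are illustrative assumptions.

    # Minimal sketch: assumes vllm is installed with the bundled NCCL, or that
    # VLLM_NCCL_SO_PATH points at a readable libnccl.so on this machine.
    import ctypes
    import glob
    import os

    # 1. An explicit override wins, e.g.
    #    VLLM_NCCL_SO_PATH=/usr/lib/x86_64-linux-gnu/libnccl.so.2 (hypothetical path).
    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")

    # 2. Otherwise fall back to the copy that setup.py unpacks under vllm/lib/.
    if not so_file:
        import vllm
        pkg_dir = os.path.dirname(os.path.abspath(vllm.__file__))
        matches = glob.glob(f"{pkg_dir}/lib/nvidia/nccl/lib/libnccl.so.*")
        if not matches:
            raise RuntimeError("no bundled libnccl.so.* found under vllm/lib/")
        so_file = matches[0]

    nccl = ctypes.CDLL(so_file)
    version = ctypes.c_int()
    # ncclGetVersion(int*) is part of the public NCCL C API;
    # 21803 corresponds to the bundled 2.18.3.
    nccl.ncclGetVersion(ctypes.byref(version))
    print(f"would load {so_file} (NCCL version code {version.value})")

Deployments that ship their own NCCL can therefore set the variable before starting vLLM and the bundled copy is ignored; leaving it unset keeps the behavior of the earlier patches, where only the bundled 2.18.3 library is used.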