From 1a7d82642b65a563c1f47e5097e5c0fb764e0c21 Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Wed, 7 Feb 2024 14:33:50 +0000 Subject: [PATCH 01/54] Upgrade to `torch==2.2.0` - `torch==2.1.1` -> `torch==2.2.0` - `xformers==0.0.23.post1` -> `xformers==0.0.24` - ROCM not updated because no `torch==2.2.0` containers have been published yet --- .github/workflows/publish.yml | 2 +- Dockerfile.rocm | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements-dev.txt | 1 + requirements.txt | 4 ++-- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5211dc180798..d88f646636ba 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. cuda-version: ['11.8', '12.1'] steps: diff --git a/Dockerfile.rocm b/Dockerfile.rocm index f49b321372ed..1ae9e9527820 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -65,7 +65,7 @@ RUN mkdir libs \ COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps +RUN python3 -m pip install xformers==0.0.24 --no-deps # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue diff --git a/pyproject.toml b/pyproject.toml index b197256f6ff5..e430f703154f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.1.2", + "torch == 2.2.0", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index 7e7e48a1313e..fdc6dcd503fb 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.1.2 +torch==2.2.0 wheel \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index f8126008d079..b764313487e7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,6 +15,7 @@ pytest-forked pytest-asyncio httpx einops # required for MPT +wheel # required for flash_attn flash_attn # required for HuggingFace's llama implementation openai requests diff --git a/requirements.txt b/requirements.txt index 5684b2c29634..dc5c939f2d36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.1.2 +torch == 2.2.0 transformers >= 4.37.0 # Required for Qwen2 -xformers == 0.0.23.post1 # Required for CUDA 12.1. +xformers == 0.0.24 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. 
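Patch 01 moves the `torch` and `xformers` pins together across the CI matrix, `pyproject.toml`, and both requirements files; the two pins must stay in lockstep because each xformers wheel is built against one specific PyTorch release. A minimal sketch (not part of the patches) for sanity-checking an installed environment against the new pins, assuming both packages expose `__version__`:

```python
# Illustrative check only: confirm the environment matches the pins from this
# patch (torch 2.2.0, xformers 0.0.24) before building or testing vLLM.
import torch
import xformers

assert torch.__version__.startswith("2.2.0"), torch.__version__
assert xformers.__version__.startswith("0.0.24"), xformers.__version__
print("torch", torch.__version__, "| xformers", xformers.__version__)
```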
From 7de363fdcae21891392a113ac16c797f66d862d8 Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Wed, 7 Feb 2024 15:28:19 +0000 Subject: [PATCH 02/54] Remove `wheel` from `requirements-dev.txt` --- requirements-dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index b764313487e7..f8126008d079 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,7 +15,6 @@ pytest-forked pytest-asyncio httpx einops # required for MPT -wheel # required for flash_attn flash_attn # required for HuggingFace's llama implementation openai requests From 9bc921d2cc6484297b262487ea42e1524a74e9ba Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Mon, 12 Feb 2024 12:49:47 +0000 Subject: [PATCH 03/54] Revert change to `Dockerfile.rocm` --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 1ae9e9527820..f49b321372ed 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -65,7 +65,7 @@ RUN mkdir libs \ COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.24 --no-deps +RUN python3 -m pip install xformers==0.0.23 --no-deps # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue From 76ab3e7274b4641a503fcfefdb04281b26736531 Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Thu, 15 Feb 2024 10:59:45 +0000 Subject: [PATCH 04/54] Kick CI From 922aa0c8f21c03b26fd058ac89f3d051e444da73 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 22 Feb 2024 08:46:52 +0000 Subject: [PATCH 05/54] Update requirements.txt Co-authored-by: Woosuk Kwon --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 264a63e85ec7..594d1d6fbb1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ sentencepiece # Required for LLaMA tokenizer. numpy torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for CUDA 12.1. +xformers == 0.0.24 # Required for PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From daca4e15560b743901d97a1dfffeb58134338a9f Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Mon, 4 Mar 2024 14:01:17 +0000 Subject: [PATCH 06/54] Update to 2.2.1 --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d88f646636ba..2db687a287ef 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. 
cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index f2d39237af0f..508c58f42898 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.0", + "torch == 2.2.1", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index fdc6dcd503fb..18cfb5894d18 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.0 +torch==2.2.1 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 60d58d1c3a19..03f63f2fe97b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.0 +torch == 2.2.1 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for PyTorch 2.2.0. +xformers == 0.0.24 # Required for PyTorch 2.2.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From 015b7d4dfde87d6a1cbf5652d0a90a3d5eb6826e Mon Sep 17 00:00:00 2001 From: Harry Mellor Date: Mon, 4 Mar 2024 15:21:04 +0000 Subject: [PATCH 07/54] Revert "Update to 2.2.1" This reverts commit daca4e15560b743901d97a1dfffeb58134338a9f. --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 2db687a287ef..d88f646636ba 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index 508c58f42898..f2d39237af0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.1", + "torch == 2.2.0", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index 18cfb5894d18..fdc6dcd503fb 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.1 +torch==2.2.0 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 03f63f2fe97b..60d58d1c3a19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.1 +torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for PyTorch 2.2.1. +xformers == 0.0.24 # Required for PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From 75f05de27b0a25df106b9850cc78075de1df4e42 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 15 Mar 2024 20:57:28 +0000 Subject: [PATCH 08/54] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6a30bdbbfc41..16f369365079 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ sentencepiece # Required for LLaMA tokenizer. 
numpy torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Required for PyTorch 2.2.0. +xformers == 0.0.24 # Requires PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From e82cf3a5a1147ecba9b4aeeedec7a58039747fa1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 15 Mar 2024 18:38:05 -0700 Subject: [PATCH 09/54] try to test one distributed at a time --- .buildkite/test-pipeline.yaml | 9 +++++++-- .../test_basic_distributed_correctness.py | 16 +++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8badc16d0cb7..8810dda8d826 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -22,8 +22,13 @@ steps: working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. -- label: Distributed Correctness Test - command: pytest -v -s --forked test_basic_distributed_correctness.py +- label: Distributed Correctness Test-facebook/opt-125m + command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py + working_dir: "/vllm-workspace/tests/distributed" + num_gpus: 2 # only support 1 or 2 for now. + +- label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf + command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 82075356fccb..75d6a84adfc7 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -1,13 +1,23 @@ """Compare the outputs of HF and distributed vLLM when using greedy sampling. -Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`. +vLLM will allocate all the available memory, so we need to run the tests one +by one. The solution is to pass arguments (model name) by environment +variables. +Run: + +```sh +TEST_DIST_MODEL=facebook/opt-125m pytest \ + test_basic_distributed_correctness.py +TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ + test_basic_distributed_correctness.py +``` """ +import os import pytest import torch MODELS = [ - "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + os.environ["TEST_DIST_MODEL"], ] From 4accd02f6f14fd8a09c387f0ff0a1f7c3c4df9e1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Mar 2024 00:54:11 -0700 Subject: [PATCH 10/54] try pytorch 2.2.1 --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d88f646636ba..2db687a287ef 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. 
cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index c0645fd09e7f..c812af904b89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.0", + "torch == 2.2.1", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index fdc6dcd503fb..18cfb5894d18 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.0 +torch==2.2.1 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 16f369365079..12062276d12e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.0 +torch == 2.2.1 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.24 # Requires PyTorch 2.2.0. +xformers == 0.0.25 # Requires PyTorch 2.2.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From a92346f0e7897e37f994822d1516e1c71a76a347 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Mar 2024 23:38:14 -0700 Subject: [PATCH 11/54] try to fix test --- .buildkite/test-pipeline.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8810dda8d826..e313226e08dd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,7 +28,11 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf - command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py + command: + - export TORCH_FILE=$(python -c "import torch; print(torch.__file__)") + - export NCCL_DIR=$(echo $TORCH_FILE | sed 's|torch/__init__.py|nvidia/nccl|') + - rm -rf $NCCL_DIR || true # remove the directory if it exists + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. From e7f215b08361a52bc7d960c5c65e7ac627ebd087 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Mar 2024 23:57:44 -0700 Subject: [PATCH 12/54] use pip install to resolve the problem --- .buildkite/test-pipeline.yaml | 6 +----- requirements.txt | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e313226e08dd..8810dda8d826 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,11 +28,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf - command: - - export TORCH_FILE=$(python -c "import torch; print(torch.__file__)") - - export NCCL_DIR=$(echo $TORCH_FILE | sed 's|torch/__init__.py|nvidia/nccl|') - - rm -rf $NCCL_DIR || true # remove the directory if it exists - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py + command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. 
diff --git a/requirements.txt b/requirements.txt index 12062276d12e..bb51db100e5c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. +nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file From f99fe2ac75d4b4c0a47966cb17566f5ca766b925 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 01:29:03 -0700 Subject: [PATCH 13/54] remove nccl version to test --- .buildkite/test-pipeline.yaml | 4 +++- requirements.txt | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8810dda8d826..eb4c702e9108 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -23,7 +23,9 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-facebook/opt-125m - command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py + command: + - pip install nvidia-nccl-cu12==2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph + - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. diff --git a/requirements.txt b/requirements.txt index bb51db100e5c..12062276d12e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,3 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. -nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file From 0f3181f434c262ee7934f3171bc8257b6cc47c00 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 07:16:30 -0700 Subject: [PATCH 14/54] move to Dockerfile --- .buildkite/test-pipeline.yaml | 4 +--- Dockerfile | 4 ++-- requirements.txt | 1 + 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index eb4c702e9108..8810dda8d826 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -23,9 +23,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Distributed Correctness Test-facebook/opt-125m - command: - - pip install nvidia-nccl-cu12==2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph - - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py + command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. 
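The back-and-forth in patches 12-16 is ultimately about which `libnccl` build gets loaded at runtime: pinning `nvidia-nccl-cu12` in `requirements.txt` or installing it in the test pipeline can be silently undone by pip's dependency resolution for `torch`. A small diagnostic sketch (not part of the patches, Linux-only) that could confirm the NCCL version actually in use inside a worker:

```python
# Illustrative diagnostic only: show the NCCL version torch was built against
# and which libnccl shared objects are mapped into this process.
import torch

print("torch reports NCCL:", torch.cuda.nccl.version())  # e.g. (2, 18, 3)

with open("/proc/self/maps") as f:  # Linux-only
    paths = {line.split()[-1] for line in f if "libnccl" in line}
for path in sorted(paths):
    print("mapped:", path)
```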
diff --git a/Dockerfile b/Dockerfile index 8be03b3567f0..ff1629f02a8d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt + pip install -r requirements.txt --force-reinstall # install development dependencies COPY requirements-dev.txt requirements-dev.txt @@ -106,7 +106,7 @@ RUN apt-get update -y \ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt + pip install -r requirements.txt --force-reinstall # Install flash attention (from pre-built wheel) RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ diff --git a/requirements.txt b/requirements.txt index 12062276d12e..e0d80463a8ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. +nvidia-nccl-cu12 == 2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file From 6ef38435c567fbba4570bcc8ae181c4a5f9304f5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 07:18:27 -0700 Subject: [PATCH 15/54] fix version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e0d80463a8ae..1e5c3f911e21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. 
-nvidia-nccl-cu12 == 2.19.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph \ No newline at end of file +nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph From 7db0e1b4a87deeef29861a090f42522099bb7575 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 07:29:51 -0700 Subject: [PATCH 16/54] use docerfile --- Dockerfile | 16 ++++++++++++++-- requirements.txt | 1 - 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index ff1629f02a8d..1ae333da5dab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,13 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt --force-reinstall + pip install -r requirements.txt +# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph +# so we use 2.18.3 +# and we are in a dependency hell with torch and cupy +# have to manually install to downgrade nccl version +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install nvidia-nccl-cu12==2.18.3 # install development dependencies COPY requirements-dev.txt requirements-dev.txt @@ -106,7 +112,13 @@ RUN apt-get update -y \ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt --force-reinstall + pip install -r requirements.txt +# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph +# so we use 2.18.3 +# and we are in a dependency hell with torch and cupy +# have to manually install to downgrade nccl version +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install nvidia-nccl-cu12==2.18.3 # Install flash attention (from pre-built wheel) RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ diff --git a/requirements.txt b/requirements.txt index 1e5c3f911e21..12062276d12e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,3 @@ pynvml == 11.5.0 triton >= 2.1.0 outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. -nvidia-nccl-cu12 == 2.18.3 # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph From 62650ae6fd02fa37a7130a7a700b082a3ba6f2aa Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 08:05:46 -0700 Subject: [PATCH 17/54] try 2.2.0 first --- .github/workflows/publish.yml | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 2db687a287ef..d88f646636ba 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. 
cuda-version: ['11.8', '12.1'] steps: diff --git a/pyproject.toml b/pyproject.toml index c812af904b89..c0645fd09e7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.1", + "torch == 2.2.0", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index 18cfb5894d18..fdc6dcd503fb 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -2,5 +2,5 @@ ninja packaging setuptools>=49.4.0 -torch==2.2.1 +torch==2.2.0 wheel \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 12062276d12e..16f369365079 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.1 +torch == 2.2.0 transformers >= 4.38.0 # Required for Gemma. -xformers == 0.0.25 # Requires PyTorch 2.2.1. +xformers == 0.0.24 # Requires PyTorch 2.2.0. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. From 4ed16b9dbf1aca8c20c620d1bc801a5b7f3e2a7c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 09:20:20 -0700 Subject: [PATCH 18/54] place nccl install after vllm --- Dockerfile | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1ae333da5dab..179758ba0b71 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,12 +19,6 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt -# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph -# so we use 2.18.3 -# and we are in a dependency hell with torch and cupy -# have to manually install to downgrade nccl version -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install nvidia-nccl-cu12==2.18.3 # install development dependencies COPY requirements-dev.txt requirements-dev.txt @@ -96,6 +90,13 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose +# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph +# so we use 2.18.3 +# and we are in a dependency hell with torch and cupy +# have to manually install to downgrade nccl version +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install nvidia-nccl-cu12==2.18.3 + #################### TEST IMAGE #################### @@ -113,6 +114,10 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt + +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph # so we use 2.18.3 # and we are in a dependency hell with torch and cupy @@ -120,10 +125,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \ pip install nvidia-nccl-cu12==2.18.3 -# Install flash attention (from pre-built wheel) -RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ - pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir - #################### RUNTIME BASE IMAGE #################### From 2d215dfac817f9d3669daf09791e00210d00ac7a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 10:57:36 -0700 Subject: [PATCH 19/54] patchelf --- Dockerfile | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 179758ba0b71..f579e4fa0ae5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,13 +90,15 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph -# so we use 2.18.3 -# and we are in a dependency hell with torch and cupy -# have to manually install to downgrade nccl version -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install nvidia-nccl-cu12==2.18.3 - +# install nccl 2.16.2 +RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl +RUN apt install patchelf -y +# rename to 2.16 +RUN cp /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2 /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2.16 +# rename pytorch-dependent nccl to 2.19 +RUN cp /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2 /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2.19 +RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.19 /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so +RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so #################### TEST IMAGE #################### @@ -118,12 +120,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # Install flash attention (from pre-built wheel) RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir -# important, cupy-cuda12x with 2.19 leads to much larger memory overhead with cudagraph -# so we use 2.18.3 -# and we are in a dependency hell with torch and cupy -# have to manually install to downgrade nccl version -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install nvidia-nccl-cu12==2.18.3 #################### RUNTIME BASE IMAGE #################### From 0f6f24376158c9b4d1e3e93e11cc3e7ce8790a00 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Mar 2024 12:56:23 -0700 Subject: [PATCH 20/54] update rpath for cupy --- Dockerfile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index f579e4fa0ae5..cf688294371c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,15 +90,25 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# install nccl 2.16.2 -RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl + +# tricky part, manipulate so files to use the correct nccl version RUN apt install patchelf -y + +# cupy part +# nccl 2.16.2 that is good for cupy +RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl # rename to 2.16 RUN cp /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2 /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2.16 +# link to the correct nccl version +RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so +# set rpath to the correct nccl version +RUN patchelf --set-rpath /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/ /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so + +# pytorch part # rename pytorch-dependent nccl to 2.19 RUN cp /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2 /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2.19 +# link to a more specific nccl version RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.19 /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so -RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so #################### TEST IMAGE #################### From da1df5e24bef5b910b8132853109d30fe64bf378 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 15:47:10 -0700 Subject: [PATCH 21/54] try to write a custom pynccl --- Dockerfile | 25 +- .../parallel_utils/cupy_utils.py | 65 ++++-- vllm/model_executor/parallel_utils/pynccl.py | 217 ++++++++++++++++++ 3 files changed, 272 insertions(+), 35 deletions(-) create mode 100644 vllm/model_executor/parallel_utils/pynccl.py diff --git a/Dockerfile b/Dockerfile index cf688294371c..9d8e357a781e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,24 +91,13 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# tricky part, manipulate so files to use the correct nccl version -RUN apt install patchelf -y - -# cupy part -# nccl 2.16.2 that is good for cupy -RUN python3 -m cupyx.tools.install_library --cuda 12.x --library nccl -# rename to 2.16 -RUN cp /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2 /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/libnccl.so.2.16 -# link to the correct nccl version -RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.16 /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so -# set rpath to the correct nccl version -RUN patchelf --set-rpath /root/.cupy/cuda_lib/12.x/nccl/2.16.2/lib/ /usr/local/lib/python3.10/dist-packages/cupy_backends/cuda/libs/nccl.cpython-310-x86_64-linux-gnu.so - -# pytorch part -# rename pytorch-dependent nccl to 2.19 -RUN cp /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2 /usr/local/lib/python3.10/dist-packages/nvidia/nccl/lib/libnccl.so.2.19 -# link to a more specific nccl version -RUN patchelf --replace-needed libnccl.so.2 libnccl.so.2.19 /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so +# tricky part, only this version nccl is good +RUN wget https://developer.download.nvidia.com/compute/redist/nccl/v2.16.2/nccl_2.16.2-1+cuda12.0_x86_64.txz +RUN tar -xvf nccl_2.16.2-1+cuda12.0_x86_64.txz +RUN cp ./nccl_2.16.2-1+cuda12.0_x86_64/lib/libnccl.so.2.16.2 ./libnccl.so.2.16.2 +RUN rm -rf ./nccl_2.16.2-1+cuda12.0_x86_64 +RUN rm nccl_2.16.2-1+cuda12.0_x86_64.txz + #################### TEST IMAGE #################### diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index f8cffc01e3c3..f58ce6c4bdba 100644 --- a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -12,10 +12,15 @@ import torch from torch.distributed import ReduceOp + try: - import cupy - from cupy.cuda import nccl - from cupyx.distributed import NCCLBackend + # import cupy + # from cupy.cuda import nccl + # from cupyx.distributed import NCCLBackend + from .pynccl import NCCLCommunicator, ncclGetVersion + print(f"nccl version {ncclGetVersion()}") + comm: NCCLCommunicator = None + except ImportError as e: cupy = e nccl = None @@ -32,7 +37,7 @@ class NCCLBackend: } -class NCCLBackendWithBFloat16(NCCLBackend): +class NCCLBackendWithBFloat16: # This is enough to add bfloat16 support for most operations, # but broadcast will fail (will require changes in compiled # cupy code). @@ -55,12 +60,18 @@ def barrier(self) -> None: def is_initialized() -> bool: """Returns whether the NCCL backend is initialized.""" - return _NCCL_BACKEND is not None + return comm is not None @contextlib.contextmanager def set_cupy_stream(stream: torch.cuda.Stream): """Set the cuda stream for communication""" + try: + comm.stream = stream + yield + finally: + pass + return cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream, stream.device_index) with cupy_stream: @@ -74,6 +85,9 @@ def init_process_group(world_size: int, rank: int, host: str, # TODO: handle NCCL timeouts. 
""" assert not is_initialized() + global comm + comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", world_size=world_size, rank=rank) + return if isinstance(cupy, Exception): raise ImportError( @@ -99,21 +113,36 @@ def init_process_group(world_size: int, rank: int, host: str, def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: """All-reduces the input tensor across the process group.""" assert input_.is_cuda, f"{input_} should be a cuda tensor" - # Hack to support bfloat16 - torch_dtype = input_.dtype - if torch_dtype is torch.bfloat16: - # We need to view as float16, otherwise - # cupy will fail. This will not change - # the underlying data. - input_ = input_.view(torch.float16) - cupy_input = cupy.asarray(input_) - cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access - _NCCL_BACKEND.all_reduce(in_array=cupy_input, - out_array=cupy_input, - op=_OP_MAPPING[op]) + free_bytes = torch.cuda.mem_get_info()[0] + # # Hack to support bfloat16 + # torch_dtype = input_.dtype + # if torch_dtype is torch.bfloat16: + # # We need to view as float16, otherwise + # # cupy will fail. This will not change + # # the underlying data. + # input_ = input_.view(torch.float16) + # cupy_input = cupy.asarray(input_) + # cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access + # _NCCL_BACKEND.all_reduce(in_array=cupy_input, + # out_array=cupy_input, + # op=_OP_MAPPING[op]) + comm.all_reduce(input_, op) + + import os + env_name = os.environ['CONDA_DEFAULT_ENV'] + dir_name = f"/home/gcpuser/vllm/{env_name}-process-{os.getpid()}" + with open(f"{dir_name}.txt", "a") as f: + f.write(f"{free_bytes=} before allreduce\n") + free_bytes_after = torch.cuda.mem_get_info()[0] + f.write(f"{free_bytes_after=} after allreduce\n") + f.write(f"memory cost during allreduce: {(free_bytes - free_bytes_after) / 1024 / 1024} MiB\n") def destroy_process_group() -> None: + global comm + del comm + comm = None + return """Destroys the NCCL backend.""" global _NCCL_BACKEND global _WORLD_SIZE @@ -123,8 +152,10 @@ def destroy_process_group() -> None: def get_world_size() -> int: """Returns the world size.""" + return comm.world_size return _WORLD_SIZE def get_nccl_backend(): + return comm return _NCCL_BACKEND diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py new file mode 100644 index 000000000000..c6f8e6e6840c --- /dev/null +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -0,0 +1,217 @@ +# ===================== pynccl.py ===================== +# This file is a pure Python wrapper for the NCCL library. +# Copyright (c) 2024 vLLM team +# Author: Kaichao You +# Email: youkaichao@gmail.com +# All rights reserved. 
+# ==================================================== + + +# ===================== import region ===================== +import torch +import ctypes +import torch.distributed as dist +from torch.distributed import ReduceOp +import datetime + +# manually load the nccl library +nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") +# use `pip install nvidia-nccl-cu12==2.16.2` to install from pypi +# then you can use the following line to load the library +# and they cause increased memory overhead +# nccl = ctypes.CDLL("/opt/conda/envs/${CONDA_ENV}/lib/python3.9/site-packages/nvidia/nccl/lib/libnccl.so.2") + +# ===================== declare types and functions ===================== + +ncclResult_t = ctypes.c_int + +# equivalent to c declaration: +# ncclResult_t ncclGetVersion(int *version); +_c_ncclGetVersion = nccl.ncclGetVersion +_c_ncclGetVersion.restype = ctypes.c_int +_c_ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)] + +def ncclGetVersion() -> int: + version = ctypes.c_int() + result = _c_ncclGetVersion(ctypes.byref(version)) + assert result == 0 + # something like 21903 --> "2.19.3" + version_str = str(version.value) + major = version_str[0].lstrip("0") + minor = version_str[1:3].lstrip("0") + patch = version_str[3:].lstrip("0") + return f"{major}.{minor}.{patch}" + +class NcclUniqueId(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + +# equivalent to c declaration: +# ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +_c_ncclGetUniqueId = nccl.ncclGetUniqueId +_c_ncclGetUniqueId.restype = ctypes.c_int +_c_ncclGetUniqueId.argtypes = [ctypes.POINTER(NcclUniqueId)] +def ncclGetUniqueId() -> NcclUniqueId: + unique_id = NcclUniqueId() + result = _c_ncclGetUniqueId(ctypes.byref(unique_id)) + assert result == 0 + return unique_id + +def test_ncclGetUniqueId(): + unique_id = ncclGetUniqueId() + # print something like: + # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + print(list(unique_id.internal)) + + +# equivalent to c declaration: +# ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +# note that ncclComm_t is a pointer type, so the first argument is a pointer to a pointer +_c_ncclCommInitRank = nccl.ncclCommInitRank +_c_ncclCommInitRank.restype = ctypes.c_int +_c_ncclCommInitRank.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int] + +# enums +class ncclDataType_t(ctypes.c_int): + ncclInt8 = 0 + ncclChar = 0 + ncclUint8 = 1 + ncclInt32 = 2 + ncclInt = 2 + ncclUint32 = 3 + ncclInt64 = 4 + ncclUint64 = 5 + ncclFloat16 = 6 + ncclHalf = 6 + ncclFloat32 = 7 + ncclFloat = 7 + ncclFloat64 = 8 + ncclDouble = 8 + ncclBfloat16 = 9 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined + ncclNumTypes = 10 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined + + @classmethod + def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t': + if dtype == torch.int8: + return cls.ncclInt8 + if dtype == torch.uint8: + return cls.ncclUint8 + if dtype == torch.int32: + return cls.ncclInt32 + if dtype == torch.int64: + return cls.ncclInt64 + if dtype == torch.float16: + return cls.ncclFloat16 + if dtype == torch.float32: + return cls.ncclFloat32 + if 
dtype == torch.float64: + return cls.ncclFloat64 + if dtype == torch.bfloat16: + return cls.ncclBfloat16 + raise ValueError(f"Unsupported dtype: {dtype}") + +class ncclRedOp_t(ctypes.c_int): + ncclSum = 0 + ncclProd = 1 + ncclMax = 2 + ncclMin = 3 + ncclAvg = 4 + ncclNumOps = 5 + # ncclMaxRedOp value is based on enum size and int size, here simplified + ncclMaxRedOp = 0x7fffffff + + @classmethod + def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': + if op == ReduceOp.SUM: + return cls.ncclSum + if op == ReduceOp.PRODUCT: + return cls.ncclProd + if op == ReduceOp.MAX: + return cls.ncclMax + if op == ReduceOp.MIN: + return cls.ncclMin + if op == ReduceOp.AVG: + return cls.ncclAvg + raise ValueError(f"Unsupported op: {op}") + + +# equivalent to c declaration: +# ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +# note that cudaStream_t is a pointer type, so the last argument is a pointer +_c_ncclAllReduce = nccl.ncclAllReduce +_c_ncclAllReduce.restype = ctypes.c_int +_c_ncclAllReduce.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t, ncclRedOp_t, ctypes.c_void_p, ctypes.c_void_p] + + +# equivalent to c declaration: +# ncclResult_t ncclCommDestroy(ncclComm_t comm); +_c_ncclCommDestroy = nccl.ncclCommDestroy +_c_ncclCommDestroy.restype = ctypes.c_int +_c_ncclCommDestroy.argtypes = [ctypes.c_void_p] + +class NCCLCommunicator: + def __init__( + self, + backend = None, + init_method = None, + timeout = datetime.timedelta(seconds=10), + world_size: int = -1, + rank: int = -1, + store = None, + group_name: str = "", + pg_options = None, + ): + if not dist.is_initialized(): + backend = backend or "nccl" + assert backend == 'nccl', "only use gloo backend for starting the NCCL communicator" + dist.init_process_group( + backend=backend, + init_method=init_method, + timeout=timeout, + world_size=world_size, + rank=rank, + store=store, + group_name=group_name, + pg_options=pg_options + ) + self.world_size = dist.get_world_size() + self.rank = dist.get_rank() + torch.cuda.set_device(self.rank) + if self.rank == 0: + self.unique_id = ncclGetUniqueId() + else: + self.unique_id = NcclUniqueId() + tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda(self.rank) + dist.broadcast(tensor, src=0) + byte_list = tensor.cpu().tolist() + self.unique_id = NcclUniqueId() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + self.comm = ctypes.c_void_p() + result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size, self.unique_id, self.rank) + assert result == 0 + self.stream = torch.cuda.Stream(device=f"cuda:{self.rank}") + + def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None): + if stream is None: + stream = self.stream + result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()), ctypes.c_void_p(tensor.data_ptr()), tensor.numel(), ncclDataType_t.from_torch(tensor.dtype), ncclRedOp_t.from_torch(op), self.comm, ctypes.c_void_p(stream.cuda_stream)) + assert result == 0 + + def __del__(self): + dist.destroy_process_group() + _c_ncclCommDestroy(self.comm) + +def test_NCCLCommunicator(): + # use `torchrun` to launch the script + # e.g. 
`torchrun --nproc_per_node=2 pynccl.py` + comm = NCCLCommunicator() + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) + comm.all_reduce(tensor) + result = tensor.mean().cpu().item() + assert result == comm.world_size + print(result) + +if __name__ == "__main__": + test_ncclGetUniqueId() + test_NCCLCommunicator() +# ===================== pynccl.py ===================== From b4085a16865dac77c246f3896ec79c1ab1680bea Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 16:05:20 -0700 Subject: [PATCH 22/54] add wget --- .github/ISSUE_TEMPLATE/300-usage.yml | 1 + Dockerfile | 1 + vllm/worker/worker.py | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index 88227b4b2e7b..b5638bca0158 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -12,6 +12,7 @@ body: attributes: label: Your current environment description: | + If you want your issue to be answered quickly, it is **very important** to provide enough information about your environment. Please run the following and paste the output below. ```sh wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py diff --git a/Dockerfile b/Dockerfile index 9d8e357a781e..a8c4c3a8286a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,6 +92,7 @@ RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose # tricky part, only this version nccl is good +RUN apt install -y wget RUN wget https://developer.download.nvidia.com/compute/redist/nccl/v2.16.2/nccl_2.16.2-1+cuda12.0_x86_64.txz RUN tar -xvf nccl_2.16.2-1+cuda12.0_x86_64.txz RUN cp ./nccl_2.16.2-1+cuda12.0_x86_64/lib/libnccl.so.2.16.2 ./libnccl.so.2.16.2 diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 0dcd4018afa5..be7baa009cfb 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -142,6 +142,12 @@ def profile_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() + print( + f"{free_gpu_memory=}, {total_gpu_memory=}, {self.init_gpu_memory=}, {peak_memory=}" + ) + print( + f"{block_size=}, {cache_block_size=}, {num_gpu_blocks=}, {num_cpu_blocks=}" + ) return num_gpu_blocks, num_cpu_blocks def init_cache_engine(self, cache_config: CacheConfig) -> None: From f77c9ae19eba798d9dc8683c1152f2a71357816e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 16:18:00 -0700 Subject: [PATCH 23/54] delete logging code --- .../parallel_utils/cupy_utils.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index f58ce6c4bdba..94e72bf62314 100644 --- a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -113,30 +113,8 @@ def init_process_group(world_size: int, rank: int, host: str, def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: """All-reduces the input tensor across the process group.""" assert input_.is_cuda, f"{input_} should be a cuda tensor" - free_bytes = torch.cuda.mem_get_info()[0] - # # Hack to support bfloat16 - # torch_dtype = input_.dtype - # if torch_dtype is torch.bfloat16: - # # We need to view as float16, otherwise - # # cupy will fail. This will not change - # # the underlying data. 
- # input_ = input_.view(torch.float16) - # cupy_input = cupy.asarray(input_) - # cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access - # _NCCL_BACKEND.all_reduce(in_array=cupy_input, - # out_array=cupy_input, - # op=_OP_MAPPING[op]) comm.all_reduce(input_, op) - import os - env_name = os.environ['CONDA_DEFAULT_ENV'] - dir_name = f"/home/gcpuser/vllm/{env_name}-process-{os.getpid()}" - with open(f"{dir_name}.txt", "a") as f: - f.write(f"{free_bytes=} before allreduce\n") - free_bytes_after = torch.cuda.mem_get_info()[0] - f.write(f"{free_bytes_after=} after allreduce\n") - f.write(f"memory cost during allreduce: {(free_bytes - free_bytes_after) / 1024 / 1024} MiB\n") - def destroy_process_group() -> None: global comm From 2766418b5d57d5d944db8aa02f4d7dcb8e720da1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 16:57:39 -0700 Subject: [PATCH 24/54] remove some debugging print --- .github/ISSUE_TEMPLATE/300-usage.yml | 1 - vllm/worker/worker.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index b5638bca0158..88227b4b2e7b 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -12,7 +12,6 @@ body: attributes: label: Your current environment description: | - If you want your issue to be answered quickly, it is **very important** to provide enough information about your environment. Please run the following and paste the output below. ```sh wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index be7baa009cfb..0dcd4018afa5 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -142,12 +142,6 @@ def profile_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() - print( - f"{free_gpu_memory=}, {total_gpu_memory=}, {self.init_gpu_memory=}, {peak_memory=}" - ) - print( - f"{block_size=}, {cache_block_size=}, {num_gpu_blocks=}, {num_cpu_blocks=}" - ) return num_gpu_blocks, num_cpu_blocks def init_cache_engine(self, cache_config: CacheConfig) -> None: From 0e18aed831218f2acb72f05d3c9028a00ec64a6f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 17:10:51 -0700 Subject: [PATCH 25/54] use nccl 2.18.3 --- Dockerfile | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index a8c4c3a8286a..fb1bef248ae5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,13 +91,16 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -# tricky part, only this version nccl is good -RUN apt install -y wget -RUN wget https://developer.download.nvidia.com/compute/redist/nccl/v2.16.2/nccl_2.16.2-1+cuda12.0_x86_64.txz -RUN tar -xvf nccl_2.16.2-1+cuda12.0_x86_64.txz -RUN cp ./nccl_2.16.2-1+cuda12.0_x86_64/lib/libnccl.so.2.16.2 ./libnccl.so.2.16.2 -RUN rm -rf ./nccl_2.16.2-1+cuda12.0_x86_64 -RUN rm nccl_2.16.2-1+cuda12.0_x86_64.txz +# tricky part, nccl 2.19 has a bug that increased memory overhead of cudagraph +# however, pytorch has binary dependencies on nccl 2.19 +# simply using `pip install nvidia-nccl-cu12==2.18.3` will break pytorch +# so we have to manually download nccl 2.18 and keep the library to a secrect place +RUN apt install -y wget unzip +RUN wget https://files.pythonhosted.org/packages/44/6e/3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl +RUN unzip nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl +RUN cp ./nvidia/nccl/lib/libnccl.so.2 ./libnccl.so.2 +RUN rm -rf ./nvidia +RUN rm nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl #################### TEST IMAGE #################### From 7c531b037038fa9d1a6c8d02fe7b1b5849728325 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:31:51 -0700 Subject: [PATCH 26/54] add test for pynccl --- tests/distributed/test_pynccl.py | 48 ++++++++++++++++++ vllm/model_executor/parallel_utils/pynccl.py | 51 ++++++-------------- 2 files changed, 64 insertions(+), 35 deletions(-) create mode 100644 tests/distributed/test_pynccl.py diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py new file mode 100644 index 000000000000..1dd9ec8dada5 --- /dev/null +++ b/tests/distributed/test_pynccl.py @@ -0,0 +1,48 @@ +# this script is not run with `pytest`. +# It is run with `torchrun`. 
+import os +import multiprocessing +import pytest +import torch +from vllm.model_executor.parallel_utils.pynccl import ( + NCCLCommunicator, + ncclGetUniqueId, + ncclCommInitRank, + ncclCommDestroy, +) + +def worker_fn(env): + import os + os.environ.update(env) + + # when environments are properly set, the usage is simple + comm = NCCLCommunicator() + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) + comm.all_reduce(tensor) + result = tensor.mean().cpu().item() + assert result == comm.world_size + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl(): + number_of_processes = 2 + processes = [] + for i in range(number_of_processes): + env = os.environ.copy() + env['RANK'] = str(i) + env['WORLD_SIZE'] = str(number_of_processes) + env['MASTER_ADDR'] = 'localhost' + env['MASTER_PORT'] = '12345' + p = multiprocessing.Process(target=worker_fn, args=(env,)) + processes.append(p) + p.start() + + for p in processes: + p.join() + + +def test_ncclGetUniqueId(): + unique_id = ncclGetUniqueId() + # `list(unique_id.internal)` is something like this: + # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + # as long as the function doesn't raise an exception, we're good diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index c6f8e6e6840c..8827aab25675 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -1,9 +1,13 @@ -# ===================== pynccl.py ===================== +# ===================== pynccl.py ================================== # This file is a pure Python wrapper for the NCCL library. -# Copyright (c) 2024 vLLM team -# Author: Kaichao You -# Email: youkaichao@gmail.com -# All rights reserved. +# The main purpose is to use NCCL combined with CUDA graph. +# Before writing this script, we tried the following approach: +# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself +# often gets stuck when initializing the NCCL communicator. +# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce` +# contains many other potential cuda APIs, that are not allowed during +# capturing the CUDA graph. 
For further details, please check +# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/199366 # ==================================================== @@ -15,13 +19,12 @@ import datetime # manually load the nccl library +# TODO: find the path programmatically nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") -# use `pip install nvidia-nccl-cu12==2.16.2` to install from pypi -# then you can use the following line to load the library -# and they cause increased memory overhead -# nccl = ctypes.CDLL("/opt/conda/envs/${CONDA_ENV}/lib/python3.9/site-packages/nvidia/nccl/lib/libnccl.so.2") -# ===================== declare types and functions ===================== +# ===================== export types and functions from nccl to Python ===================== +# for the original nccl definition, please check +# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in ncclResult_t = ctypes.c_int @@ -56,13 +59,6 @@ def ncclGetUniqueId() -> NcclUniqueId: assert result == 0 return unique_id -def test_ncclGetUniqueId(): - unique_id = ncclGetUniqueId() - # print something like: - # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - print(list(unique_id.internal)) - - # equivalent to c declaration: # ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); # note that ncclComm_t is a pointer type, so the first argument is a pointer to a pointer @@ -86,8 +82,8 @@ class ncclDataType_t(ctypes.c_int): ncclFloat = 7 ncclFloat64 = 8 ncclDouble = 8 - ncclBfloat16 = 9 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined - ncclNumTypes = 10 # Uncomment if __CUDA_BF16_TYPES_EXIST__ is defined + ncclBfloat16 = 9 + ncclNumTypes = 10 @classmethod def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t': @@ -116,8 +112,6 @@ class ncclRedOp_t(ctypes.c_int): ncclMin = 3 ncclAvg = 4 ncclNumOps = 5 - # ncclMaxRedOp value is based on enum size and int size, here simplified - ncclMaxRedOp = 0x7fffffff @classmethod def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': @@ -162,7 +156,7 @@ def __init__( ): if not dist.is_initialized(): backend = backend or "nccl" - assert backend == 'nccl', "only use gloo backend for starting the NCCL communicator" + assert backend == 'nccl', "only use nccl backend for starting the NCCL communicator" dist.init_process_group( backend=backend, init_method=init_method, @@ -201,17 +195,4 @@ def __del__(self): dist.destroy_process_group() _c_ncclCommDestroy(self.comm) -def test_NCCLCommunicator(): - # use `torchrun` to launch the script - # e.g. 
`torchrun --nproc_per_node=2 pynccl.py` - comm = NCCLCommunicator() - tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) - comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == comm.world_size - print(result) - -if __name__ == "__main__": - test_ncclGetUniqueId() - test_NCCLCommunicator() # ===================== pynccl.py ===================== From 1abf38e76dcd438241c2512b27fb6b0546b89ad2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:39:33 -0700 Subject: [PATCH 27/54] fix linter --- tests/distributed/test_pynccl.py | 14 +++- .../parallel_utils/cupy_utils.py | 6 +- vllm/model_executor/parallel_utils/pynccl.py | 84 ++++++++++++------- 3 files changed, 69 insertions(+), 35 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 1dd9ec8dada5..06f90452c42b 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -7,10 +7,9 @@ from vllm.model_executor.parallel_utils.pynccl import ( NCCLCommunicator, ncclGetUniqueId, - ncclCommInitRank, - ncclCommDestroy, ) + def worker_fn(env): import os os.environ.update(env) @@ -22,6 +21,7 @@ def worker_fn(env): result = tensor.mean().cpu().item() assert result == comm.world_size + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl(): @@ -33,7 +33,7 @@ def test_pynccl(): env['WORLD_SIZE'] = str(number_of_processes) env['MASTER_ADDR'] = 'localhost' env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=worker_fn, args=(env,)) + p = multiprocessing.Process(target=worker_fn, args=(env, )) processes.append(p) p.start() @@ -44,5 +44,11 @@ def test_pynccl(): def test_ncclGetUniqueId(): unique_id = ncclGetUniqueId() # `list(unique_id.internal)` is something like this: - # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # as long as the function doesn't raise an exception, we're good + assert unique_id is not None diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index 94e72bf62314..cb8d7385f31f 100644 --- a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -12,7 +12,6 @@ import torch from torch.distributed import ReduceOp - try: # import cupy # from cupy.cuda import nccl @@ -86,7 +85,9 @@ def init_process_group(world_size: int, rank: int, host: str, """ assert not is_initialized() global comm - comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", world_size=world_size, rank=rank) + comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", + world_size=world_size, + rank=rank) return if isinstance(cupy, Exception): @@ -118,7 +119,6 @@ def 
all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: def destroy_process_group() -> None: global comm - del comm comm = None return """Destroys the NCCL backend.""" diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 8827aab25675..539f350d4a2a 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -10,7 +10,6 @@ # https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/199366 # ==================================================== - # ===================== import region ===================== import torch import ctypes @@ -22,7 +21,7 @@ # TODO: find the path programmatically nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") -# ===================== export types and functions from nccl to Python ===================== +# === export types and functions from nccl to Python === # for the original nccl definition, please check # https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in @@ -34,6 +33,7 @@ _c_ncclGetVersion.restype = ctypes.c_int _c_ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)] + def ncclGetVersion() -> int: version = ctypes.c_int() result = _c_ncclGetVersion(ctypes.byref(version)) @@ -45,26 +45,36 @@ def ncclGetVersion() -> int: patch = version_str[3:].lstrip("0") return f"{major}.{minor}.{patch}" + class NcclUniqueId(ctypes.Structure): _fields_ = [("internal", ctypes.c_byte * 128)] + # equivalent to c declaration: # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); _c_ncclGetUniqueId = nccl.ncclGetUniqueId _c_ncclGetUniqueId.restype = ctypes.c_int _c_ncclGetUniqueId.argtypes = [ctypes.POINTER(NcclUniqueId)] + + def ncclGetUniqueId() -> NcclUniqueId: unique_id = NcclUniqueId() result = _c_ncclGetUniqueId(ctypes.byref(unique_id)) assert result == 0 return unique_id + # equivalent to c declaration: -# ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); -# note that ncclComm_t is a pointer type, so the first argument is a pointer to a pointer +# ncclResult_t ncclCommInitRank( +# ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +# note that ncclComm_t is a pointer type, so the first argument +# is a pointer to a pointer _c_ncclCommInitRank = nccl.ncclCommInitRank _c_ncclCommInitRank.restype = ctypes.c_int -_c_ncclCommInitRank.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int] +_c_ncclCommInitRank.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int +] + # enums class ncclDataType_t(ctypes.c_int): @@ -105,6 +115,7 @@ def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t': return cls.ncclBfloat16 raise ValueError(f"Unsupported dtype: {dtype}") + class ncclRedOp_t(ctypes.c_int): ncclSum = 0 ncclProd = 1 @@ -129,12 +140,17 @@ def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': # equivalent to c declaration: -# ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +# ncclResult_t ncclAllReduce( +# const void* sendbuff, void* recvbuff, size_t count, +# ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, +# udaStream_t stream); # note that cudaStream_t is a pointer type, so the last argument is a pointer _c_ncclAllReduce = nccl.ncclAllReduce _c_ncclAllReduce.restype = ctypes.c_int -_c_ncclAllReduce.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t, ncclRedOp_t, 
ctypes.c_void_p, ctypes.c_void_p] - +_c_ncclAllReduce.argtypes = [ + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t, + ncclRedOp_t, ctypes.c_void_p, ctypes.c_void_p +] # equivalent to c declaration: # ncclResult_t ncclCommDestroy(ncclComm_t comm); @@ -142,31 +158,32 @@ def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t': _c_ncclCommDestroy.restype = ctypes.c_int _c_ncclCommDestroy.argtypes = [ctypes.c_void_p] + class NCCLCommunicator: + def __init__( self, - backend = None, - init_method = None, - timeout = datetime.timedelta(seconds=10), + backend=None, + init_method=None, + timeout=datetime.timedelta(seconds=10), world_size: int = -1, rank: int = -1, - store = None, + store=None, group_name: str = "", - pg_options = None, + pg_options=None, ): if not dist.is_initialized(): backend = backend or "nccl" - assert backend == 'nccl', "only use nccl backend for starting the NCCL communicator" - dist.init_process_group( - backend=backend, - init_method=init_method, - timeout=timeout, - world_size=world_size, - rank=rank, - store=store, - group_name=group_name, - pg_options=pg_options - ) + assert backend == 'nccl', ( + "only use nccl backend for starting the NCCL communicator") + dist.init_process_group(backend=backend, + init_method=init_method, + timeout=timeout, + world_size=world_size, + rank=rank, + store=store, + group_name=group_name, + pg_options=pg_options) self.world_size = dist.get_world_size() self.rank = dist.get_rank() torch.cuda.set_device(self.rank) @@ -174,25 +191,36 @@ def __init__( self.unique_id = ncclGetUniqueId() else: self.unique_id = NcclUniqueId() - tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda(self.rank) + tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda( + self.rank) dist.broadcast(tensor, src=0) byte_list = tensor.cpu().tolist() self.unique_id = NcclUniqueId() for i, byte in enumerate(byte_list): self.unique_id.internal[i] = byte self.comm = ctypes.c_void_p() - result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size, self.unique_id, self.rank) + result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size, + self.unique_id, self.rank) assert result == 0 self.stream = torch.cuda.Stream(device=f"cuda:{self.rank}") - def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None): + def all_reduce(self, + tensor: torch.Tensor, + op: ReduceOp = ReduceOp.SUM, + stream=None): if stream is None: stream = self.stream - result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()), ctypes.c_void_p(tensor.data_ptr()), tensor.numel(), ncclDataType_t.from_torch(tensor.dtype), ncclRedOp_t.from_torch(op), self.comm, ctypes.c_void_p(stream.cuda_stream)) + result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()), + ctypes.c_void_p(tensor.data_ptr()), + tensor.numel(), + ncclDataType_t.from_torch(tensor.dtype), + ncclRedOp_t.from_torch(op), self.comm, + ctypes.c_void_p(stream.cuda_stream)) assert result == 0 def __del__(self): dist.destroy_process_group() _c_ncclCommDestroy(self.comm) + # ===================== pynccl.py ===================== From 5d661a679166dcffd44affb0bd6508e3f42b3e7c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:45:49 -0700 Subject: [PATCH 28/54] update cupy_utils to pynccl --- .../parallel_utils/cupy_utils.py | 95 ++----------------- 1 file changed, 7 insertions(+), 88 deletions(-) diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py index cb8d7385f31f..5c06b40a3f56 100644 --- 
a/vllm/model_executor/parallel_utils/cupy_utils.py +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -1,60 +1,17 @@ -"""CuPy utilities for all-reduce. +from .pynccl import NCCLCommunicator, ncclGetVersion -We use CuPy all-reduce instead of torch.distributed.all_reduce when capturing -CUDA graphs, because torch.distributed.all_reduce causes errors when capturing -CUDA graphs. - -NOTE: We use CuPy 12.3 since CuPy 13.0 does not support Python 3.8. -TODO: Remove this file when torch.distributed.all_reduce is fixed. -""" import contextlib - +import logging import torch -from torch.distributed import ReduceOp - -try: - # import cupy - # from cupy.cuda import nccl - # from cupyx.distributed import NCCLBackend - from .pynccl import NCCLCommunicator, ncclGetVersion - print(f"nccl version {ncclGetVersion()}") - comm: NCCLCommunicator = None - -except ImportError as e: - cupy = e - nccl = None - - class NCCLBackend: - ... - - -_OP_MAPPING = { - ReduceOp.SUM: "sum", - ReduceOp.PRODUCT: "prod", - ReduceOp.MIN: "min", - ReduceOp.MAX: "max", -} +from typing import Optional +from torch.distributed import ReduceOp -class NCCLBackendWithBFloat16: - # This is enough to add bfloat16 support for most operations, - # but broadcast will fail (will require changes in compiled - # cupy code). - def _get_nccl_dtype_and_count(self, array, count=None): - nccl_dtype, count = super()._get_nccl_dtype_and_count(array, count) - torch_dtype = getattr(array, "_torch_dtype", None) - if torch_dtype is torch.bfloat16: - nccl_dtype = nccl.NCCL_BFLOAT16 - return nccl_dtype, count - - def barrier(self) -> None: - raise RuntimeError( - "Currently, CuPy NCCL barrier is not supported since the TCP " - "store is immediately stopped after the initialization.") +logger = logging.getLogger(__name__) +logger.info(f"vLLM is using nccl=={ncclGetVersion()}") -_NCCL_BACKEND = None -_WORLD_SIZE = 0 +comm: Optional[NCCLCommunicator] = None def is_initialized() -> bool: @@ -70,45 +27,15 @@ def set_cupy_stream(stream: torch.cuda.Stream): yield finally: pass - return - cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream, - stream.device_index) - with cupy_stream: - yield def init_process_group(world_size: int, rank: int, host: str, port: int) -> None: - """Initializes the CuPy NCCL backend. - - # TODO: handle NCCL timeouts. - """ assert not is_initialized() global comm comm = NCCLCommunicator(init_method=f"tcp://{host}:{port}", world_size=world_size, rank=rank) - return - - if isinstance(cupy, Exception): - raise ImportError( - "NCCLBackend is not available. Please install cupy.") from cupy - - # TODO(woosuk): Create TP and PP process groups for CuPy. - global _NCCL_BACKEND - global _WORLD_SIZE - assert world_size > 0, f"{world_size=} should be a positive integer" - assert 0 <= rank < world_size, ( - f"{rank=} should be a integer between [0, {world_size})") - - cupy.cuda.runtime.setDevice(torch.cuda.current_device()) - _NCCL_BACKEND = NCCLBackendWithBFloat16(world_size, rank, host, port) - _WORLD_SIZE = world_size - - # Stop the TCP store to prevent the deadlock issues at termination time. - # FIXME(woosuk): This is hacky. Find a more robust solution. 
- if rank == 0 and hasattr(_NCCL_BACKEND, "_store"): - _NCCL_BACKEND._store.stop() def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: @@ -120,20 +47,12 @@ def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: def destroy_process_group() -> None: global comm comm = None - return - """Destroys the NCCL backend.""" - global _NCCL_BACKEND - global _WORLD_SIZE - _NCCL_BACKEND = None - _WORLD_SIZE = 0 def get_world_size() -> int: """Returns the world size.""" return comm.world_size - return _WORLD_SIZE def get_nccl_backend(): return comm - return _NCCL_BACKEND From 99f96d7f7f21498309f55bf8cef658beeeed4b54 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:52:46 -0700 Subject: [PATCH 29/54] rename cupy_utils to pynccl_utils --- .../parallel_utils/communication_op.py | 4 ++-- vllm/model_executor/parallel_utils/parallel_state.py | 6 +++--- .../{cupy_utils.py => pynccl_utils.py} | 0 vllm/worker/model_runner.py | 6 +++--- vllm/worker/worker.py | 12 ++++++------ 5 files changed, 14 insertions(+), 14 deletions(-) rename vllm/model_executor/parallel_utils/{cupy_utils.py => pynccl_utils.py} (100%) diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 6f00fd001d95..01036ab5fba2 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -4,7 +4,7 @@ import torch from torch.distributed import ProcessGroup -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -35,7 +35,7 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: return out if is_cupy_nccl_enabled_for_all_reduce(): # TODO: support multiple parallel groups. - cupy_utils.all_reduce(input_) + pynccl_utils.all_reduce(input_) else: torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index c821936d06e4..8f3b1549d609 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -7,7 +7,7 @@ import torch -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils # Tensor model parallel group that the current rank belongs to. _TENSOR_MODEL_PARALLEL_GROUP = None @@ -211,7 +211,7 @@ def destroy_model_parallel(): _PIPELINE_GLOBAL_RANKS = None # Destroy the cupy states if any. - cupy_utils.destroy_process_group() + pynccl_utils.destroy_process_group() # Whether to use cupy for nccl all reduce. 
@@ -234,7 +234,7 @@ def with_cupy_nccl_for_all_reduce(): _ENABLE_CUPY_FOR_ALL_REDUCE = True stream = torch.cuda.current_stream() - with cupy_utils.set_cupy_stream(stream): + with pynccl_utils.set_cupy_stream(stream): yield _ENABLE_CUPY_FOR_ALL_REDUCE = old diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py similarity index 100% rename from vllm/model_executor/parallel_utils/cupy_utils.py rename to vllm/model_executor/parallel_utils/pynccl_utils.py diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b8eeb51379f4..b2d6c96165c4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -11,7 +11,7 @@ from vllm.logger import init_logger from vllm.model_executor import InputMetadata, SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) from vllm.model_executor.parallel_utils.parallel_state import ( @@ -720,7 +720,7 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: """ # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never # deleted before the CUDA graphs. - self.cupy_nccl_backend = cupy_utils.get_nccl_backend() + self.cupy_nccl_backend = pynccl_utils.get_nccl_backend() assert not self.model_config.enforce_eager logger.info("Capturing the model for CUDA graphs. This may lead to " @@ -900,7 +900,7 @@ def __call__(self, *args, **kwargs): @contextlib.contextmanager def _maybe_cupy_nccl(): - if cupy_utils.is_initialized() and not custom_all_reduce.is_initialized(): + if pynccl_utils.is_initialized() and not custom_all_reduce.is_initialized(): with with_cupy_nccl_for_all_reduce(): yield else: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d8999dc17212..c979effae048 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -9,7 +9,7 @@ from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.model_executor import set_random_seed -from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils import pynccl_utils from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar @@ -262,8 +262,8 @@ def init_distributed_environment( init_method=distributed_init_method, ) - if cupy_utils.is_initialized(): - cupy_world_size = cupy_utils.get_world_size() + if pynccl_utils.is_initialized(): + cupy_world_size = pynccl_utils.get_world_size() if cupy_world_size != parallel_config.world_size: raise RuntimeError( "cupy.distributed is already initialized but the cupy world " @@ -273,7 +273,7 @@ def init_distributed_environment( # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection. - cupy_utils.init_process_group( + pynccl_utils.init_process_group( world_size=parallel_config.world_size, rank=rank, host="localhost", @@ -282,8 +282,8 @@ def init_distributed_environment( # A small all_reduce for warmup. 
torch.distributed.all_reduce(torch.zeros(1).cuda()) - if cupy_utils.is_initialized(): - cupy_utils.all_reduce(torch.zeros(1).cuda()) + if pynccl_utils.is_initialized(): + pynccl_utils.all_reduce(torch.zeros(1).cuda()) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) From b567f0408202172fd8f6ec2df196948930197da6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 18:53:48 -0700 Subject: [PATCH 30/54] update import --- vllm/model_executor/parallel_utils/pynccl_utils.py | 6 ++++-- vllm/worker/model_runner.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 5c06b40a3f56..584c26bced9f 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -1,11 +1,13 @@ -from .pynccl import NCCLCommunicator, ncclGetVersion - import contextlib import logging import torch from typing import Optional from torch.distributed import ReduceOp +from vllm.model_executor.parallel_utils.pynccl import ( + NCCLCommunicator, + ncclGetVersion, +) logger = logging.getLogger(__name__) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b2d6c96165c4..0fa33a6debc5 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -900,7 +900,8 @@ def __call__(self, *args, **kwargs): @contextlib.contextmanager def _maybe_cupy_nccl(): - if pynccl_utils.is_initialized() and not custom_all_reduce.is_initialized(): + if pynccl_utils.is_initialized( + ) and not custom_all_reduce.is_initialized(): with with_cupy_nccl_for_all_reduce(): yield else: From 74fcf08b020a0e85728584f2775c8c5a0028f960 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:02:38 -0700 Subject: [PATCH 31/54] update pytorch in cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66842e6845ed..7c9ca497fc2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # requirements.txt files and should be kept consistent. 
The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2") +set(TORCH_SUPPORTED_VERSION_CUDA "2.2.0") set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") From 43da101f440095998ab2e4eee69599afd25703fd Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:16:05 -0700 Subject: [PATCH 32/54] add test with cudagraph --- tests/distributed/test_pynccl.py | 68 ++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 06f90452c42b..651e1471c93c 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -10,22 +10,8 @@ ) -def worker_fn(env): - import os - os.environ.update(env) - - # when environments are properly set, the usage is simple - comm = NCCLCommunicator() - tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) - comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == comm.world_size - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -def test_pynccl(): - number_of_processes = 2 +def distributed_run(fn, world_size): + number_of_processes = world_size processes = [] for i in range(number_of_processes): env = os.environ.copy() @@ -33,7 +19,7 @@ def test_pynccl(): env['WORLD_SIZE'] = str(number_of_processes) env['MASTER_ADDR'] = 'localhost' env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=worker_fn, args=(env, )) + p = multiprocessing.Process(target=fn, args=(env, )) processes.append(p) p.start() @@ -41,6 +27,54 @@ def test_pynccl(): p.join() +def update_env(fn): + + def wrapper(env): + import os + os.environ.update(env) + fn() + + return wrapper + + +@update_env +def worker_fn(): + comm = NCCLCommunicator() + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) + comm.all_reduce(tensor) + result = tensor.mean().cpu().item() + assert result == comm.world_size + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl(): + distributed_run(worker_fn, 2) + + +@update_env +def worker_fn_with_cudagraph(): + with torch.no_grad(): + graph = torch.cuda.CUDAGraph() + comm = NCCLCommunicator() + # run something in the default stream to initialize torch engine + a = torch.ones((4, 4), device=f'cuda:{comm.rank}') + torch.cuda.synchronize() + with torch.cuda.graph(graph, stream=comm.stream): + comm.all_reduce(a) + comm.stream.synchronize() + assert a.mean().cpu().item() == comm.world_size + graph.replay() + comm.stream.synchronize() + assert a.mean().cpu().item() == comm.world_size**2 + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl_with_cudagraph(): + distributed_run(worker_fn_with_cudagraph, 2) + + def test_ncclGetUniqueId(): unique_id = ncclGetUniqueId() # `list(unique_id.internal)` is something like this: From 37e7425674417378d567120bc54a3e3fe751e543 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:30:04 -0700 Subject: [PATCH 33/54] fix test; fix TORCH_CUDA_ARCH_LIST --- CMakeLists.txt | 3 +++ tests/distributed/test_pynccl.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c9ca497fc2b..29d11f9a04d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,9 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") # 
Supported NVIDIA architectures. set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +# used when building pytorch-related extensions +set(TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0;8.6;8.9;9.0") + # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 651e1471c93c..58376306c277 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -63,7 +63,7 @@ def worker_fn_with_cudagraph(): with torch.cuda.graph(graph, stream=comm.stream): comm.all_reduce(a) comm.stream.synchronize() - assert a.mean().cpu().item() == comm.world_size + assert a.mean().cpu().item() == comm.world_size**0 graph.replay() comm.stream.synchronize() assert a.mean().cpu().item() == comm.world_size**2 From 7e983f5e32302b44773c9252af3cce281f8344f8 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 19:51:30 -0700 Subject: [PATCH 34/54] fix amd tests --- .../parallel_utils/pynccl_utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 584c26bced9f..4f1e03c260d7 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -4,16 +4,21 @@ from typing import Optional from torch.distributed import ReduceOp -from vllm.model_executor.parallel_utils.pynccl import ( - NCCLCommunicator, - ncclGetVersion, -) logger = logging.getLogger(__name__) -logger.info(f"vLLM is using nccl=={ncclGetVersion()}") - -comm: Optional[NCCLCommunicator] = None +try: + from vllm.model_executor.parallel_utils.pynccl import ( + NCCLCommunicator, + ncclGetVersion, + ) + logger.info(f"vLLM is using nccl=={ncclGetVersion()}") +except Exception: + # in non-NVIDIA environments, we can't import the nccl module + # e.g. when running on machines with AMD GPUs + pass + +comm: Optional["NCCLCommunicator"] = None def is_initialized() -> bool: From e3f8d5f866f530d32bfe37bbf2676bbd0fce87c4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 20:14:20 -0700 Subject: [PATCH 35/54] add pynccl test --- .buildkite/test-pipeline.yaml | 5 +++++ vllm/model_executor/parallel_utils/pynccl_utils.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index cae2d9f0517f..0654fcfef0da 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -22,6 +22,11 @@ steps: working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. +- label: Distributed pynccl Test + command: pytest -v -s --forked test_pynccl.py + working_dir: "/vllm-workspace/tests/distributed" + num_gpus: 2 # only support 1 or 2 for now. + - label: Distributed Correctness Test-facebook/opt-125m command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 4f1e03c260d7..62b8b22c3143 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -13,9 +13,11 @@ ncclGetVersion, ) logger.info(f"vLLM is using nccl=={ncclGetVersion()}") -except Exception: +except Exception as e: # in non-NVIDIA environments, we can't import the nccl module # e.g. 
when running on machines with AMD GPUs + logger.info(f"Failed to import NCCL library: {e}") + logger.info("It is expected if you are not running on NVIDIA GPUs.") pass comm: Optional["NCCLCommunicator"] = None From 4e277ae3ba83308acdbda59dec66f029c826be17 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:22:26 -0700 Subject: [PATCH 36/54] pack up libnccl.so --- setup.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 47cac5996f81..5baefdec8af3 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,15 @@ from packaging.version import parse, Version from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext +from setuptools.command.install import install from shutil import which import torch from torch.utils.cpp_extension import CUDA_HOME +import zipfile +import shutil +import logging + +logger = logging.getLogger(__name__) ROOT_DIR = os.path.dirname(__file__) @@ -184,6 +190,56 @@ def _is_neuron() -> bool: return torch_neuronx_installed +class CustomInstallCommand(install): + + def run(self): + # Call the standard install process first + install.run(self) + + if not _is_cuda(): + return + + # Define the URL of the file and the directory to unzip to + file_url = ( + 'https://files.pythonhosted.org/packages/44/6e/' + '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' + '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') + + logger.info('Installing NVIDIA NCCL library...') + + # `self.install_lib` is something like /path/to/python/site-packages/ + target_dir = self.install_lib + "vllm/lib/" + # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. the + # temporary directory where the package is being built + temp_dir = self.root + local_zip_path = ( + f"{temp_dir}/" + "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") + # check if the target directory exists + if not os.path.exists(target_dir): + logger.info(f'Creating target directory {target_dir} ...') + os.makedirs(target_dir) + # Check if the file is already downloaded + if os.path.exists(target_dir + "nvidia"): + logger.info('library already exists.') + return + if not os.path.exists(local_zip_path): + # Download the file + logger.info('Downloading file...') + os.system(f"wget {file_url} -q -P {temp_dir}/") + # Unzip the file + logger.info('Unzipping file...') + with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") + os.remove(local_zip_path) + # Move the unzipped files to the target directory + logger.info('Moving files...') + os.system(f"mv {temp_dir}/nvidia {target_dir}") + so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" + os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) + + def _install_punica() -> bool: return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) @@ -362,6 +418,9 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, + cmdclass={ + "build_ext": cmake_build_ext if not _is_neuron() else build_ext, + "install": CustomInstallCommand, + }, package_data=package_data, ) From a20d802bba8a173122e100bfec183e6563220f6d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:29:08 -0700 Subject: [PATCH 37/54] add so in setup.py, and use programatical path in pynccl --- 
Dockerfile | 10 +--------- setup.py | 6 ++++++ vllm/model_executor/parallel_utils/pynccl.py | 7 +++++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4dbbc65a9b1b..6246a7e40773 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,16 +93,8 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose -# tricky part, nccl 2.19 has a bug that increased memory overhead of cudagraph -# however, pytorch has binary dependencies on nccl 2.19 -# simply using `pip install nvidia-nccl-cu12==2.18.3` will break pytorch -# so we have to manually download nccl 2.18 and keep the library to a secrect place +# used for downloading files RUN apt install -y wget unzip -RUN wget https://files.pythonhosted.org/packages/44/6e/3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl -RUN unzip nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl -RUN cp ./nvidia/nccl/lib/libnccl.so.2 ./libnccl.so.2 -RUN rm -rf ./nvidia -RUN rm nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl #################### TEST IMAGE #################### diff --git a/setup.py b/setup.py index 5baefdec8af3..7a4db9ac2e00 100644 --- a/setup.py +++ b/setup.py @@ -199,6 +199,12 @@ def run(self): if not _is_cuda(): return + # tricky part, nccl 2.19 has a bug that increased memory overhead + # of cudagraph. However, pytorch has binary dependencies on nccl 2.19, + # simply `pip install nvidia-nccl-cu12==2.18.3` will break pytorch, + # so we have to manually download nccl 2.18 and keep the library to + # a secrect place + # Define the URL of the file and the directory to unzip to file_url = ( 'https://files.pythonhosted.org/packages/44/6e/' diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 539f350d4a2a..679a9fa03d1a 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -16,10 +16,13 @@ import torch.distributed as dist from torch.distributed import ReduceOp import datetime +import os +import glob # manually load the nccl library -# TODO: find the path programmatically -nccl = ctypes.CDLL("/vllm-workspace/libnccl.so.2.16.2") +_path = os.path.dirname(os.path.abspath(__file__)) +so_file = glob.glob(f"{_path}/../../lib/nvidia/lib/libnccl.so.*")[0] +nccl = ctypes.CDLL(so_file) # === export types and functions from nccl to Python === # for the original nccl definition, please check From dfc9d8237ec12d29c51124e9d9af58d20fc4d5bb Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:38:37 -0700 Subject: [PATCH 38/54] rename cupy --> pynccl --- .../parallel_utils/parallel_state.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index 8f3b1549d609..55c65e434d9c 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -210,36 +210,36 @@ def destroy_model_parallel(): global _PIPELINE_GLOBAL_RANKS _PIPELINE_GLOBAL_RANKS = None - # Destroy the cupy states if any. + # Destroy the pynccl states if any. pynccl_utils.destroy_process_group() -# Whether to use cupy for nccl all reduce. 
-# We use cupy for all reduce when using CUDA graph, because torch.distributed +# Whether to use pynccl for nccl all reduce. +# We use pynccl for all reduce when using CUDA graph, because torch.distributed # is not well supported by CUDA graph. -_ENABLE_CUPY_FOR_ALL_REDUCE = False +_ENABLE_PYNCCL_FOR_ALL_REDUCE = False @contextlib.contextmanager def with_cupy_nccl_for_all_reduce(): - """use CuPy nccl instead of torch.distributed for all reduce""" + """use pynccl instead of torch.distributed for all reduce""" tp_size = get_tensor_model_parallel_world_size() if tp_size == 1: # No-op. # NOTE(woosuk): We don't initialize CuPy when tp_size is 1. yield else: - global _ENABLE_CUPY_FOR_ALL_REDUCE - old = _ENABLE_CUPY_FOR_ALL_REDUCE - _ENABLE_CUPY_FOR_ALL_REDUCE = True + global _ENABLE_PYNCCL_FOR_ALL_REDUCE + old = _ENABLE_PYNCCL_FOR_ALL_REDUCE + _ENABLE_PYNCCL_FOR_ALL_REDUCE = True stream = torch.cuda.current_stream() with pynccl_utils.set_cupy_stream(stream): yield - _ENABLE_CUPY_FOR_ALL_REDUCE = old + _ENABLE_PYNCCL_FOR_ALL_REDUCE = old def is_cupy_nccl_enabled_for_all_reduce(): - """check if CuPy nccl is enabled for all reduce""" - global _ENABLE_CUPY_FOR_ALL_REDUCE - return _ENABLE_CUPY_FOR_ALL_REDUCE + """check if pynccl is enabled for all reduce""" + global _ENABLE_PYNCCL_FOR_ALL_REDUCE + return _ENABLE_PYNCCL_FOR_ALL_REDUCE From 8a5a011a6722caf45cbb61fc5dfc5ca013b40a0c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:41:20 -0700 Subject: [PATCH 39/54] rename cupy --> pynccl --- vllm/worker/model_runner.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0fa33a6debc5..374f519afc81 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -15,7 +15,7 @@ from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) from vllm.model_executor.parallel_utils.parallel_state import ( - with_cupy_nccl_for_all_reduce) + with_pynccl_for_all_reduce) from vllm.model_executor.parallel_utils import custom_all_reduce from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata @@ -834,7 +834,7 @@ def capture( # Run the model once without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). 
- with _maybe_cupy_nccl(): + with _maybe_pynccl(): self.model( input_ids, positions, @@ -848,7 +848,7 @@ def capture( # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement self.graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self.graph, pool=memory_pool): # noqa: SIM117 - with _maybe_cupy_nccl(): + with _maybe_pynccl(): hidden_states = self.model( input_ids, positions, @@ -899,10 +899,10 @@ def __call__(self, *args, **kwargs): @contextlib.contextmanager -def _maybe_cupy_nccl(): +def _maybe_pynccl(): if pynccl_utils.is_initialized( ) and not custom_all_reduce.is_initialized(): - with with_cupy_nccl_for_all_reduce(): + with with_pynccl_for_all_reduce(): yield else: yield From a009e311c5aa9db048ad60f139235856a4e96ae7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:41:59 -0700 Subject: [PATCH 40/54] rename cupy --> pynccl --- vllm/model_executor/parallel_utils/parallel_state.py | 4 ++-- vllm/model_executor/parallel_utils/pynccl_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index 55c65e434d9c..dbec519fa1db 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -221,7 +221,7 @@ def destroy_model_parallel(): @contextlib.contextmanager -def with_cupy_nccl_for_all_reduce(): +def with_pynccl_for_all_reduce(): """use pynccl instead of torch.distributed for all reduce""" tp_size = get_tensor_model_parallel_world_size() if tp_size == 1: @@ -234,7 +234,7 @@ def with_cupy_nccl_for_all_reduce(): _ENABLE_PYNCCL_FOR_ALL_REDUCE = True stream = torch.cuda.current_stream() - with pynccl_utils.set_cupy_stream(stream): + with pynccl_utils.set_pynccl_stream(stream): yield _ENABLE_PYNCCL_FOR_ALL_REDUCE = old diff --git a/vllm/model_executor/parallel_utils/pynccl_utils.py b/vllm/model_executor/parallel_utils/pynccl_utils.py index 62b8b22c3143..e498526b71bb 100644 --- a/vllm/model_executor/parallel_utils/pynccl_utils.py +++ b/vllm/model_executor/parallel_utils/pynccl_utils.py @@ -29,7 +29,7 @@ def is_initialized() -> bool: @contextlib.contextmanager -def set_cupy_stream(stream: torch.cuda.Stream): +def set_pynccl_stream(stream: torch.cuda.Stream): """Set the cuda stream for communication""" try: comm.stream = stream From 68e4792739b418dbaa00117f8b04a17b379e66cd Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 22:42:45 -0700 Subject: [PATCH 41/54] rename cupy --> pynccl --- vllm/model_executor/parallel_utils/communication_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 01036ab5fba2..28433d31f56a 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -9,7 +9,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tensor_model_parallel_group, - is_cupy_nccl_enabled_for_all_reduce, + is_pynccl_enabled_for_all_reduce, ) from vllm.model_executor.parallel_utils.custom_all_reduce import ( custom_all_reduce) @@ -33,7 +33,7 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: out = custom_all_reduce(input_) if out is not None: return out - if is_cupy_nccl_enabled_for_all_reduce(): + if is_pynccl_enabled_for_all_reduce(): # TODO: support multiple parallel groups. 
pynccl_utils.all_reduce(input_) else: From 0a6fab18f01823413546f3d2e1a0dec06ae9e344 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 23:07:18 -0700 Subject: [PATCH 42/54] fix wget install order --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6246a7e40773..7139bff6d5b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,9 @@ RUN ldconfig /usr/local/cuda-12.1/compat/ WORKDIR /workspace +# used for downloading files +RUN apt install -y wget unzip + # install build and runtime dependencies COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ @@ -93,9 +96,6 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose -# used for downloading files -RUN apt install -y wget unzip - #################### TEST IMAGE #################### From a82a97618b6071cb7aeeb1dfba9ad2943ce4f3a5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Mar 2024 23:19:27 -0700 Subject: [PATCH 43/54] rename cupy --> pynccl --- vllm/model_executor/parallel_utils/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index dbec519fa1db..63890d9cd5bd 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -239,7 +239,7 @@ def with_pynccl_for_all_reduce(): _ENABLE_PYNCCL_FOR_ALL_REDUCE = old -def is_cupy_nccl_enabled_for_all_reduce(): +def is_pynccl_enabled_for_all_reduce(): """check if pynccl is enabled for all reduce""" global _ENABLE_PYNCCL_FOR_ALL_REDUCE return _ENABLE_PYNCCL_FOR_ALL_REDUCE From 1c6ec4836244ff6a71137f182fdf1c60af7da97d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 00:09:18 -0700 Subject: [PATCH 44/54] fix so filename and search path --- Dockerfile | 5 +++++ vllm/model_executor/parallel_utils/pynccl.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7139bff6d5b3..379b2581eaf1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -133,3 +133,8 @@ COPY vllm vllm ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER #################### + +# the package has been installed to /usr/local/lib/python3.10/dist-packages +# remove it so that test scripts will not import vllm from current directory +# (which does not have so files) +RUN rm -rf vllm diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 679a9fa03d1a..8232f231de4c 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -21,7 +21,7 @@ # manually load the nccl library _path = os.path.dirname(os.path.abspath(__file__)) -so_file = glob.glob(f"{_path}/../../lib/nvidia/lib/libnccl.so.*")[0] +so_file = glob.glob(f"{_path}/../../lib/nvidia/nccl/lib/libnccl.so.*")[0] nccl = ctypes.CDLL(so_file) # === export types and functions from nccl to Python === From 47ff82ab2e9c2c5e64e5a552da96fb5cbcbd3a38 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 00:38:20 -0700 Subject: [PATCH 45/54] fix dockerfile --- Dockerfile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 379b2581eaf1..1c6cd48f0dbb 100644 --- 
a/Dockerfile +++ b/Dockerfile @@ -95,7 +95,10 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose - +# the package has been installed to /usr/local/lib/python3.10/dist-packages +# remove it so that test scripts will not import vllm from current directory +# (which does not have so files) +RUN rm -rf vllm #################### TEST IMAGE #################### @@ -133,8 +136,3 @@ COPY vllm vllm ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER #################### - -# the package has been installed to /usr/local/lib/python3.10/dist-packages -# remove it so that test scripts will not import vllm from current directory -# (which does not have so files) -RUN rm -rf vllm From b0c15c2478800aa0e5c4a7afbb6be5eaac547151 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 00:58:15 -0700 Subject: [PATCH 46/54] fix dockerfile --- Dockerfile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1c6cd48f0dbb..01c3a19db618 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,11 +94,10 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml -RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose -# the package has been installed to /usr/local/lib/python3.10/dist-packages -# remove it so that test scripts will not import vllm from current directory -# (which does not have so files) -RUN rm -rf vllm +# the package will be installed to /usr/local/lib/python3.10/dist-packages +# *.so files will only be there, rather than in the source directory +# use `-e` to install in editable mode, so that directory links to the source +RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install -e . --verbose #################### TEST IMAGE #################### From 0b4f7dddbd26d4efbdf7e0b97de275b59a60bdf2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:18:44 -0700 Subject: [PATCH 47/54] download and use manifest in to force keeping .so file --- MANIFEST.in | 1 + setup.py | 103 ++++++++++++++++++++++++---------------------------- 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index aa16da6500e6..677fa19721fc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,3 +4,4 @@ include CMakeLists.txt recursive-include cmake * recursive-include csrc * +recursive-include vllm/lib * diff --git a/setup.py b/setup.py index 7a4db9ac2e00..79cfc107c084 100644 --- a/setup.py +++ b/setup.py @@ -190,64 +190,56 @@ def _is_neuron() -> bool: return torch_neuronx_installed -class CustomInstallCommand(install): +def _install_punica() -> bool: + return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) - def run(self): - # Call the standard install process first - install.run(self) - - if not _is_cuda(): - return - - # tricky part, nccl 2.19 has a bug that increased memory overhead - # of cudagraph. 
However, pytorch has binary dependencies on nccl 2.19, - # simply `pip install nvidia-nccl-cu12==2.18.3` will break pytorch, - # so we have to manually download nccl 2.18 and keep the library to - # a secrect place - - # Define the URL of the file and the directory to unzip to - file_url = ( - 'https://files.pythonhosted.org/packages/44/6e/' - '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' - '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') - - logger.info('Installing NVIDIA NCCL library...') - - # `self.install_lib` is something like /path/to/python/site-packages/ - target_dir = self.install_lib + "vllm/lib/" - # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. the - # temporary directory where the package is being built - temp_dir = self.root - local_zip_path = ( - f"{temp_dir}/" - "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") - # check if the target directory exists - if not os.path.exists(target_dir): - logger.info(f'Creating target directory {target_dir} ...') - os.makedirs(target_dir) - # Check if the file is already downloaded - if os.path.exists(target_dir + "nvidia"): - logger.info('library already exists.') - return - if not os.path.exists(local_zip_path): - # Download the file - logger.info('Downloading file...') - os.system(f"wget {file_url} -q -P {temp_dir}/") - # Unzip the file - logger.info('Unzipping file...') - with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") - os.remove(local_zip_path) - # Move the unzipped files to the target directory - logger.info('Moving files...') - os.system(f"mv {temp_dir}/nvidia {target_dir}") - so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" - os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) +if _is_cuda(): -def _install_punica() -> bool: - return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) + # tricky part, nccl 2.19 has a bug that increased memory overhead + # of cudagraph. However, pytorch has binary dependencies on nccl 2.19, + # simply `pip install nvidia-nccl-cu12==2.18.3` will break pytorch, + # so we have to manually download nccl 2.18 and keep the library to + # a secrect place + + # Define the URL of the file and the directory to unzip to + file_url = ( + 'https://files.pythonhosted.org/packages/44/6e/' + '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' + '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') + + logger.info('Installing NVIDIA NCCL library...') + + target_dir = os.path.dirname(os.path.abspath(__file__)) + "/vllm/lib/" + # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. 
the + # temporary directory where the package is being built + temp_dir = self.root + local_zip_path = ( + f"{temp_dir}/" + "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") + # check if the target directory exists + if not os.path.exists(target_dir): + logger.info(f'Creating target directory {target_dir} ...') + os.makedirs(target_dir) + # Check if the file is already downloaded + if os.path.exists(target_dir + "nvidia"): + logger.info('library already exists.') + return + if not os.path.exists(local_zip_path): + # Download the file + logger.info('Downloading file...') + os.system(f"wget {file_url} -q -P {temp_dir}/") + # Unzip the file + logger.info('Unzipping file...') + with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") + os.remove(local_zip_path) + # Move the unzipped files to the target directory + logger.info('Moving files...') + os.system(f"mv {temp_dir}/nvidia {target_dir}") + so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" + os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) def get_hipcc_rocm_version(): @@ -426,7 +418,6 @@ def get_requirements() -> List[str]: ext_modules=ext_modules, cmdclass={ "build_ext": cmake_build_ext if not _is_neuron() else build_ext, - "install": CustomInstallCommand, }, package_data=package_data, ) From 7942050a198361ee10d16353bd2fefe92a0776e2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:21:31 -0700 Subject: [PATCH 48/54] download and use manifest in to force keeping .so file --- setup.py | 64 ++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/setup.py b/setup.py index 79cfc107c084..06dfeee0d262 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,13 @@ from packaging.version import parse, Version from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext -from setuptools.command.install import install from shutil import which import torch from torch.utils.cpp_extension import CUDA_HOME import zipfile import shutil import logging +import tempfile logger = logging.getLogger(__name__) @@ -203,43 +203,39 @@ def _install_punica() -> bool: # a secrect place # Define the URL of the file and the directory to unzip to - file_url = ( - 'https://files.pythonhosted.org/packages/44/6e/' - '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' - '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') + file_url = ('https://files.pythonhosted.org/packages/44/6e/' + '3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff' + '/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl') logger.info('Installing NVIDIA NCCL library...') target_dir = os.path.dirname(os.path.abspath(__file__)) + "/vllm/lib/" - # `self.root` is something like `/tmp/pip-install-abc123/`, i.e. 
the - # temporary directory where the package is being built - temp_dir = self.root - local_zip_path = ( - f"{temp_dir}/" - "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") - # check if the target directory exists - if not os.path.exists(target_dir): - logger.info(f'Creating target directory {target_dir} ...') - os.makedirs(target_dir) - # Check if the file is already downloaded - if os.path.exists(target_dir + "nvidia"): - logger.info('library already exists.') - return - if not os.path.exists(local_zip_path): - # Download the file - logger.info('Downloading file...') - os.system(f"wget {file_url} -q -P {temp_dir}/") - # Unzip the file - logger.info('Unzipping file...') - with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") - os.remove(local_zip_path) - # Move the unzipped files to the target directory - logger.info('Moving files...') - os.system(f"mv {temp_dir}/nvidia {target_dir}") - so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" - os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) + with tempfile.TemporaryDirectory() as temp_dir: + local_zip_path = ( + f"{temp_dir}/" + "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") + # check if the target directory exists + if not os.path.exists(target_dir): + logger.info(f'Creating target directory {target_dir} ...') + os.makedirs(target_dir) + # Check if the file is already downloaded + if os.path.exists(target_dir + "nvidia"): + logger.info('library already exists.') + else: + # Download the file + logger.info('Downloading file...') + os.system(f"wget {file_url} -q -P {temp_dir}/") + # Unzip the file + logger.info('Unzipping file...') + with zipfile.ZipFile(local_zip_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info") + os.remove(local_zip_path) + # Move the unzipped files to the target directory + logger.info('Moving files...') + os.system(f"mv {temp_dir}/nvidia {target_dir}") + so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2" + os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3")) def get_hipcc_rocm_version(): From 20a3ec487089b0fd984f8a30717fa3ceca2fc111 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:24:08 -0700 Subject: [PATCH 49/54] restore dockerfile --- Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 01c3a19db618..d78ddd25ccf7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,10 +94,7 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml -# the package will be installed to /usr/local/lib/python3.10/dist-packages -# *.so files will only be there, rather than in the source directory -# use `-e` to install in editable mode, so that directory links to the source -RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install -e . --verbose +RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose #################### TEST IMAGE #################### From 0ca27b7403a540949f7c0c8c8cb787bcaad97fe7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:29:37 -0700 Subject: [PATCH 50/54] add lib file to package data --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 06dfeee0d262..2fbf579915c1 100644 --- a/setup.py +++ b/setup.py @@ -380,7 +380,8 @@ def get_requirements() -> List[str]: ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { - "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] + "vllm": + ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "lib/*"] } if os.environ.get("VLLM_USE_PRECOMPILED"): package_data["vllm"].append("*.so") From a3c2340ae36ce8ee782691d30111377eaf7ae6ce Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Mar 2024 10:47:34 -0700 Subject: [PATCH 51/54] add libnccl.so.2.18.3 via hard-coding --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 2fbf579915c1..39338d30f139 100644 --- a/setup.py +++ b/setup.py @@ -380,8 +380,10 @@ def get_requirements() -> List[str]: ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { - "vllm": - ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "lib/*"] + "vllm": [ + "py.typed", "model_executor/layers/fused_moe/configs/*.json", + "lib/nvidia/nccl/lib/libnccl.so.2.18.3" + ] } if os.environ.get("VLLM_USE_PRECOMPILED"): package_data["vllm"].append("*.so") From 71e29762b860d84c27b7a3953bb7e90dd5bda3f5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Mar 2024 09:43:40 -0700 Subject: [PATCH 52/54] enable VLLM_NCCL_SO_PATH at runtime --- vllm/model_executor/parallel_utils/pynccl.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/parallel_utils/pynccl.py b/vllm/model_executor/parallel_utils/pynccl.py index 8232f231de4c..9f0aaf5f9321 100644 --- a/vllm/model_executor/parallel_utils/pynccl.py +++ b/vllm/model_executor/parallel_utils/pynccl.py @@ -18,10 +18,20 @@ import datetime import os import glob +import logging + +logger = logging.getLogger(__name__) + +so_file = os.environ.get("VLLM_NCCL_SO_PATH", "") # manually load the nccl library -_path = os.path.dirname(os.path.abspath(__file__)) -so_file = glob.glob(f"{_path}/../../lib/nvidia/nccl/lib/libnccl.so.*")[0] +if so_file: + logger.info( + f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}") +else: + _path = os.path.dirname(os.path.abspath(__file__)) + so_file = glob.glob(f"{_path}/../../lib/nvidia/nccl/lib/libnccl.so.*")[0] + logger.info(f"Loading nccl from vLLM builtin file {so_file}") nccl = ctypes.CDLL(so_file) # === export types and functions from nccl to Python === From 3d9332a938931522088aacca945b1063ad2424db Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Mar 2024 09:46:42 -0700 Subject: [PATCH 53/54] nit, os.makedirs(target_dir, exist_ok=True) --- setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 39338d30f139..07d08be5fbb2 100644 --- a/setup.py +++ b/setup.py @@ -214,10 +214,8 @@ def _install_punica() -> bool: local_zip_path = ( f"{temp_dir}/" "nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl") - # check if the target directory exists - if not os.path.exists(target_dir): - logger.info(f'Creating target directory {target_dir} ...') - os.makedirs(target_dir) + # make sure the target directory exists + 
os.makedirs(target_dir, exist_ok=True) # Check if the file is already downloaded if os.path.exists(target_dir + "nvidia"): logger.info('library already exists.') From 76f46f65401d6fb897ce388fe92e219fbfd6656b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Mar 2024 12:34:12 -0700 Subject: [PATCH 54/54] upgrade to pt 2.2.1 --- .github/workflows/publish.yml | 2 +- CMakeLists.txt | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements.txt | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d88f646636ba..2db687a287ef 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -49,7 +49,7 @@ jobs: matrix: os: ['ubuntu-20.04'] python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: ['2.2.0'] # Must be the most recent version that meets requirements.txt. + pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt. cuda-version: ['11.8', '12.1'] steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index 29d11f9a04d6..be3dc520e43f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.2.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1") set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") diff --git a/pyproject.toml b/pyproject.toml index 05fbfbb89d27..509c2a630b4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "ninja", "packaging", "setuptools >= 49.4.0", - "torch == 2.2.0", + "torch == 2.2.1", "wheel", ] build-backend = "setuptools.build_meta" diff --git a/requirements-build.txt b/requirements-build.txt index a0a6aef8bb16..2bc07fb152aa 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -3,5 +3,5 @@ cmake>=3.21 ninja packaging setuptools>=49.4.0 -torch==2.2.0 +torch==2.2.1 wheel diff --git a/requirements.txt b/requirements.txt index d8fd1986ecb8..57996f5cc231 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,8 @@ psutil ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.2.0 -xformers == 0.0.24 # Requires PyTorch 2.2.0. +torch == 2.2.1 +xformers == 0.0.25 # Requires PyTorch 2.2.1. transformers >= 4.39.0 # Required for StarCoder2. fastapi uvicorn[standard]
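
A quick consistency check for the `torch==2.2.1` / `xformers==0.0.25` pins from PATCH 54/54, as a hedged sketch (it assumes both packages are already installed in the environment being checked; it is not part of the patch series):

    # Hypothetical post-install check, not part of the patches.
    import torch
    import xformers

    assert torch.__version__.startswith("2.2.1"), torch.__version__  # e.g. "2.2.1+cu121"
    assert xformers.__version__.startswith("0.0.25"), xformers.__version__

Keeping the two in lockstep matters because, per the pin comment in requirements.txt, xformers 0.0.25 requires PyTorch 2.2.1.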
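
A usage note on the `VLLM_NCCL_SO_PATH` override added in PATCH 52/54: pynccl.py now prefers the path given in that environment variable and only falls back to the `libnccl.so.*` that setup.py bundles under `vllm/lib/`. The sketch below is a hypothetical standalone check, not code from the series, that mirrors this lookup order and reports which library would be picked up; the `ncclGetVersion` call and the example override path are illustrative assumptions.

    # Minimal sketch: assumes vllm is installed with the bundled NCCL, or that
    # VLLM_NCCL_SO_PATH points at a readable libnccl.so on this machine.
    import ctypes
    import glob
    import os

    # 1. An explicit override wins, e.g.
    #    VLLM_NCCL_SO_PATH=/usr/lib/x86_64-linux-gnu/libnccl.so.2 (hypothetical path).
    so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")

    # 2. Otherwise fall back to the copy that setup.py unpacks under vllm/lib/.
    if not so_file:
        import vllm
        pkg_dir = os.path.dirname(os.path.abspath(vllm.__file__))
        matches = glob.glob(f"{pkg_dir}/lib/nvidia/nccl/lib/libnccl.so.*")
        if not matches:
            raise RuntimeError("no bundled libnccl.so.* found under vllm/lib/")
        so_file = matches[0]

    nccl = ctypes.CDLL(so_file)
    version = ctypes.c_int()
    # ncclGetVersion(int*) is part of the public NCCL C API;
    # 21803 corresponds to the bundled 2.18.3.
    nccl.ncclGetVersion(ctypes.byref(version))
    print(f"would load {so_file} (NCCL version code {version.value})")

Deployments that ship their own NCCL can therefore set the variable before starting vLLM and the bundled copy is ignored; leaving it unset keeps the behavior of the earlier patches, where only the bundled 2.18.3 library is used.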