Closed

Commits (64)
1a7d826
Upgrade to `torch==2.2.0`
hmellor Feb 7, 2024
7de363f
Remove `wheel` from `requirements-dev.txt`
hmellor Feb 7, 2024
9bc921d
Revert change to `Dockerfile.rocm`
hmellor Feb 12, 2024
76ab3e7
Kick CI
hmellor Feb 15, 2024
0109fd2
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Feb 15, 2024
4c616a8
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Feb 21, 2024
922aa0c
Update requirements.txt
hmellor Feb 22, 2024
193d73a
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Feb 22, 2024
bfcc926
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Feb 22, 2024
584e6ef
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Mar 4, 2024
daca4e1
Update to 2.2.1
hmellor Mar 4, 2024
015b7d4
Revert "Update to 2.2.1"
hmellor Mar 4, 2024
fef9e03
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Mar 7, 2024
cf400cb
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Mar 12, 2024
d77e855
Merge branch 'main' into pytorch-2.2.0-upgrade
hmellor Mar 15, 2024
75f05de
Update requirements.txt
hmellor Mar 15, 2024
e82cf3a
try to test one distributed at a time
youkaichao Mar 16, 2024
6d10bf5
upgrade to pytorch 2.2.0 by merging 'graphcore/pytorch-2.2.0-upgrade'
youkaichao Mar 16, 2024
4accd02
try pytorch 2.2.1
youkaichao Mar 16, 2024
a92346f
try to fix test
youkaichao Mar 21, 2024
e7f215b
use pip install to resolve the problem
youkaichao Mar 21, 2024
f99fe2a
remove nccl version to test
youkaichao Mar 21, 2024
0f3181f
move to Dockerfile
youkaichao Mar 21, 2024
6ef3843
fix version
youkaichao Mar 21, 2024
7db0e1b
use Dockerfile
youkaichao Mar 21, 2024
62650ae
try 2.2.0 first
youkaichao Mar 21, 2024
4ed16b9
place nccl install after vllm
youkaichao Mar 21, 2024
2d215df
patchelf
youkaichao Mar 21, 2024
0f6f243
update rpath for cupy
youkaichao Mar 21, 2024
da1df5e
try to write a custom pynccl
youkaichao Mar 22, 2024
b4085a1
add wget
youkaichao Mar 22, 2024
f77c9ae
delete logging code
youkaichao Mar 22, 2024
2766418
remove some debugging print
youkaichao Mar 22, 2024
0e18aed
use nccl 2.18.3
youkaichao Mar 23, 2024
7c531b0
add test for pynccl
youkaichao Mar 23, 2024
bbe3622
Merge remote-tracking branch 'origin' into fix_parallel_distributed_test
youkaichao Mar 23, 2024
1abf38e
fix linter
youkaichao Mar 23, 2024
5d661a6
update cupy_utils to pynccl
youkaichao Mar 23, 2024
99f96d7
rename cupy_utils to pynccl_utils
youkaichao Mar 23, 2024
b567f04
update import
youkaichao Mar 23, 2024
74fcf08
update pytorch in cmake
youkaichao Mar 23, 2024
43da101
add test with cudagraph
youkaichao Mar 23, 2024
37e7425
fix test; fix TORCH_CUDA_ARCH_LIST
youkaichao Mar 23, 2024
7e983f5
fix amd tests
youkaichao Mar 23, 2024
e3f8d5f
add pynccl test
youkaichao Mar 23, 2024
4e277ae
pack up libnccl.so
youkaichao Mar 23, 2024
a20d802
add .so in setup.py, and use programmatic path in pynccl
youkaichao Mar 23, 2024
dfc9d82
rename cupy --> pynccl
youkaichao Mar 23, 2024
8a5a011
rename cupy --> pynccl
youkaichao Mar 23, 2024
a009e31
rename cupy --> pynccl
youkaichao Mar 23, 2024
68e4792
rename cupy --> pynccl
youkaichao Mar 23, 2024
0a6fab1
fix wget install order
youkaichao Mar 23, 2024
a82a976
rename cupy --> pynccl
youkaichao Mar 23, 2024
1c6ec48
fix so filename and search path
youkaichao Mar 23, 2024
47ff82a
fix dockerfile
youkaichao Mar 23, 2024
b0c15c2
fix dockerfile
youkaichao Mar 23, 2024
0b4f7dd
download and use MANIFEST.in to force keeping .so file
youkaichao Mar 23, 2024
7942050
download and use MANIFEST.in to force keeping .so file
youkaichao Mar 23, 2024
20a3ec4
restore dockerfile
youkaichao Mar 23, 2024
0ca27b7
add lib file to package data
youkaichao Mar 23, 2024
a3c2340
add libnccl.so.2.18.3 via hard-coding
youkaichao Mar 23, 2024
71e2976
enable VLLM_NCCL_SO_PATH at runtime
youkaichao Mar 25, 2024
3d9332a
nit, os.makedirs(target_dir, exist_ok=True)
youkaichao Mar 25, 2024
76f46f6
upgrade to pt 2.2.1
youkaichao Mar 25, 2024
14 changes: 12 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -22,8 +22,18 @@ steps:
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Distributed Correctness Test
  command: pytest -v -s --forked test_basic_distributed_correctness.py
- label: Distributed pynccl Test
  command: pytest -v -s --forked test_pynccl.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Distributed Correctness Test-facebook/opt-125m
  command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf
  command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt.
cuda-version: ['11.8', '12.1']

steps:
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -15,6 +15,9 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# used when building pytorch-related extensions
set(TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0;8.6;8.9;9.0")
Member Author:
Note that PyTorch 2.2.0 has 9.0a support by default:

https://github.com/pytorch/pytorch/blob/19d27a13ea052230d9fb565a5b82e683e28d1697/Dockerfile#L60

while the nvcc in our Docker image does not support 9.0a.

Collaborator:
Doesn't this make our build system always compile the CUDA kernels for all architectures?

If I remember correctly, we only compiled the kernels for a single architecture by detecting the GPUs equipped on the user's machine (I'm not sure this is still true after we changed our build system to CMake, though), to reduce compile time. As an exception, we targeted all architectures when building Docker images or PyPI wheels.

Member Author:
This is used in the Docker image. It seems CMake inherits the build architecture list from PyTorch by default, so I have to set it explicitly (to avoid the 9.0a architecture, which the nvcc in our Docker image does not support).
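For illustration, a minimal sketch of the same idea at the Python level; the helper below and the rule of dropping `a`-suffixed arches are assumptions for illustration, not code from this PR:

```python
import os
import torch


def safe_torch_cuda_arch_list() -> str:
    """Build a TORCH_CUDA_ARCH_LIST without 'a' variants such as 9.0a."""
    arches = []
    # torch.cuda.get_arch_list() reports what the installed wheel was built
    # for, e.g. ['sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90', 'sm_90a'].
    for arch in torch.cuda.get_arch_list():
        name = arch[len("sm_"):]
        if name.endswith("a"):  # skip arch variants the Docker nvcc rejects
            continue
        arches.append(f"{name[:-1]}.{name[-1]}")  # '90' -> '9.0'
    return ";".join(arches)


# torch.utils.cpp_extension honors this env var; whether a given CMake build
# does depends on how it queries torch (an assumption in this sketch).
os.environ.setdefault("TORCH_CUDA_ARCH_LIST", safe_torch_cuda_arch_list())
```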


# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")

@@ -28,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

3 changes: 3 additions & 0 deletions Dockerfile
@@ -15,6 +15,9 @@ RUN ldconfig /usr/local/cuda-12.1/compat/

WORKDIR /workspace

# used for downloading files
RUN apt install -y wget unzip

# install build and runtime dependencies
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -4,3 +4,4 @@ include CMakeLists.txt

recursive-include cmake *
recursive-include csrc *
recursive-include vllm/lib *
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"ninja",
"packaging",
"setuptools >= 49.4.0",
"torch == 2.1.2",
"torch == 2.2.1",
"wheel",
]
build-backend = "setuptools.build_meta"
2 changes: 1 addition & 1 deletion requirements-build.txt
@@ -3,5 +3,5 @@ cmake>=3.21
ninja
packaging
setuptools>=49.4.0
torch==2.1.2
torch==2.2.1
wheel
4 changes: 2 additions & 2 deletions requirements.txt
@@ -4,9 +4,9 @@ psutil
ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
torch == 2.1.2
torch == 2.2.1
xformers == 0.0.25 # Requires PyTorch 2.2.1.
transformers >= 4.39.0 # Required for StarCoder2.
xformers == 0.0.23.post1 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
57 changes: 55 additions & 2 deletions setup.py
@@ -11,6 +11,12 @@
from shutil import which
import torch
from torch.utils.cpp_extension import CUDA_HOME
import zipfile
import shutil
import logging
import tempfile

logger = logging.getLogger(__name__)

ROOT_DIR = os.path.dirname(__file__)

@@ -188,6 +194,48 @@ def _install_punica() -> bool:
return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))


if _is_cuda():

# tricky part: NCCL 2.19 has a bug that increases the memory overhead of
# CUDA graphs. However, PyTorch has a binary dependency on NCCL 2.19, so
# simply running `pip install nvidia-nccl-cu12==2.18.3` would break PyTorch.
# We therefore download NCCL 2.18 manually and keep the library in a
# private location.

# Define the URL of the file and the directory to unzip to
file_url = ('https://files.pythonhosted.org/packages/44/6e/'
Collaborator:
consider using a constant?

Collaborator:
Actually, I wonder if we can support an env var so that we can also load an arbitrary .so instead of always downloading our own copy at build time.

Member Author:
Detecting an env var at runtime is good.

W.r.t. downloading our own copy at build time, we have to do this because the NCCL brought in by torch==2.2.0 does not work for us (see the runtime-loading sketch after the setup.py diff below).

'3c9cd7007072f8a63dae7b5eddd1cc1525fd357377467ce3a4749b02d5ff'
'/nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl')

logger.info('Installing NVIDIA NCCL library...')

target_dir = os.path.dirname(os.path.abspath(__file__)) + "/vllm/lib/"
with tempfile.TemporaryDirectory() as temp_dir:
local_zip_path = (
f"{temp_dir}/"
"nvidia_nccl_cu12-2.18.3-py3-none-manylinux1_x86_64.whl")
Collaborator:
Does vllm currently support amd arch (the wheel is only for x86)?

Member Author:
Do you mean arm64? We don't need to consider AMD here because we are inside the `if _is_cuda():` branch.

# make sure the target directory exists
os.makedirs(target_dir, exist_ok=True)
# Check if the file is already downloaded
if os.path.exists(target_dir + "nvidia"):
Collaborator:
nit, but

if os.path.exists(target_dir + "nvidia"):
    break

# Download the file
logger.info('Downloading file...')
....
....

Member Author:
We have no choice here, we cannot break or return, because it is not inside any function 👀

logger.info('library already exists.')
else:
# Download the file
logger.info('Downloading file...')
os.system(f"wget {file_url} -q -P {temp_dir}/")
# Unzip the file
logger.info('Unzipping file...')
with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
zip_ref.extractall(temp_dir)
shutil.rmtree(f"{temp_dir}/nvidia_nccl_cu12-2.18.3.dist-info")
os.remove(local_zip_path)
# Move the unzipped files to the target directory
logger.info('Moving files...')
os.system(f"mv {temp_dir}/nvidia {target_dir}")
so_path = f"{target_dir}/nvidia/nccl/lib/libnccl.so.2"
os.rename(so_path, so_path.replace(".so.2", ".so.2.18.3"))


def get_hipcc_rocm_version():
# Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'],
@@ -330,7 +378,10 @@ def get_requirements() -> List[str]:
ext_modules.append(CMakeExtension(name="vllm._C"))

package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
"vllm": [
"py.typed", "model_executor/layers/fused_moe/configs/*.json",
"lib/nvidia/nccl/lib/libnccl.so.2.18.3"
]
}
if os.environ.get("VLLM_USE_PRECOMPILED"):
package_data["vllm"].append("*.so")
@@ -362,6 +413,8 @@ def get_requirements() -> List[str]:
python_requires=">=3.8",
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
cmdclass={
"build_ext": cmake_build_ext if not _is_neuron() else build_ext,
},
package_data=package_data,
)
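Because the library is now bundled under `vllm/lib`, the runtime side needs a corresponding lookup. Below is a minimal sketch of that lookup, honoring the `VLLM_NCCL_SO_PATH` override mentioned in the commit log; the helper name and fallback order are illustrative assumptions, not the code merged in this PR:

```python
import ctypes
import glob
import os


def find_nccl_library() -> str:
    """Resolve a libnccl shared object, preferring an explicit override."""
    # 1. Explicit override, useful for system-provided or custom NCCL builds.
    so_path = os.environ.get("VLLM_NCCL_SO_PATH", "")
    if so_path:
        return so_path
    # 2. The copy that setup.py bundled under vllm/lib (see diff above).
    import vllm
    pkg_dir = os.path.dirname(vllm.__file__)
    candidates = glob.glob(
        os.path.join(pkg_dir, "lib", "nvidia", "nccl", "lib", "libnccl.so.*"))
    if candidates:
        return candidates[0]
    # 3. Fall back to whatever the dynamic linker can find.
    return "libnccl.so.2"


# Loading via ctypes keeps the wrapper independent of the NCCL that PyTorch
# itself links against.
nccl = ctypes.CDLL(find_nccl_library())
```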
16 changes: 13 additions & 3 deletions tests/distributed/test_basic_distributed_correctness.py
@@ -1,13 +1,23 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.

Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass the model name via an environment variable.
Run:

```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
    test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest \
    test_basic_distributed_correctness.py
```
"""
import os
import pytest
import torch

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
os.environ["TEST_DIST_MODEL"],
]


88 changes: 88 additions & 0 deletions tests/distributed/test_pynccl.py
@@ -0,0 +1,88 @@
# This test is run with `pytest` (see .buildkite/test-pipeline.yaml);
# each test spawns its own worker processes via `multiprocessing`.
import os
import multiprocessing
import pytest
import torch
from vllm.model_executor.parallel_utils.pynccl import (
    NCCLCommunicator,
    ncclGetUniqueId,
)


def distributed_run(fn, world_size):
    number_of_processes = world_size
    processes = []
    for i in range(number_of_processes):
        env = os.environ.copy()
        env['RANK'] = str(i)
        env['WORLD_SIZE'] = str(number_of_processes)
        env['MASTER_ADDR'] = 'localhost'
        env['MASTER_PORT'] = '12345'
        p = multiprocessing.Process(target=fn, args=(env, ))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()


def update_env(fn):

    def wrapper(env):
        import os
        os.environ.update(env)
        fn()

    return wrapper


@update_env
def worker_fn():
    comm = NCCLCommunicator()
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
    comm.all_reduce(tensor)
    result = tensor.mean().cpu().item()
    assert result == comm.world_size


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_pynccl():
    distributed_run(worker_fn, 2)


@update_env
def worker_fn_with_cudagraph():
    with torch.no_grad():
        graph = torch.cuda.CUDAGraph()
        comm = NCCLCommunicator()
        # run something in the default stream to initialize torch engine
        a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
        torch.cuda.synchronize()
        with torch.cuda.graph(graph, stream=comm.stream):
            # ops issued during graph capture are recorded, not executed
            comm.all_reduce(a)
        comm.stream.synchronize()
        assert a.mean().cpu().item() == comm.world_size**0
        graph.replay()
        comm.stream.synchronize()
        # one replay executes the captured all-reduce exactly once
        assert a.mean().cpu().item() == comm.world_size**1


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_pynccl_with_cudagraph():
    distributed_run(worker_fn_with_cudagraph, 2)


def test_ncclGetUniqueId():
    unique_id = ncclGetUniqueId()
    # `list(unique_id.internal)` is something like this:
    # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    # as long as the function doesn't raise an exception, we're good
    assert unique_id is not None
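The `pynccl` module imported above is added elsewhere in this PR and is not shown in this diff. As a heavily hedged sketch of the idea (a thin ctypes wrapper over the bundled `libnccl.so`, so that the collective can be issued from Python and captured into a CUDA graph), it might look roughly like the following; the class name, the gloo-based unique-id exchange, and the hard-coded NCCL enum values are illustrative assumptions rather than the PR's actual implementation:

```python
# Illustrative sketch only, not the pynccl module added by this PR.
import ctypes
import os

import torch
import torch.distributed as dist

# Path resolution as in the loader sketch above.
_nccl = ctypes.CDLL(os.environ.get("VLLM_NCCL_SO_PATH", "libnccl.so.2"))


class NcclUniqueId(ctypes.Structure):
    _fields_ = [("internal", ctypes.c_byte * 128)]


def nccl_get_unique_id() -> NcclUniqueId:
    uid = NcclUniqueId()
    assert _nccl.ncclGetUniqueId(ctypes.byref(uid)) == 0
    return uid


class PyNcclCommunicatorSketch:
    """Minimal NCCL communicator driven from Python via ctypes."""

    def __init__(self):
        self.rank = int(os.environ["RANK"])
        self.world_size = int(os.environ["WORLD_SIZE"])
        torch.cuda.set_device(self.rank)
        self.stream = torch.cuda.Stream(device=self.rank)

        # Rank 0 creates the unique id; a CPU (gloo) group broadcasts it,
        # reusing MASTER_ADDR/MASTER_PORT set by `distributed_run` above.
        dist.init_process_group(backend="gloo")
        uid = nccl_get_unique_id() if self.rank == 0 else NcclUniqueId()
        buf = torch.tensor([b & 0xFF for b in uid.internal],
                           dtype=torch.uint8)
        dist.broadcast(buf, src=0)
        ctypes.memmove(uid.internal, bytes(buf.tolist()), 128)

        self._comm = ctypes.c_void_p()
        assert _nccl.ncclCommInitRank(ctypes.byref(self._comm),
                                      self.world_size, uid, self.rank) == 0

    def all_reduce(self, tensor: torch.Tensor) -> None:
        # 7 == ncclFloat32, 0 == ncclSum in nccl.h.
        result = _nccl.ncclAllReduce(
            ctypes.c_void_p(tensor.data_ptr()),
            ctypes.c_void_p(tensor.data_ptr()),
            tensor.numel(), 7, 0, self._comm,
            ctypes.c_void_p(self.stream.cuda_stream))
        assert result == 0
```

With a wrapper of this shape, `worker_fn` above only needs to construct the communicator and call `all_reduce` on a CUDA tensor.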
8 changes: 4 additions & 4 deletions vllm/model_executor/parallel_utils/communication_op.py
@@ -4,12 +4,12 @@
import torch
from torch.distributed import ProcessGroup

from vllm.model_executor.parallel_utils import cupy_utils
from vllm.model_executor.parallel_utils import pynccl_utils
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_tensor_model_parallel_group,
is_cupy_nccl_enabled_for_all_reduce,
is_pynccl_enabled_for_all_reduce,
)
from vllm.model_executor.parallel_utils.custom_all_reduce import (
custom_all_reduce)
@@ -33,9 +33,9 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
    out = custom_all_reduce(input_)
    if out is not None:
        return out
    if is_cupy_nccl_enabled_for_all_reduce():
    if is_pynccl_enabled_for_all_reduce():
        # TODO: support multiple parallel groups.
        cupy_utils.all_reduce(input_)
        pynccl_utils.all_reduce(input_)
    else:
        torch.distributed.all_reduce(input_,
                                     group=get_tensor_model_parallel_group())
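For reference, `pynccl_utils` keeps the thin module-level shape that `cupy_utils` had. A minimal sketch of that shape, with illustrative names and under the same assumptions as the earlier sketches:

```python
# Illustrative sketch of the module-level wrapper, not the PR's exact code.
from typing import Optional

import torch

from vllm.model_executor.parallel_utils.pynccl import NCCLCommunicator

_COMM: Optional[NCCLCommunicator] = None  # set during distributed init


def init_process_group() -> None:
    global _COMM
    _COMM = NCCLCommunicator()


def is_initialized() -> bool:
    return _COMM is not None


def all_reduce(input_: torch.Tensor) -> None:
    assert _COMM is not None, "pynccl is not initialized"
    _COMM.all_reduce(input_)  # in-place all-reduce on the comm's stream
```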