23 changes: 12 additions & 11 deletions Dockerfile.rocm
@@ -1,15 +1,16 @@
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
ARG BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2"

FROM $BASE_IMAGE

ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
ARG BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2"

RUN echo "Base image is $BASE_IMAGE"

# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

# Tested and supported base rocm/pytorch images
ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
ROCM_6_1_BASE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2"

ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
@@ -68,15 +69,15 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
&& if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
&& python3 setup.py install \
&& cd ..; \
fi

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

# build triton
@@ -107,11 +108,11 @@ ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
&& export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \
&& cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \
&& cd ..
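
Note on the new copy step above: instead of hard-coding the `cpython-39` build directory, the image now derives the interpreter tag at build time and globs all extension modules. A minimal sketch of what the exported `VLLM_PYTHON_VERSION` one-liner evaluates to (assuming the Python 3.10 interpreter of the new default base image):

```python
import sys

# Same expression as the VLLM_PYTHON_VERSION one-liner in Dockerfile.rocm:
# concatenate major and minor version, e.g. "310" on Python 3.10.
version_tag = str(sys.version_info.major) + str(sys.version_info.minor)
print(version_tag)  # -> "310"

# Directory the wildcard copy reads the compiled *.so files from:
print(f"build/lib.linux-x86_64-cpython-{version_tag}/vllm/")
```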


1 change: 1 addition & 0 deletions README.md
@@ -27,6 +27,7 @@ Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/

*Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/06] Added ROCm 6.1 support to vLLM.
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
5 changes: 4 additions & 1 deletion cmake/utils.cmake
@@ -155,8 +155,11 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
# Find the intersection of the supported + detected architectures to
# set the module architecture flags.
#

set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")

set(${GPU_ARCHES})
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
foreach (_ARCH ${ROCM_SUPPORTED_ARCHS})
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
list(APPEND ${GPU_ARCHES} ${_ARCH})
endif()
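
The new loop above computes an ordered intersection of the hardcoded `ROCM_SUPPORTED_ARCHS` list with the project's supported architectures, instead of relying on whatever `CMAKE_HIP_ARCHITECTURES` detects. As a plain-Python illustration of that logic (the value of `_GPU_SUPPORTED_ARCHES_LIST` below is hypothetical):

```python
# Keep each entry of ROCM_SUPPORTED_ARCHS that also appears in the project's
# supported list, preserving order; this mirrors the foreach/IN_LIST loop.
rocm_supported_archs = ["gfx908", "gfx90a", "gfx942", "gfx1100"]
gpu_supported_arches = {"gfx908", "gfx90a", "gfx942", "gfx1100"}  # hypothetical

gpu_arches = [arch for arch in rocm_supported_archs if arch in gpu_supported_arches]
print(gpu_arches)  # ['gfx908', 'gfx90a', 'gfx942', 'gfx1100']
```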
15 changes: 11 additions & 4 deletions docs/source/getting_started/amd-installation.rst
@@ -11,7 +11,7 @@ Requirements
* OS: Linux
* Python: 3.8 -- 3.11
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
* ROCm 6.0 and ROCm 5.7
* ROCm 6.1, 6.0 and ROCm 5.7

Installation options:

@@ -27,7 +27,7 @@ You can build and install vLLM from source.

First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.

`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.1 by default, but also supports ROCm 6.0 and ROCm 5.7.
It provides flexibility to customize the build of docker image using the following arguments:

* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
@@ -39,18 +39,25 @@ It provides flexibility to customize the build of docker image using the following
Their values can be passed in when running ``docker build`` with ``--build-arg`` options.


To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:

.. code-block:: console

$ docker build -f Dockerfile.rocm -t vllm-rocm .

To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:

.. code-block:: console

$ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .

To build docker image for vllm on ROCm 6.0, you can specify ``BASE_IMAGE`` as below:

.. code-block:: console

$ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
-f Dockerfile.rocm -t vllm-rocm .

To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:

.. code-block:: console
5 changes: 4 additions & 1 deletion tests/distributed/test_basic_distributed_correctness.py
@@ -17,6 +17,8 @@
import pytest
import torch

from vllm.utils import is_hip

MODELS = [
os.environ["TEST_DIST_MODEL"],
]
@@ -40,7 +42,8 @@ def test_models(
distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
enforce_eager = backend_by_env_var == "FLASHINFER"
# TODO revisit after hipgraph is fully supported on multi-gpus
enforce_eager = is_hip() or backend_by_env_var == "FLASHINFER"

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
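
For context, the `vllm.utils.is_hip` helper imported here (and in the other test changes below) reports whether PyTorch was built for ROCm/HIP. A minimal sketch of such a check, assuming it keys off `torch.version.hip` (not necessarily the exact vLLM implementation):

```python
import torch


def is_hip() -> bool:
    """Return True when PyTorch was built against ROCm/HIP rather than CUDA."""
    # torch.version.hip is a version string on ROCm builds and None otherwise.
    return torch.version.hip is not None
```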
3 changes: 3 additions & 0 deletions tests/distributed/test_chunked_prefill_distributed.py
@@ -16,6 +16,8 @@
import pytest
import torch

from vllm.utils import is_hip

MODELS = [
os.environ["TEST_DIST_MODEL"],
]
@@ -52,6 +54,7 @@ def test_models(
model,
dtype=dtype,
tensor_parallel_size=2,
enforce_eager=is_hip(),
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
21 changes: 21 additions & 0 deletions tests/quantization/test_compressed_tensors.py
@mawong-amd (Contributor) commented on Jun 13, 2024:

The quantization test errors are unrelated to ROCm. We should not unilaterally skip tests that are also broken upstream: when those are fixed, we will lose test coverage for no reason.

The PR author (Collaborator) replied:

(1) I got this error when I ran this test: "ValueError: compressed-tensors quantization is currently not supported in ROCm." ROCm does not support it right now.
(2) We should revisit the test once ROCm supports it. Eventually we will need to borrow PyTorch's practice of bookkeeping all skipped tests and doing periodic parity reviews.

@@ -11,9 +11,14 @@
CompressedTensorsLinearMethod, CompressedTensorsW4A16,
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken,
CompressedTensorsW8A8StaticTensor)
from vllm.utils import is_hip


def test_compressed_tensors_w8a8_static_setup(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
with vllm_runner(model_path, enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
@@ -43,6 +48,10 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):


def test_compressed_tensors_no_enforce_eager(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
with vllm_runner(model_path) as llm:
sampling_params = SamplingParams()
@@ -51,6 +60,10 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):


def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
with vllm_runner(model_path, dtype=torch.float16) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
@@ -68,6 +81,10 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
])
def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model, strategy, group = w4a16_args
with vllm_runner(model) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
@@ -86,6 +103,10 @@ def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):


def test_compressed_tensors_w4a16_marlin24(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
with vllm_runner(model_path) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
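
Following up on the review discussion above about keeping skipped tests easy to audit, a possible alternative to repeating the in-function skip in every test (a sketch only, not what this PR does) is a shared `pytest.mark.skipif` marker:

```python
import pytest

from vllm.utils import is_hip

# A single shared marker keeps the skip condition and reason in one place,
# which makes the skips easy to find and remove once ROCm gains support.
skip_if_rocm = pytest.mark.skipif(
    is_hip(),
    reason="compressed-tensors quantization currently not supported in ROCm.")


@skip_if_rocm
def test_compressed_tensors_w8a8_static_setup(vllm_runner):
    ...
```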
3 changes: 3 additions & 0 deletions tests/test_sharded_state_loader.py
@@ -9,6 +9,7 @@

from vllm import LLM, SamplingParams
from vllm.model_executor.model_loader.loader import ShardedStateLoader
from vllm.utils import is_hip

prompts = [
"Hello, my name is",
@@ -106,6 +107,7 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
enable_lora=enable_lora,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,
enforce_eager=is_hip(),
))
p.start()
p.join()
@@ -119,6 +121,7 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,
load_format="sharded_state",
enforce_eager=is_hip(),
))
p.start()
p.join()