Commit ecb6cce

Add FA3 (#4166)
* add fa3
* separate hopper
* use pre-built FA3
* FA3 for cu128 only
* simplify
* remove hopper image, add dlslime
* FA3 for cu128, cu130
* skip Dive for cu13
1 parent 4abccaf commit ecb6cce

File tree

3 files changed (+15 −7 lines):

- .github/workflows/test_docker.yml
- docker/install.sh
- docker/prepare_wheel.sh

.github/workflows/test_docker.yml

Lines changed: 1 addition & 0 deletions

@@ -56,6 +56,7 @@ jobs:
           docker images
           docker run --rm lmdeploy:latest lmdeploy check_env
       - name: Dive
+        if: ${{ matrix.cuda_version == 'cu12' }}
         uses: MaxymVlasov/[email protected]
         with:
           image: lmdeploy:latest
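The added `if:` condition runs the Dive image-analysis step only when the matrix CUDA version is `cu12`, so cu13 builds skip it (the commit message's "skip Dive for cu13"). The GitHub Actions runner evaluates that expression itself; as an illustrative sketch only, the same predicate expressed in shell, with `MATRIX_CUDA_VERSION` a hypothetical stand-in for the workflow's `matrix.cuda_version`:

```shell
# Illustrative sketch: mirrors the workflow's `if:` gate in shell.
# MATRIX_CUDA_VERSION is a hypothetical stand-in for matrix.cuda_version.
should_run_dive() {
    [ "$1" = "cu12" ]    # Dive runs only for the cu12 matrix entry
}

if should_run_dive "${MATRIX_CUDA_VERSION:-cu12}"; then
    echo "run Dive image analysis"
else
    echo "skip Dive"
fi
```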

docker/install.sh

Lines changed: 14 additions & 6 deletions

@@ -25,11 +25,11 @@ popd >/dev/null
 if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
     apt-get install -y --no-install-recommends cuda-minimal-build-11-8
 elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
-    apt-get install -y --no-install-recommends cuda-minimal-build-12-4 dkms
+    apt-get install -y --no-install-recommends cuda-minimal-build-12-4 numactl dkms
 elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
-    apt-get install -y --no-install-recommends cuda-minimal-build-12-8 dkms
+    apt-get install -y --no-install-recommends cuda-minimal-build-12-8 numactl dkms
 elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
-    apt-get install -y --no-install-recommends cuda-minimal-build-13-0 dkms
+    apt-get install -y --no-install-recommends cuda-minimal-build-13-0 numactl dkms
 fi

 apt-get clean -y

@@ -66,12 +66,20 @@ fi
 pip install torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
 pip install /wheels/*.whl

-
 if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]] && [[ "${PYTHON_VERSION}" != "3.9" ]]; then
-    pip install cuda-python dlblas==0.0.6
+    pip install cuda-python dlblas==0.0.6 dlslime==0.0.1.post10
+fi
+
+# install pre-built flash attention 3 wheel
+if [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
+    FA3_WHEELS_URL="https://windreamer.github.io/flash-attention3-wheels/cu128_torch280"
+    pip install flash_attn_3 --find-links ${FA3_WHEELS_URL} --extra-index-url https://download.pytorch.org/whl/cu128
+elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
+    FA3_WHEELS_URL="https://windreamer.github.io/flash-attention3-wheels/cu130_torch290"
+    pip install flash_attn_3 --find-links ${FA3_WHEELS_URL} --extra-index-url https://download.pytorch.org/whl/cu130
 fi

-# install pre-compiled flash attention wheel
+# install pre-built flash attention wheel
 PLATFORM="linux_x86_64"
 PY_VERSION=$(python3 - <<'PY'
 import torch, sys
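The new FA3 block gates the install on `CUDA_VERSION_SHORT` and picks a matching wheel index: the `--find-links` page supplies the pre-built `flash_attn_3` wheels, while `--extra-index-url` lets pip resolve any dependencies from the matching PyTorch index. A minimal standalone sketch of that selection logic, with the pip command replaced by an echo so it can run without network access (the `select_fa3_wheels_url` helper is hypothetical, not part of the script):

```shell
# Sketch of the CUDA-version gating used by the FA3 install step.
# select_fa3_wheels_url is a hypothetical helper; CUDA_VERSION_SHORT
# is assumed to be set by the surrounding Docker build, as in install.sh.
select_fa3_wheels_url() {
    case "$1" in
        cu128) echo "https://windreamer.github.io/flash-attention3-wheels/cu128_torch280" ;;
        cu130) echo "https://windreamer.github.io/flash-attention3-wheels/cu130_torch290" ;;
        *)     echo "" ;;   # no pre-built FA3 wheel for other CUDA versions
    esac
}

FA3_WHEELS_URL=$(select_fa3_wheels_url "${CUDA_VERSION_SHORT:-cu128}")
if [[ -n "${FA3_WHEELS_URL}" ]]; then
    echo "would run: pip install flash_attn_3 --find-links ${FA3_WHEELS_URL}"
fi
```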

docker/prepare_wheel.sh

Lines changed: 0 additions & 1 deletion

@@ -23,7 +23,6 @@ if [[ ${PYTHON_VERSION} = "3.13" ]]; then
 fi

 if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
-
     GDRCOPY_VERSION=2.5.1
     DEEP_EP_VERSION=9af0e0d # v1.2.1
     DEEP_GEMM_VERSION=c9f8b34 # v2.1.1.post3
