From a144bed4013e94dbe25ef79e3642378ab959b3fc Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud <csy@modelcloud.ai>
Date: Sat, 18 Jan 2025 09:42:07 +0800
Subject: [PATCH 1/6] check CUDA compute capability >= 8.0 for marlin

---
 gptqmodel/nn_modules/qlinear/marlin.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py
index af03bc823..1c3501f57 100644
--- a/gptqmodel/nn_modules/qlinear/marlin.py
+++ b/gptqmodel/nn_modules/qlinear/marlin.py
@@ -31,6 +31,8 @@
 except ImportError as e:
     marlin_import_exception = e
 
+HAS_CUDA_V8 = any(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count()))
+
 GPTQ_MARLIN_TILE = 16
 GPTQ_MARLIN_MIN_THREAD_N = 64
 GPTQ_MARLIN_MIN_THREAD_K = 128
@@ -307,6 +309,8 @@ def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeat
     def validate(cls, **args) -> Tuple[bool, Optional[Exception]]:
         if IS_ROCM:
             return False, RuntimeError("marlin kernel is not supported by rocm.")
+        if not HAS_CUDA_V8:
+            return False, RuntimeError("marlin kernel requires CUDA version >= 8.")
         if marlin_import_exception is not None:
             return False, marlin_import_exception
         return cls._validate(**args)

From d7190e344c433b2a42767b492cdeb95b16ce133f Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud <csy@modelcloud.ai>
Date: Sat, 18 Jan 2025 09:48:28 +0800
Subject: [PATCH 2/6] check CUDA compute capability >= 8.0 at installation

---
 setup.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/setup.py b/setup.py
index e9c6224f8..c0db1aa4e 100644
--- a/setup.py
+++ b/setup.py
@@ -125,6 +125,8 @@ def get_version_tag() -> str:
 import torch  # noqa: E402
 
 if TORCH_CUDA_ARCH_LIST is None:
+    HAS_CUDA_V8 = any(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count()))
+
     got_cuda_v6 = any(torch.cuda.get_device_capability(i)[0] >= 6 for i in range(torch.cuda.device_count()))
     got_cuda_between_v6_and_v8 = any(6 <= torch.cuda.get_device_capability(i)[0] < 8 for i in range(torch.cuda.device_count()))
 
@@ -139,7 +141,8 @@ def get_version_tag() -> str:
     if BUILD_CUDA_EXT and not FORCE_BUILD:
         if got_cuda_between_v6_and_v8:
             FORCE_BUILD = True
-
+else:
+    HAS_CUDA_V8 = len([arch for arch in TORCH_CUDA_ARCH_LIST.split() if float(arch.split('+')[0]) >= 8]) > 0
 
 if RELEASE_MODE == "1":
     common_setup_kwargs["version"] += f"+{get_version_tag()}"
@@ -217,21 +220,20 @@ def get_version_tag() -> str:
         ),
     ]
 
-    if sys.platform != "win32":
-        # TODO: VC++: fatal error C1061: compiler limit : blocks nested too deeply
-        marlin_kernel = cpp_ext.CUDAExtension(
-            "gptqmodel_marlin_kernels",
-            [
-                "gptqmodel_ext/marlin/marlin_cuda.cpp",
-                "gptqmodel_ext/marlin/marlin_cuda_kernel.cu",
-                "gptqmodel_ext/marlin/marlin_repack.cu",
-            ],
-            extra_link_args=extra_link_args,
-            extra_compile_args=extra_compile_args,
-        )
+    if sys.platform != "win32":# TODO: VC++: fatal error C1061: compiler limit : blocks nested too deeply
         # https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.1.0/tables/CUDA_Device_API_supported_by_HIP.html
         # nv_bfloat16 and nv_bfloat162 (2x bf16) missing replacement in ROCm
-        if not ROCM_VERSION:
+        if HAS_CUDA_V8 and not ROCM_VERSION:
+            marlin_kernel = cpp_ext.CUDAExtension(
+                "gptqmodel_marlin_kernels",
+                [
+                    "gptqmodel_ext/marlin/marlin_cuda.cpp",
+                    "gptqmodel_ext/marlin/marlin_cuda_kernel.cu",
+                    "gptqmodel_ext/marlin/marlin_repack.cu",
+                ],
+                extra_link_args=extra_link_args,
+                extra_compile_args=extra_compile_args,
+            )
             extensions.append(marlin_kernel)
         extensions += [
             # TODO: VC++: error lnk2001 unresolved external symbol cublasHgemm

From a59d222d5dd8f08057e85b0139008f8eb1a27014 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud <csy@modelcloud.ai>
Date: Sat, 18 Jan 2025 09:53:49 +0800
Subject: [PATCH 3/6] update marlin validation error message to say Compute Capability

---
 gptqmodel/nn_modules/qlinear/marlin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py
index 1c3501f57..5cf947157 100644
--- a/gptqmodel/nn_modules/qlinear/marlin.py
+++ b/gptqmodel/nn_modules/qlinear/marlin.py
@@ -310,7 +310,7 @@ def validate(cls, **args) -> Tuple[bool, Optional[Exception]]:
         if IS_ROCM:
             return False, RuntimeError("marlin kernel is not supported by rocm.")
         if not HAS_CUDA_V8:
-            return False, RuntimeError("marlin kernel requires CUDA version >= 8.")
+            return False, RuntimeError("marlin kernel requires Compute Capability >= 8.0.")
         if marlin_import_exception is not None:
             return False, marlin_import_exception
         return cls._validate(**args)

From b164e6d4f02eea8eba5e63026964943c1f044eff Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud <csy@modelcloud.ai>
Date: Sat, 18 Jan 2025 09:56:30 +0800
Subject: [PATCH 4/6] print a message when the marlin kernel build is skipped

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index c0db1aa4e..62f136b73 100644
--- a/setup.py
+++ b/setup.py
@@ -235,6 +235,8 @@ def get_version_tag() -> str:
                 extra_compile_args=extra_compile_args,
             )
             extensions.append(marlin_kernel)
+        elif not HAS_CUDA_V8:
+            print(f"marlin kernel only supports compute capability >= 8.0, there's no such cuda device, skipped.")
         extensions += [
             # TODO: VC++: error lnk2001 unresolved external symbol cublasHgemm
             cpp_ext.CUDAExtension(

From b44b227ed511eb25e051b6b34d05771c7f01da3e Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud <csy@modelcloud.ai>
Date: Sat, 18 Jan 2025 10:25:05 +0800
Subject: [PATCH 5/6] check rocm first

---
 gptqmodel/nn_modules/qlinear/marlin.py | 3 +--
 setup.py                               | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py
index 5cf947157..0276dde63 100644
--- a/gptqmodel/nn_modules/qlinear/marlin.py
+++ b/gptqmodel/nn_modules/qlinear/marlin.py
@@ -31,7 +31,6 @@
 except ImportError as e:
     marlin_import_exception = e
 
-HAS_CUDA_V8 = any(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count()))
 
 GPTQ_MARLIN_TILE = 16
 GPTQ_MARLIN_MIN_THREAD_N = 64
@@ -309,7 +308,7 @@ def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeat
     def validate(cls, **args) -> Tuple[bool, Optional[Exception]]:
         if IS_ROCM:
             return False, RuntimeError("marlin kernel is not supported by rocm.")
-        if not HAS_CUDA_V8:
+        if not any(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count())):
             return False, RuntimeError("marlin kernel requires Compute Capability >= 8.0.")
         if marlin_import_exception is not None:
             return False, marlin_import_exception
diff --git a/setup.py b/setup.py
index 62f136b73..041035011 100644
--- a/setup.py
+++ b/setup.py
@@ -142,7 +142,7 @@ def get_version_tag() -> str:
         if got_cuda_between_v6_and_v8:
             FORCE_BUILD = True
 else:
-    HAS_CUDA_V8 = len([arch for arch in TORCH_CUDA_ARCH_LIST.split() if float(arch.split('+')[0]) >= 8]) > 0
+    HAS_CUDA_V8 = ROCM_VERSION and len([arch for arch in TORCH_CUDA_ARCH_LIST.split() if float(arch.split('+')[0]) >= 8]) > 0
 
 if RELEASE_MODE == "1":
     common_setup_kwargs["version"] += f"+{get_version_tag()}"

From 7d749b3021a63fed5aee1e7fcc400cdab69d1f12 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud <csy@modelcloud.ai>
Date: Sat, 18 Jan 2025 10:26:49 +0800
Subject: [PATCH 6/6] check not ROCM_VERSION

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 041035011..f48170368 100644
--- a/setup.py
+++ b/setup.py
@@ -142,7 +142,7 @@ def get_version_tag() -> str:
         if got_cuda_between_v6_and_v8:
             FORCE_BUILD = True
 else:
-    HAS_CUDA_V8 = ROCM_VERSION and len([arch for arch in TORCH_CUDA_ARCH_LIST.split() if float(arch.split('+')[0]) >= 8]) > 0
+    HAS_CUDA_V8 = not ROCM_VERSION and len([arch for arch in TORCH_CUDA_ARCH_LIST.split() if float(arch.split('+')[0]) >= 8]) > 0
 
 if RELEASE_MODE == "1":
     common_setup_kwargs["version"] += f"+{get_version_tag()}"