
Commit 620fcf1

check compute capability for marlin in validate_device() (#1095)
* check cuda v8 for marlin
* check cuda 8 for installation
* update msg
* update skip marlin msg
* check rocm first
* check not ROCM_VERSION
* check compute capability with validate_device
* check rocm
* check all devices' capability
* use local model path
* Update marlin.py

---------

Co-authored-by: Qubitium-ModelCloud <[email protected]>
1 parent 0801e1a commit 620fcf1

File tree

2 files changed: +28 -15 lines changed

gptqmodel/nn_modules/qlinear/marlin.py

Lines changed: 18 additions & 5 deletions

@@ -15,6 +15,7 @@
 
 # Adapted from vllm at https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/gptq_marlin.py
 
+import os
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -306,14 +307,26 @@ def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeat
 
     @classmethod
     def validate(cls, **args) -> Tuple[bool, Optional[Exception]]:
-        if IS_ROCM:
-            return False, RuntimeError("marlin kernel is not supported by rocm.")
-        if not any(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count())):
-            return False, RuntimeError("marlin kernel requires Compute Capability >= 8.0.")
         if marlin_import_exception is not None:
             return False, marlin_import_exception
         return cls._validate(**args)
 
+    @classmethod
+    def validate_device(cls, device: DEVICE):
+        super().validate_device(device)
+        CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if device == DEVICE.CUDA:
+            if IS_ROCM:
+                raise NotImplementedError("Marlin kernel is not supported on ROCm.")
+
+            if CUDA_VISIBLE_DEVICES is None:
+                has_cuda_v8 = all(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count()))
+            else:
+                has_cuda_v8 = all(torch.cuda.get_device_capability(int(i))[0] >= 8 for i in CUDA_VISIBLE_DEVICES.split(","))
+
+            if not has_cuda_v8:
+                raise NotImplementedError("Marlin kernel only supports compute capability >= 8.0.")
+
     def post_init(self):
         device = self.qweight.device
         # Allocate marlin workspace
@@ -420,4 +433,4 @@ def dequantize_qzeros(layer):
 
     return unpacked_qzeros
 
-__all__ = ["MarlinQuantLinear"]
+__all__ = ["MarlinQuantLinear"]
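
For context, here is a minimal standalone sketch of the check that the new validate_device() performs, mirroring the commit's logic; the helper name marlin_capability_ok is hypothetical and only torch is assumed:

import os

import torch


def marlin_capability_ok() -> bool:
    # Mirrors MarlinQuantLinear.validate_device(): every visible CUDA device
    # must report compute capability >= 8.0 (Ampere or newer) for Marlin.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible is None:
        # No mask set: check every device torch can enumerate.
        device_ids = range(torch.cuda.device_count())
    else:
        # Mask set: check exactly the indices listed in CUDA_VISIBLE_DEVICES,
        # as the commit does, so hidden GPUs do not affect the result.
        device_ids = [int(i) for i in visible.split(",")]
    return all(torch.cuda.get_device_capability(i)[0] >= 8 for i in device_ids)
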

tests/test_q4_marlin.py

Lines changed: 10 additions & 10 deletions

@@ -32,39 +32,39 @@ class TestQ4Marlin(unittest.TestCase):
     @parameterized.expand(
         [
             # act_order==False, group_size=128
-            ("TheBloke/Llama-2-7B-GPTQ", "main",
+            ("/monster/data/model/Llama-2-7B-GPTQ", "main",
              "<s> I am in Paris and I am in love. everybody knows that.\n"
              "I am in Paris and I am in love.\n"
              "I am in Paris and I am in love. everybody knows that.\n"
              "I am in Paris and I am in love. everybody knows that.\n"
              "I am in Paris and I am in love"),
 
             # act_order==True, group_size=128
-            ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main",
+            ("/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "main",
              "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"),
             # act_order==True, group_size=64
-            ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True",
+            ("/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq-4bit-64g-actorder_True",
              "<s> I am in Paris and I am so happy to be here. I have been here for 10 years and I have never been happier. I have been here for 10 years and I have never been happier. I have been here for 10 years and I have never been happier. I"),
             # act_order==True, group_size=32
-            ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True",
+            ("/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq-4bit-32g-actorder_True",
              "<s> I am in Paris and I am in love with you.\n"
              "\n"
              "Scene 2:\n"
              "\n"
              "(The stage is now dark, with only the sound of the rain falling on the windowpane. The lights come up on a young couple, JESSICA and JASON, sitting on a park ben"),
 
             # # 8-bit, act_order==True, group_size=channelwise
-            ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True",
+            ("/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq-8bit--1g-actorder_True",
              "<s> I am in Paris and I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy"),
             # # 8-bit, act_order==True, group_size=128
-            ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True",
+            ("/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq-8bit-128g-actorder_True",
              "<s> I am in Paris and I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy to be here. I am so happy"),
             # # 8-bit, act_order==True, group_size=32
-            ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True",
+            ("/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq-8bit-32g-actorder_True",
              "<s> I am in Paris and I am looking for a good restaurant for a special occasion. Can you recommend any restaurants in Paris that are known for their specialties? I am looking for something unique and special. Please let me know if you have any recommendations."),
 
             # # 4-bit, act_order==True, group_size=128
-            ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main",
+            ("/monster/data/model/gemma-1.1-2b-it-GPTQ", "main",
              "<bos>I am in Paris and I am looking for a good bakery with fresh bread.\n"
              "\n"
              "**What are some good bakeries in Paris with fresh bread?**\n"
@@ -76,12 +76,12 @@ class TestQ4Marlin(unittest.TestCase):
              "* I am open to both traditional bakeries and newer, trendy")
         ]
     )
-    def test_generation(self, model_id, revision, reference_output):
+    def test_generation(self, model_id, reference_output):
         prompt = "I am in Paris and"
         device = torch.device("cuda:0")
 
         try:
-            model_q = GPTQModel.load(model_id, revision=revision, device="cuda:0", backend=BACKEND.MARLIN)
+            model_q = GPTQModel.load(model_id, device="cuda:0", backend=BACKEND.MARLIN)
         except ValueError as e:
             raise e
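
A minimal sketch, not from the commit, of how the new guard surfaces to callers; it assumes a CUDA build of gptqmodel, and the checkpoint path is a placeholder:

from gptqmodel import BACKEND, GPTQModel

try:
    # With this commit, the Marlin path raises NotImplementedError from
    # validate_device() when the target device is unsupported.
    model = GPTQModel.load(
        "/path/to/a-gptq-quantized-model",  # placeholder path
        device="cuda:0",
        backend=BACKEND.MARLIN,
    )
except NotImplementedError as e:
    # Raised on ROCm, or when any visible GPU has compute capability < 8.0.
    print(f"Marlin backend unavailable: {e}")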