From 179fc6b2976caa2e7edf0f5fc259dc4d9dbca3de Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 15:52:03 +0800
Subject: [PATCH 1/5] [CI] fix sglang test name

---
 .github/workflows/unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 34d466be4..948b2b070 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -563,7 +563,7 @@ jobs:
           uv pip install colorlog
         fi
 
-        if [[ "${{ matrix.test_script }}" == "test_sglang.py" ]]; then
+        if [[ "${{ matrix.test_script }}" == "test_sglang" ]]; then
           uv pip install transformers==4.48.3
         fi
 

From c926734c70557d660db6aa9488b50738653f9a08 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 15:54:57 +0800
Subject: [PATCH 2/5] [CI] don't pass gpu to ipex

---
 .github/workflows/unit_tests.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 948b2b070..c6ab3d019 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -631,6 +631,9 @@ jobs:
     - name: Run tests
       if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
       run: |
+        if [[ "${{ matrix.test_script }}" == *ipex* ]]; then
+          export CUDA_VISIBLE_DEVICES=""
+        fi
         if [[ "${{ matrix.test_script }}" == *xpu* ]]; then
           export CUDA_VISIBLE_DEVICES=""
           source /etc/profile.d/pyenv.sh && pyenv activate xpu

From c19a94df27c42dfdd4c28a9868bbd2185434c6d9 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 15:58:49 +0800
Subject: [PATCH 3/5] [CI] print status after getting gpu

---
 .github/workflows/unit_tests.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index c6ab3d019..ec2cba93e 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -417,6 +417,7 @@ jobs:
           if [ "$gpu_id" -lt 0 ]; then
             echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }} returned $gpu_id"
             echo "No available GPU, waiting 5 seconds..."
+            curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
             sleep 5
           else
             echo "Allocated GPU ID: $gpu_id"
@@ -428,6 +429,7 @@ jobs:
         echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
         echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV
         echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
+        curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
     - name: Run tests
       if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
       run: |
@@ -616,6 +618,7 @@ jobs:
           if [ "$gpu_id" -lt 0 ]; then
             echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }} returned $gpu_id"
             echo "No available GPU, waiting 5 seconds..."
+            curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
             sleep 5
           else
             echo "Allocated GPU ID: $gpu_id"
@@ -627,6 +630,7 @@ jobs:
         echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
         echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV
         echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
+        curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
 
     - name: Run tests
       if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}

From 7994f20b1715ac628b545cf1fa722e30a8dbcfe1 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 16:02:52 +0800
Subject: [PATCH 4/5] remove packing test for exllama

---
 tests/test_packing.py | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/tests/test_packing.py b/tests/test_packing.py
index e8d377c08..88d5da5f6 100644
--- a/tests/test_packing.py
+++ b/tests/test_packing.py
@@ -98,38 +98,3 @@ def pack(self, qlinearCls):
         qlinear.pack(self.linear, self.s.T, self.zeros.T, g_idx=None)
 
         return qlinear
-
-    def test_compare_exllama_triton_torch(self):
-        # validate exllama packer
-        exllama_linear = self.pack(ExllamaQuantLinear)
-
-        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(exllama_linear)
-        dequantized_weight = dequantized_weight.to(torch.float16)
-
-        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
-        self.assertTrue(torch.all(dequantized_qzeros == 8))
-
-        triton_linear = self.pack(TritonV2QuantLinear)
-
-        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(triton_linear)
-        dequantized_weight = dequantized_weight.to(torch.float16)
-
-        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
-        self.assertTrue(torch.all(dequantized_qzeros == 8))
-
-        self.assertTrue(torch.allclose(exllama_linear.qweight, triton_linear.qweight))
-        self.assertTrue(torch.allclose(exllama_linear.scales, triton_linear.scales))
-        self.assertTrue(torch.allclose(exllama_linear.qzeros, triton_linear.qzeros))
-
-        # validate torch packer
-        torch_linear = self.pack(TorchQuantLinear)
-
-        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(torch_linear)
-        dequantized_weight = dequantized_weight.to(torch.float16)
-
-        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
-        self.assertTrue(torch.all(dequantized_qzeros == 8))
-
-        self.assertTrue(torch.allclose(exllama_linear.qweight, torch_linear.qweight))
-        self.assertTrue(torch.allclose(exllama_linear.scales, torch_linear.scales))
-        self.assertTrue(torch.allclose(exllama_linear.qzeros, torch_linear.qzeros))

From 7b7d96ecb126546e35e307d624a263d53f5387dd Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 16:17:09 +0800
Subject: [PATCH 5/5] add triton & torch test back

---
 tests/test_packing.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/test_packing.py b/tests/test_packing.py
index 88d5da5f6..72727eee1 100644
--- a/tests/test_packing.py
+++ b/tests/test_packing.py
@@ -98,3 +98,25 @@ def pack(self, qlinearCls):
         qlinear.pack(self.linear, self.s.T, self.zeros.T, g_idx=None)
 
         return qlinear
+
+    def test_compare_exllama_triton_torch(self):
+        triton_linear = self.pack(TritonV2QuantLinear)
+
+        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(triton_linear)
+        dequantized_weight = dequantized_weight.to(torch.float16)
+
+        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
+        self.assertTrue(torch.all(dequantized_qzeros == 8))
+
+        # validate torch packer
+        torch_linear = self.pack(TorchQuantLinear)
+
+        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(torch_linear)
+        dequantized_weight = dequantized_weight.to(torch.float16)
+
+        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
+        self.assertTrue(torch.all(dequantized_qzeros == 8))
+
+        self.assertTrue(torch.allclose(triton_linear.qweight, torch_linear.qweight))
+        self.assertTrue(torch.allclose(triton_linear.scales, torch_linear.scales))
+        self.assertTrue(torch.allclose(triton_linear.qzeros, torch_linear.qzeros))
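
Note on patches 2 and 3: setting CUDA_VISIBLE_DEVICES to an empty string before CUDA initializes makes the process enumerate zero CUDA devices, which is how the workflow keeps the allocated GPU away from ipex/xpu runs. A minimal Python sketch of that behavior (illustrative only; the workflow does the equivalent with a shell export):

import os

# CUDA reads this variable once, at initialization, so it must be set
# before the first CUDA call in the process.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import torch

print(torch.cuda.is_available())  # False: no CUDA devices are visible
print(torch.cuda.device_count())  # 0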
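
Note on patch 5: the restored test asserts that two independent packers (TritonV2QuantLinear and TorchQuantLinear) produce bit-identical qweight/qzeros buffers, and that dequantization recovers the original fp16 weights. That comparison is only meaningful because 4-bit packing is a lossless round trip. The sketch below illustrates the idea with hypothetical pack_4bit/unpack_4bit helpers; it is not the repository's packer, and the real layout (int32 buffers, column ordering, zero-point handling) differs:

import torch

def pack_4bit(vals: torch.Tensor) -> torch.Tensor:
    # vals: integers in [0, 15], length divisible by 8; eight nibbles go
    # into each word (int64 here to sidestep signed-overflow edge cases;
    # GPTQ-style buffers are int32)
    vals = vals.to(torch.int64).reshape(-1, 8)
    shifts = torch.arange(8, dtype=torch.int64) * 4
    # nibble bit ranges are disjoint, so the sum equals a bitwise OR
    return (vals << shifts).sum(dim=1)

def unpack_4bit(packed: torch.Tensor) -> torch.Tensor:
    shifts = torch.arange(8, dtype=torch.int64) * 4
    # mask with 0xF to keep only the 4 bits of each nibble
    return ((packed.unsqueeze(1) >> shifts) & 0xF).reshape(-1)

vals = torch.randint(0, 16, (64,), dtype=torch.int64)
assert torch.equal(unpack_4bit(pack_4bit(vals)), vals)  # exact round trip

Because the round trip is exact, any two correct packers fed the same weights, scales, and zeros must agree bit for bit, which is what the torch.allclose checks on qweight, scales, and qzeros verify. Running just this test with something like "pytest tests/test_packing.py -k test_compare_exllama_triton_torch" should work, depending on the local test setup.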