From 179fc6b2976caa2e7edf0f5fc259dc4d9dbca3de Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 15:52:03 +0800
Subject: [PATCH 1/5] [CI] fix sglang test name

---
 .github/workflows/unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 34d466be4..948b2b070 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -563,7 +563,7 @@ jobs:
           uv pip install colorlog
         fi
 
-        if [[ "${{ matrix.test_script }}" == "test_sglang.py" ]]; then
+        if [[ "${{ matrix.test_script }}" == "test_sglang" ]]; then
           uv pip install transformers==4.48.3
         fi
 

From c926734c70557d660db6aa9488b50738653f9a08 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 15:54:57 +0800
Subject: [PATCH 2/5] [CI] don't pass gpu to ipex

---
 .github/workflows/unit_tests.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 948b2b070..c6ab3d019 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -631,6 +631,9 @@ jobs:
     - name: Run tests
       if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
       run: |
+        if [[ "${{ matrix.test_script }}" == *ipex* ]]; then
+          export CUDA_VISIBLE_DEVICES=""
+        fi
         if [[ "${{ matrix.test_script }}" == *xpu* ]]; then
           export CUDA_VISIBLE_DEVICES=""
           source /etc/profile.d/pyenv.sh && pyenv activate xpu

From c19a94df27c42dfdd4c28a9868bbd2185434c6d9 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 15:58:49 +0800
Subject: [PATCH 3/5] [CI] print status after getting gpu

---
 .github/workflows/unit_tests.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index c6ab3d019..ec2cba93e 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -417,6 +417,7 @@ jobs:
           if [ "$gpu_id" -lt 0 ]; then
             echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }} returned $gpu_id"
             echo "No available GPU, waiting 5 seconds..."
+            curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
             sleep 5
           else
             echo "Allocated GPU ID: $gpu_id"
@@ -428,6 +429,7 @@ jobs:
         echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
         echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV
         echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
+        curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
     - name: Run tests
       if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
       run: |
@@ -616,6 +618,7 @@ jobs:
           if [ "$gpu_id" -lt 0 ]; then
             echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }} returned $gpu_id"
             echo "No available GPU, waiting 5 seconds..."
+            curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
             sleep 5
           else
             echo "Allocated GPU ID: $gpu_id"
@@ -627,6 +630,7 @@ jobs:
         echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
         echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV
         echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
+        curl http://${{ needs.check-vm.outputs.ip }}/gpu/status2
 
     - name: Run tests
       if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}

From 7994f20b1715ac628b545cf1fa722e30a8dbcfe1 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 16:02:52 +0800
Subject: [PATCH 4/5] remove packing test for exllama

---
 tests/test_packing.py | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/tests/test_packing.py b/tests/test_packing.py
index e8d377c08..88d5da5f6 100644
--- a/tests/test_packing.py
+++ b/tests/test_packing.py
@@ -98,38 +98,3 @@ def pack(self, qlinearCls):
         qlinear.pack(self.linear, self.s.T, self.zeros.T, g_idx=None)
 
         return qlinear
-
-    def test_compare_exllama_triton_torch(self):
-        # validate exllama packer
-        exllama_linear = self.pack(ExllamaQuantLinear)
-
-        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(exllama_linear)
-        dequantized_weight = dequantized_weight.to(torch.float16)
-
-        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
-        self.assertTrue(torch.all(dequantized_qzeros == 8))
-
-        triton_linear = self.pack(TritonV2QuantLinear)
-
-        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(triton_linear)
-        dequantized_weight = dequantized_weight.to(torch.float16)
-
-        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
-        self.assertTrue(torch.all(dequantized_qzeros == 8))
-
-        self.assertTrue(torch.allclose(exllama_linear.qweight, triton_linear.qweight))
-        self.assertTrue(torch.allclose(exllama_linear.scales, triton_linear.scales))
-        self.assertTrue(torch.allclose(exllama_linear.qzeros, triton_linear.qzeros))
-
-        # validate torch packer
-        torch_linear = self.pack(TorchQuantLinear)
-
-        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(torch_linear)
-        dequantized_weight = dequantized_weight.to(torch.float16)
-
-        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
-        self.assertTrue(torch.all(dequantized_qzeros == 8))
-
-        self.assertTrue(torch.allclose(exllama_linear.qweight, torch_linear.qweight))
-        self.assertTrue(torch.allclose(exllama_linear.scales, torch_linear.scales))
-        self.assertTrue(torch.allclose(exllama_linear.qzeros, torch_linear.qzeros))

From 7b7d96ecb126546e35e307d624a263d53f5387dd Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Fri, 21 Feb 2025 16:17:09 +0800
Subject: [PATCH 5/5] add triton & torch test back

---
 tests/test_packing.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/test_packing.py b/tests/test_packing.py
index 88d5da5f6..72727eee1 100644
--- a/tests/test_packing.py
+++ b/tests/test_packing.py
@@ -98,3 +98,25 @@ def pack(self, qlinearCls):
         qlinear.pack(self.linear, self.s.T, self.zeros.T, g_idx=None)
 
         return qlinear
+
+    def test_compare_exllama_triton_torch(self):
+        triton_linear = self.pack(TritonV2QuantLinear)
+
+        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(triton_linear)
+        dequantized_weight = dequantized_weight.to(torch.float16)
+
+        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
+        self.assertTrue(torch.all(dequantized_qzeros == 8))
+
+        # validate torch packer
+        torch_linear = self.pack(TorchQuantLinear)
+
+        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(torch_linear)
+        dequantized_weight = dequantized_weight.to(torch.float16)
+
+        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
+        self.assertTrue(torch.all(dequantized_qzeros == 8))
+
+        self.assertTrue(torch.allclose(triton_linear.qweight, torch_linear.qweight))
+        self.assertTrue(torch.allclose(triton_linear.scales, torch_linear.scales))
+        self.assertTrue(torch.allclose(triton_linear.qzeros, torch_linear.qzeros))
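
Note on patches 2 and 3: setting CUDA_VISIBLE_DEVICES to an empty string before CUDA initializes makes the process enumerate zero CUDA devices, which is how the workflow keeps the allocated GPU away from ipex/xpu runs. A minimal Python sketch of that behavior (illustrative only; the workflow does the equivalent with a shell export):

import os

# CUDA reads this variable once, at initialization, so it must be set
# before the first CUDA call in the process.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import torch

print(torch.cuda.is_available())  # False: no CUDA devices are visible
print(torch.cuda.device_count())  # 0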
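
Note on patch 5: the restored test asserts that two independent packers (TritonV2QuantLinear and TorchQuantLinear) produce bit-identical qweight/qzeros buffers, and that dequantization recovers the original fp16 weights. That comparison is only meaningful because 4-bit packing is a lossless round trip. The sketch below illustrates the idea with hypothetical pack_4bit/unpack_4bit helpers; it is not the repository's packer, and the real layout (int32 buffers, column ordering, zero-point handling) differs:

import torch

def pack_4bit(vals: torch.Tensor) -> torch.Tensor:
    # vals: integers in [0, 15], length divisible by 8; eight nibbles go
    # into each word (int64 here to sidestep signed-overflow edge cases;
    # GPTQ-style buffers are int32)
    vals = vals.to(torch.int64).reshape(-1, 8)
    shifts = torch.arange(8, dtype=torch.int64) * 4
    # nibble bit ranges are disjoint, so the sum equals a bitwise OR
    return (vals << shifts).sum(dim=1)

def unpack_4bit(packed: torch.Tensor) -> torch.Tensor:
    shifts = torch.arange(8, dtype=torch.int64) * 4
    # mask with 0xF to keep only the 4 bits of each nibble
    return ((packed.unsqueeze(1) >> shifts) & 0xF).reshape(-1)

vals = torch.randint(0, 16, (64,), dtype=torch.int64)
assert torch.equal(unpack_4bit(pack_4bit(vals)), vals)  # exact round trip

Because the round trip is exact, any two correct packers fed the same weights, scales, and zeros must agree bit for bit, which is what the torch.allclose checks on qweight, scales, and qzeros verify. Running just this test with something like "pytest tests/test_packing.py -k test_compare_exllama_triton_torch" should work, depending on the local test setup.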