diff --git a/.github/workflows/torch_compile_tests.yml b/.github/workflows/torch_compile_tests.yml
index c84bec97a1..7ac0a643c2 100644
--- a/.github/workflows/torch_compile_tests.yml
+++ b/.github/workflows/torch_compile_tests.yml
@@ -11,32 +11,42 @@ on:
         required: false
         default: false
 
+env:
+  RUN_SLOW: "yes"
+  IS_GITHUB_CI: "1"
+  # To be able to run tests on CUDA 12.2
+  NVIDIA_DISABLE_REQUIRE: "1"
+
 jobs:
   run_tests_with_compile:
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, a10, ci]
     env:
       PEFT_DEBUG_WITH_TORCH_COMPILE: 1
+      CUDA_VISIBLE_DEVICES: "0"
+      TEST_TYPE: "single_gpu_huggingface/peft-gpu-bnb-latest:latest"
+    container:
+      image: "huggingface/peft-gpu-bnb-latest:latest"
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    defaults:
+      run:
+        shell: bash
     steps:
       - uses: actions/checkout@v4
         with:
           ref: ${{ github.event.inputs.branch }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-          cache: "pip"
-          cache-dependency-path: "setup.py"
-      - name: Install dependencies
+      - name: Pip install
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install .[test]
-          python -m pip install bitsandbytes
+          source activate peft
+          pip install -e . --no-deps
+          pip install pytest-cov parameterized datasets scipy einops
+          pip install "pytest>=7.2.0,<8.0.0" # see: https://github.com/huggingface/transformers/blob/ce4fff0be7f6464d713f7ac3e0bbaafbc6959ae5/setup.py#L148C6-L148C26
           if [ "${{ github.event.inputs.pytorch_nightly }}" = "true" ]; then
             python -m pip install --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
           fi
       - name: Test compile with pytest
         run: |
+          source activate peft
           echo "PEFT_DEBUG_WITH_TORCH_COMPILE=$PEFT_DEBUG_WITH_TORCH_COMPILE"
           git status
           make tests_torch_compile
diff --git a/tests/test_torch_compile.py b/tests/test_torch_compile.py
index 353a562148..818dcc1c43 100644
--- a/tests/test_torch_compile.py
+++ b/tests/test_torch_compile.py
@@ -49,6 +49,8 @@
     get_peft_model,
 )
 
+from .testing_utils import require_bitsandbytes
+
 
 # only run (very slow) torch.compile tests when explicitly asked to
 if os.environ.get("PEFT_DEBUG_WITH_TORCH_COMPILE") != "1":
@@ -269,6 +271,7 @@ def test_causal_lm_training_pytorch_compile(self, settings, tokenizer, data, tmp
         assert torch.allclose(output_after.logits, output_loaded.logits, atol=atol, rtol=rtol)
         assert (tokens_after == tokens_loaded).all()
 
+    @require_bitsandbytes
     @pytest.mark.xfail(strict=True)
     def test_causal_lm_training_lora_bnb_compile(self, tokenizer, data, tmp_path):
         r"""Train a bnb quantized LoRA model with torch.compile using PyTorch training loop"""
@@ -329,6 +332,7 @@ def test_causal_lm_training_lora_bnb_compile(self, tokenizer, data, tmp_path):
         assert torch.allclose(output_after.logits, output_loaded.logits, atol=atol, rtol=rtol)
 
     @pytest.mark.xfail(strict=True)
+    @require_bitsandbytes
     def test_causal_lm_multiple_lora_adapter_compile(self, tokenizer, data):
         torch.manual_seed(0)
         model = AutoModelForCausalLM.from_pretrained(
@@ -393,6 +397,7 @@ def test_causal_lm_disable_lora_adapter_compile(self, tokenizer, data):
         assert torch.allclose(output_base.logits, output_disabled.logits, atol=atol, rtol=rtol)
         assert not torch.allclose(output_base.logits, output_lora.logits, atol=atol, rtol=rtol)
 
+    @require_bitsandbytes
     def test_causal_lm_merging_lora_adapter_compile(self, tokenizer, data):
         # merge the adapter
         torch.manual_seed(0)
@@ -420,6 +425,7 @@ def test_causal_lm_merging_lora_adapter_compile(self, tokenizer, data):
         assert not torch.allclose(output_base.logits, output_lora.logits, atol=atol, rtol=rtol)
         assert torch.allclose(output_lora.logits, output_merged.logits, atol=atol, rtol=rtol)
 
+    @require_bitsandbytes
     def test_causal_lm_merging_multiple_lora_adapters_compile(self, tokenizer, data):
         # merge multiple adapters at once
         torch.manual_seed(0)
@@ -457,6 +463,7 @@ def test_causal_lm_merging_multiple_lora_adapters_compile(self, tokenizer, data)
         assert not torch.allclose(output_default.logits, output_merged.logits, atol=atol, rtol=rtol)
         assert not torch.allclose(output_other.logits, output_merged.logits, atol=atol, rtol=rtol)
 
+    @require_bitsandbytes
     @pytest.mark.xfail(strict=True)
     def test_causal_lm_merge_and_unload_lora_adapter_compile(self, tokenizer, data):
         torch.manual_seed(0)
@@ -485,6 +492,7 @@ def test_causal_lm_merge_and_unload_lora_adapter_compile(self, tokenizer, data):
         assert not torch.allclose(output_base.logits, output_lora.logits, atol=atol, rtol=rtol)
         assert torch.allclose(output_lora.logits, output_unloaded.logits, atol=atol, rtol=rtol)
 
+    @require_bitsandbytes
     @pytest.mark.xfail(strict=True)
     def test_causal_lm_mixed_batch_lora_adapter_compile(self, tokenizer, data):
         torch.manual_seed(0)
@@ -530,6 +538,7 @@ def test_causal_lm_mixed_batch_lora_adapter_compile(self, tokenizer, data):
         assert torch.allclose(output_default.logits[1], output_mixed.logits[1], atol=atol, rtol=rtol)
         assert torch.allclose(output_other.logits[2], output_mixed.logits[2], atol=atol, rtol=rtol)
 
+    @require_bitsandbytes
     def test_causal_lm_add_weighted_adapter_lora_adapter_compile(self, tokenizer, data):
         torch.manual_seed(0)
         model = AutoModelForCausalLM.from_pretrained(
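For reference, the bnb-dependent tests above are gated by a `require_bitsandbytes` decorator imported from `tests/testing_utils.py`. Below is a minimal sketch of what such a decorator could look like, assuming it skips the test when bitsandbytes is not installed; it is an illustration, not the repository's exact implementation.

```python
# Minimal sketch of a `require_bitsandbytes`-style decorator (assumption:
# it skips the decorated test when bitsandbytes is not importable).
# The real helper lives in tests/testing_utils.py and may differ.
import importlib.util

import pytest


def require_bitsandbytes(test_case):
    """Skip the decorated test unless bitsandbytes is importable."""
    bnb_available = importlib.util.find_spec("bitsandbytes") is not None
    return pytest.mark.skipif(not bnb_available, reason="test requires bitsandbytes")(test_case)
```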