From 38c213ff49cfabb499c6249b6c8203747fcff2b6 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Thu, 12 Jun 2025 00:57:16 +0200
Subject: [PATCH 1/5] update

---
 src/diffusers/hooks/group_offloading.py        |  4 +++
 tests/quantization/bnb/test_4bit.py            |  2 +-
 tests/quantization/bnb/test_mixed_int8.py      |  2 +-
 .../quantization/test_torch_compile_utils.py   | 25 +++++++++++++--
 tests/quantization/torchao/test_torchao.py     | 31 +++++++++++++++++++
 5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 565f8f1ff860..f96f6cbbe1ef 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -219,6 +219,7 @@ def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
         return module
 
     def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
+        breakpoint()
         # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward
         # method is the onload_leader of the group.
         if self.group.onload_leader is None:
@@ -285,6 +286,7 @@ def callback():
         return module
 
     def post_forward(self, module, output):
+        breakpoint()
         # At this point, for the current modules' submodules, we know the execution order of the layers. We can now
         # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each
         # group offloading hook.
@@ -624,7 +626,9 @@ def _apply_group_offloading_leaf_level(
     modules_with_group_offloading = set()
     for name, submodule in module.named_modules():
         if not isinstance(submodule, _SUPPORTED_PYTORCH_LAYERS):
+            print("unsupported module", name, type(submodule))
             continue
+        print("applying group offloading to", name, type(submodule))
         group = ModuleGroup(
             modules=[submodule],
             offload_device=offload_device,
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 2d8b9f698bfe..c6d59e8b71ed 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -881,4 +881,4 @@ def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
 
     def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload(quantization_config=self.quantization_config)
+        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index b15a9f72a8f6..fc4d6127fef9 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -845,6 +845,6 @@ def test_torch_compile_with_cpu_offload(self):
 
     @pytest.mark.xfail(reason="Test fails because of an offloading problem from Accelerate with confusion in hooks.")
     def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload(
+        super()._test_torch_compile_with_group_offload_leaf_stream(
             quantization_config=self.quantization_config, torch_dtype=torch.float16
         )
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 1ae77b27d7cd..63d09922f11e 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -64,7 +64,29 @@ def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=
         # small resolutions to ensure speedy execution.
         pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
 
-    def _test_torch_compile_with_group_offload(self, quantization_config, torch_dtype=torch.bfloat16):
+    def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch_dtype=torch.bfloat16):
+        torch._dynamo.config.cache_size_limit = 10000
+
+        pipe = self._init_pipeline(quantization_config, torch_dtype)
+        group_offload_kwargs = {
+            "onload_device": torch.device("cuda"),
+            "offload_device": torch.device("cpu"),
+            "offload_type": "leaf_level",
+            "num_blocks_per_group": 1,
+            "use_stream": False,
+        }
+        pipe.transformer.enable_group_offload(**group_offload_kwargs)
+        # pipe.transformer.compile()
+        for name, component in pipe.components.items():
+            if name != "transformer" and isinstance(component, torch.nn.Module):
+                if torch.device(component.device).type == "cpu":
+                    component.to("cuda")
+
+        for _ in range(2):
+            # small resolutions to ensure speedy execution.
+            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
+
+    def _test_torch_compile_with_group_offload_leaf_stream(self, quantization_config, torch_dtype=torch.bfloat16):
         torch._dynamo.config.cache_size_limit = 10000
 
         pipe = self._init_pipeline(quantization_config, torch_dtype)
@@ -73,7 +95,6 @@ def _test_torch_compile_with_group_offload(self, quantization_config, torch_dtyp
             "offload_device": torch.device("cpu"),
             "offload_type": "leaf_level",
             "use_stream": True,
-            "non_blocking": True,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
         pipe.transformer.compile()
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 743da17356f7..9ab6a3242a56 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -29,6 +29,7 @@
     TorchAoConfig,
 )
 from diffusers.models.attention_processor import Attention
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     backend_synchronize,
@@ -44,6 +45,8 @@
     torch_device,
 )
 
+from ..test_torch_compile_utils import QuantCompileTests
+
 
 enable_full_determinism()
 
@@ -625,6 +628,34 @@ def test_int_a16w8_cpu(self):
         self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device)
 
 
+@require_torchao_version_greater_or_equal("0.7.0")
+class TorchAoCompileTest(QuantCompileTests):
+    quantization_config = PipelineQuantizationConfig(
+        quant_mapping={
+            "transformer": TorchAoConfig(quant_type="int8_weight_only"),
+        },
+    )
+
+    def test_torch_compile(self):
+        super()._test_torch_compile(quantization_config=self.quantization_config)
+
+    def test_torch_compile_with_cpu_offload(self):
+        super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
+
+    def test_torch_compile_with_group_offload_leaf(self):
+        from diffusers.utils.logging import set_verbosity_debug
+
+        set_verbosity_debug()
+        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
+
+    @unittest.skip(
+        "Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO."
+ ) + def test_torch_compile_with_group_offload_leaf_stream(self): + # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={} + super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config) + + # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners @require_torch @require_torch_accelerator From fb99d94b25ebefe44f2eb436ec48e1848ebad323 Mon Sep 17 00:00:00 2001 From: Aryan Date: Mon, 16 Jun 2025 10:23:14 +0200 Subject: [PATCH 2/5] update --- src/diffusers/hooks/group_offloading.py | 4 ---- tests/quantization/test_torch_compile_utils.py | 2 +- tests/quantization/torchao/test_torchao.py | 12 +++++++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py index f96f6cbbe1ef..565f8f1ff860 100644 --- a/src/diffusers/hooks/group_offloading.py +++ b/src/diffusers/hooks/group_offloading.py @@ -219,7 +219,6 @@ def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module: return module def pre_forward(self, module: torch.nn.Module, *args, **kwargs): - breakpoint() # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward # method is the onload_leader of the group. if self.group.onload_leader is None: @@ -286,7 +285,6 @@ def callback(): return module def post_forward(self, module, output): - breakpoint() # At this point, for the current modules' submodules, we know the execution order of the layers. We can now # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each # group offloading hook. @@ -626,9 +624,7 @@ def _apply_group_offloading_leaf_level( modules_with_group_offloading = set() for name, submodule in module.named_modules(): if not isinstance(submodule, _SUPPORTED_PYTORCH_LAYERS): - print("unsupported module", name, type(submodule)) continue - print("applying group offloading to", name, type(submodule)) group = ModuleGroup( modules=[submodule], offload_device=offload_device, diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py index 63d09922f11e..1205d0baf93e 100644 --- a/tests/quantization/test_torch_compile_utils.py +++ b/tests/quantization/test_torch_compile_utils.py @@ -76,7 +76,7 @@ def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch "use_stream": False, } pipe.transformer.enable_group_offload(**group_offload_kwargs) - # pipe.transformer.compile() + pipe.transformer.compile() for name, component in pipe.components.items(): if name != "transformer" and isinstance(component, torch.nn.Module): if torch.device(component.device).type == "cpu": diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 9ab6a3242a56..3861aedbd464 100644 --- a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -639,13 +639,19 @@ class TorchAoCompileTest(QuantCompileTests): def test_torch_compile(self): super()._test_torch_compile(quantization_config=self.quantization_config) + @unittest.skip( + "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work." 
+ ) def test_torch_compile_with_cpu_offload(self): + # RuntimeError: _apply(): Couldn't swap Linear.weight super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config) + @unittest.skip( + "Changing the device of AQT tensor with .to() does not work. Needs to be discussed with TorchAO team." + ) def test_torch_compile_with_group_offload_leaf(self): - from diffusers.utils.logging import set_verbosity_debug - - set_verbosity_debug() + # for linear layers, weight.tensor_impl shows cuda... but: + # weight.tensor_impl.{data,scale,zero_point}.device will be cpu super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config) @unittest.skip( From b69d0995e2ef154927fd1536f29f3d9cfbba688f Mon Sep 17 00:00:00 2001 From: Aryan Date: Mon, 16 Jun 2025 10:51:33 +0200 Subject: [PATCH 3/5] update --- tests/quantization/torchao/test_torchao.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 3861aedbd464..e708fbbbb3ae 100644 --- a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -640,16 +640,24 @@ def test_torch_compile(self): super()._test_torch_compile(quantization_config=self.quantization_config) @unittest.skip( - "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work." + "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work " + "when compiling." ) def test_torch_compile_with_cpu_offload(self): # RuntimeError: _apply(): Couldn't swap Linear.weight super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config) @unittest.skip( - "Changing the device of AQT tensor with .to() does not work. Needs to be discussed with TorchAO team." + "Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation " + "is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure." ) def test_torch_compile_with_group_offload_leaf(self): + # If we run group offloading without compilation, we will see: + # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match. + # When running with compilation, the error ends up being different: + # Dynamo failed to run FX node with fake tensors: call_function (*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16, + # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu') + # Looks like something that will have to be looked into upstream. # for linear layers, weight.tensor_impl shows cuda... 
         # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
         super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)

From 2c608d1ab41c990a18362c061282efa28d45410f Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Jun 2025 22:57:24 +0200
Subject: [PATCH 4/5] update

---
 tests/quantization/bnb/test_4bit.py            |  9 ++++---
 tests/quantization/bnb/test_mixed_int8.py      |  6 ++---
 .../quantization/test_torch_compile_utils.py   | 27 +++----------------
 tests/quantization/torchao/test_torchao.py     | 22 ++++++++-------
 4 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index c6d59e8b71ed..63dbdf2c56a7 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -45,7 +45,6 @@
     require_peft_backend,
     require_torch,
     require_torch_accelerator,
-    require_torch_version_greater,
     require_transformers_version_greater,
     slow,
     torch_device,
@@ -861,7 +860,7 @@ def test_fp4_double_safe(self):
         self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
 
 
-@require_torch_version_greater("2.7.1")
+# @require_torch_version_greater("2.7.1")
 class Bnb4BitCompileTests(QuantCompileTests):
     quantization_config = PipelineQuantizationConfig(
         quant_backend="bitsandbytes_8bit",
@@ -880,5 +879,7 @@ def test_torch_compile(self):
     def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
 
-    def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
+    def test_torch_compile_with_group_offload_leaf(self):
+        super()._test_torch_compile_with_group_offload_leaf(
+            quantization_config=self.quantization_config, use_stream=True
+        )
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index fc4d6127fef9..1d72ad486392 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -844,7 +844,7 @@ def test_torch_compile_with_cpu_offload(self):
         )
 
     @pytest.mark.xfail(reason="Test fails because of an offloading problem from Accelerate with confusion in hooks.")
-    def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload_leaf_stream(
-            quantization_config=self.quantization_config, torch_dtype=torch.float16
+    def test_torch_compile_with_group_offload_leaf(self):
+        super()._test_torch_compile_with_group_offload_leaf(
+            quantization_config=self.quantization_config, torch_dtype=torch.float16, use_stream=True
         )
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 1205d0baf93e..7a5af6c1f860 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -64,7 +64,9 @@ def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=
         # small resolutions to ensure speedy execution.
         pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
 
-    def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch_dtype=torch.bfloat16):
+    def _test_torch_compile_with_group_offload_leaf(
+        self, quantization_config, torch_dtype=torch.bfloat16, *, use_stream: bool = False
+    ):
         torch._dynamo.config.cache_size_limit = 10000
 
         pipe = self._init_pipeline(quantization_config, torch_dtype)
@@ -73,28 +75,7 @@ def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch
             "offload_device": torch.device("cpu"),
             "offload_type": "leaf_level",
             "num_blocks_per_group": 1,
-            "use_stream": False,
-        }
-        pipe.transformer.enable_group_offload(**group_offload_kwargs)
-        pipe.transformer.compile()
-        for name, component in pipe.components.items():
-            if name != "transformer" and isinstance(component, torch.nn.Module):
-                if torch.device(component.device).type == "cpu":
-                    component.to("cuda")
-
-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
-
-    def _test_torch_compile_with_group_offload_leaf_stream(self, quantization_config, torch_dtype=torch.bfloat16):
-        torch._dynamo.config.cache_size_limit = 10000
-
-        pipe = self._init_pipeline(quantization_config, torch_dtype)
-        group_offload_kwargs = {
-            "onload_device": torch.device("cuda"),
-            "offload_device": torch.device("cpu"),
-            "offload_type": "leaf_level",
-            "use_stream": True,
+            "use_stream": use_stream,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
         pipe.transformer.compile()
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index e708fbbbb3ae..af5de359c878 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -19,6 +19,7 @@
 from typing import List
 
 import numpy as np
+from parameterized import parameterized
 from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
 
 from diffusers import (
@@ -648,10 +649,17 @@ def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
 
     @unittest.skip(
-        "Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation "
-        "is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure."
+        """
+        For `use_stream=False`:
+        - Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation
+        is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure.
+        For `use_stream=True`:
+        Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
+        """
     )
+    @parameterized.expand([False, True])
     def test_torch_compile_with_group_offload_leaf(self):
+        # For use_stream=False:
         # If we run group offloading without compilation, we will see:
         # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
         # When running with compilation, the error ends up being different:
         # Dynamo failed to run FX node with fake tensors: call_function (*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16,
         # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu')
         # Looks like something that will have to be looked into upstream.
         # for linear layers, weight.tensor_impl shows cuda... but:
         # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
-        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
 
-    @unittest.skip(
-        "Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO."
-    )
-    def test_torch_compile_with_group_offload_leaf_stream(self):
-        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
+
+        # For use_stream=True:
+        # # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
+        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
 
 
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners

From acd86ed69936b2a80b5f6709060ff1d2117999a4 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Jun 2025 22:59:40 +0200
Subject: [PATCH 5/5] update

---
 tests/quantization/bnb/test_4bit.py            | 3 ++-
 tests/quantization/test_torch_compile_utils.py | 1 -
 tests/quantization/torchao/test_torchao.py     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 63dbdf2c56a7..b94dc4698026 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -45,6 +45,7 @@
     require_peft_backend,
     require_torch,
     require_torch_accelerator,
+    require_torch_version_greater,
    require_transformers_version_greater,
     slow,
     torch_device,
@@ -860,7 +861,7 @@ def test_fp4_double_safe(self):
         self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
 
 
-# @require_torch_version_greater("2.7.1")
+@require_torch_version_greater("2.7.1")
 class Bnb4BitCompileTests(QuantCompileTests):
     quantization_config = PipelineQuantizationConfig(
         quant_backend="bitsandbytes_8bit",
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 7a5af6c1f860..386554bf65cc 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -74,7 +74,6 @@ def _test_torch_compile_with_group_offload_leaf(
             "onload_device": torch.device("cuda"),
             "offload_device": torch.device("cpu"),
             "offload_type": "leaf_level",
-            "num_blocks_per_group": 1,
             "use_stream": use_stream,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index af5de359c878..6b4bb7779d8c 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -670,7 +670,7 @@ def test_torch_compile_with_group_offload_leaf(self):
         # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
 
         # For use_stream=True:
-        # # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
+        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
         super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
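
For orientation, the new TorchAoCompileTest only declares a class-level PipelineQuantizationConfig; the checkpoint it is applied to comes from the shared _init_pipeline helper, which is not part of these patches. A minimal sketch of how such a config is typically consumed, assuming DiffusionPipeline.from_pretrained accepts it directly and using a Flux checkpoint purely as a placeholder:

import torch
from diffusers import DiffusionPipeline, TorchAoConfig
from diffusers.quantizers import PipelineQuantizationConfig

# Mirror the class-level config from TorchAoCompileTest: only the transformer
# is quantized, the text encoders and VAE stay in the requested torch_dtype.
quantization_config = PipelineQuantizationConfig(
    quant_mapping={
        "transformer": TorchAoConfig(quant_type="int8_weight_only"),
    },
)

# Placeholder checkpoint; the test suite builds its own pipeline elsewhere.
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
).to("cuda")

# Small resolution and few steps, matching the tests' speedy settings.
pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)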
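The consolidated helper _test_torch_compile_with_group_offload_leaf boils down to four steps: enable leaf-level group offloading on the transformer, compile it, move the remaining CPU-resident components to CUDA, and run the pipeline twice so the second pass hits the compiled graph. A standalone sketch of that flow, again with a placeholder checkpoint standing in for whatever _init_pipeline constructs:

import torch
from diffusers import DiffusionPipeline

# Large cache limit, as in the test helper, to avoid recompilation churn.
torch._dynamo.config.cache_size_limit = 10000

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)

# Leaf-level group offloading keeps each supported leaf module on CPU and
# onloads it to CUDA on demand; use_stream=True would overlap the transfers
# with compute on a side CUDA stream.
pipe.transformer.enable_group_offload(
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="leaf_level",
    use_stream=False,
)
pipe.transformer.compile()

# Group offloading only manages the transformer, so any other nn.Module
# components still sitting on CPU are moved to CUDA up front.
for name, component in pipe.components.items():
    if name != "transformer" and isinstance(component, torch.nn.Module):
        if torch.device(component.device).type == "cpu":
            component.to("cuda")

# First pass triggers compilation, second pass exercises the cached graph.
for _ in range(2):
    pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)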