From 38c213ff49cfabb499c6249b6c8203747fcff2b6 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Thu, 12 Jun 2025 00:57:16 +0200
Subject: [PATCH 1/5] update

---
 src/diffusers/hooks/group_offloading.py        |  4 +++
 tests/quantization/bnb/test_4bit.py            |  2 +-
 tests/quantization/bnb/test_mixed_int8.py      |  2 +-
 .../quantization/test_torch_compile_utils.py   | 25 +++++++++++++--
 tests/quantization/torchao/test_torchao.py     | 31 +++++++++++++++++++
 5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 565f8f1ff860..f96f6cbbe1ef 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -219,6 +219,7 @@ def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
         return module
 
     def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
+        breakpoint()
         # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward
         # method is the onload_leader of the group.
         if self.group.onload_leader is None:
@@ -285,6 +286,7 @@ def callback():
         return module
 
     def post_forward(self, module, output):
+        breakpoint()
         # At this point, for the current modules' submodules, we know the execution order of the layers. We can now
         # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each
         # group offloading hook.
@@ -624,7 +626,9 @@ def _apply_group_offloading_leaf_level(
     modules_with_group_offloading = set()
     for name, submodule in module.named_modules():
         if not isinstance(submodule, _SUPPORTED_PYTORCH_LAYERS):
+            print("unsupported module", name, type(submodule))
             continue
+        print("applying group offloading to", name, type(submodule))
         group = ModuleGroup(
             modules=[submodule],
             offload_device=offload_device,
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 2d8b9f698bfe..c6d59e8b71ed 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -881,4 +881,4 @@ def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
 
     def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload(quantization_config=self.quantization_config)
+        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index b15a9f72a8f6..fc4d6127fef9 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -845,6 +845,6 @@ def test_torch_compile_with_cpu_offload(self):
 
     @pytest.mark.xfail(reason="Test fails because of an offloading problem from Accelerate with confusion in hooks.")
     def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload(
+        super()._test_torch_compile_with_group_offload_leaf_stream(
             quantization_config=self.quantization_config, torch_dtype=torch.float16
         )
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 1ae77b27d7cd..63d09922f11e 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -64,7 +64,29 @@ def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=
         # small resolutions to ensure speedy execution.
         pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
 
-    def _test_torch_compile_with_group_offload(self, quantization_config, torch_dtype=torch.bfloat16):
+    def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch_dtype=torch.bfloat16):
+        torch._dynamo.config.cache_size_limit = 10000
+
+        pipe = self._init_pipeline(quantization_config, torch_dtype)
+        group_offload_kwargs = {
+            "onload_device": torch.device("cuda"),
+            "offload_device": torch.device("cpu"),
+            "offload_type": "leaf_level",
+            "num_blocks_per_group": 1,
+            "use_stream": False,
+        }
+        pipe.transformer.enable_group_offload(**group_offload_kwargs)
+        # pipe.transformer.compile()
+        for name, component in pipe.components.items():
+            if name != "transformer" and isinstance(component, torch.nn.Module):
+                if torch.device(component.device).type == "cpu":
+                    component.to("cuda")
+
+        for _ in range(2):
+            # small resolutions to ensure speedy execution.
+            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
+
+    def _test_torch_compile_with_group_offload_leaf_stream(self, quantization_config, torch_dtype=torch.bfloat16):
         torch._dynamo.config.cache_size_limit = 10000
 
         pipe = self._init_pipeline(quantization_config, torch_dtype)
@@ -73,7 +95,6 @@ def _test_torch_compile_with_group_offload(self, quantization_config, torch_dtyp
             "offload_device": torch.device("cpu"),
             "offload_type": "leaf_level",
             "use_stream": True,
-            "non_blocking": True,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
         pipe.transformer.compile()
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 743da17356f7..9ab6a3242a56 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -29,6 +29,7 @@
     TorchAoConfig,
 )
 from diffusers.models.attention_processor import Attention
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     backend_synchronize,
@@ -44,6 +45,8 @@
     torch_device,
 )
 
+from ..test_torch_compile_utils import QuantCompileTests
+
 
 enable_full_determinism()
 
@@ -625,6 +628,34 @@ def test_int_a16w8_cpu(self):
         self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device)
 
 
+@require_torchao_version_greater_or_equal("0.7.0")
+class TorchAoCompileTest(QuantCompileTests):
+    quantization_config = PipelineQuantizationConfig(
+        quant_mapping={
+            "transformer": TorchAoConfig(quant_type="int8_weight_only"),
+        },
+    )
+
+    def test_torch_compile(self):
+        super()._test_torch_compile(quantization_config=self.quantization_config)
+
+    def test_torch_compile_with_cpu_offload(self):
+        super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
+
+    def test_torch_compile_with_group_offload_leaf(self):
+        from diffusers.utils.logging import set_verbosity_debug
+
+        set_verbosity_debug()
+        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
+
+    @unittest.skip(
+        "Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO."
+ ) + def test_torch_compile_with_group_offload_leaf_stream(self): + # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={} + super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config) + + # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners @require_torch @require_torch_accelerator From fb99d94b25ebefe44f2eb436ec48e1848ebad323 Mon Sep 17 00:00:00 2001 From: Aryan Date: Mon, 16 Jun 2025 10:23:14 +0200 Subject: [PATCH 2/5] update --- src/diffusers/hooks/group_offloading.py | 4 ---- tests/quantization/test_torch_compile_utils.py | 2 +- tests/quantization/torchao/test_torchao.py | 12 +++++++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py index f96f6cbbe1ef..565f8f1ff860 100644 --- a/src/diffusers/hooks/group_offloading.py +++ b/src/diffusers/hooks/group_offloading.py @@ -219,7 +219,6 @@ def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module: return module def pre_forward(self, module: torch.nn.Module, *args, **kwargs): - breakpoint() # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward # method is the onload_leader of the group. if self.group.onload_leader is None: @@ -286,7 +285,6 @@ def callback(): return module def post_forward(self, module, output): - breakpoint() # At this point, for the current modules' submodules, we know the execution order of the layers. We can now # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each # group offloading hook. @@ -626,9 +624,7 @@ def _apply_group_offloading_leaf_level( modules_with_group_offloading = set() for name, submodule in module.named_modules(): if not isinstance(submodule, _SUPPORTED_PYTORCH_LAYERS): - print("unsupported module", name, type(submodule)) continue - print("applying group offloading to", name, type(submodule)) group = ModuleGroup( modules=[submodule], offload_device=offload_device, diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py index 63d09922f11e..1205d0baf93e 100644 --- a/tests/quantization/test_torch_compile_utils.py +++ b/tests/quantization/test_torch_compile_utils.py @@ -76,7 +76,7 @@ def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch "use_stream": False, } pipe.transformer.enable_group_offload(**group_offload_kwargs) - # pipe.transformer.compile() + pipe.transformer.compile() for name, component in pipe.components.items(): if name != "transformer" and isinstance(component, torch.nn.Module): if torch.device(component.device).type == "cpu": diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 9ab6a3242a56..3861aedbd464 100644 --- a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -639,13 +639,19 @@ class TorchAoCompileTest(QuantCompileTests): def test_torch_compile(self): super()._test_torch_compile(quantization_config=self.quantization_config) + @unittest.skip( + "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work." 
+ ) def test_torch_compile_with_cpu_offload(self): + # RuntimeError: _apply(): Couldn't swap Linear.weight super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config) + @unittest.skip( + "Changing the device of AQT tensor with .to() does not work. Needs to be discussed with TorchAO team." + ) def test_torch_compile_with_group_offload_leaf(self): - from diffusers.utils.logging import set_verbosity_debug - - set_verbosity_debug() + # for linear layers, weight.tensor_impl shows cuda... but: + # weight.tensor_impl.{data,scale,zero_point}.device will be cpu super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config) @unittest.skip( From b69d0995e2ef154927fd1536f29f3d9cfbba688f Mon Sep 17 00:00:00 2001 From: Aryan Date: Mon, 16 Jun 2025 10:51:33 +0200 Subject: [PATCH 3/5] update --- tests/quantization/torchao/test_torchao.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 3861aedbd464..e708fbbbb3ae 100644 --- a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -640,16 +640,24 @@ def test_torch_compile(self): super()._test_torch_compile(quantization_config=self.quantization_config) @unittest.skip( - "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work." + "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work " + "when compiling." ) def test_torch_compile_with_cpu_offload(self): # RuntimeError: _apply(): Couldn't swap Linear.weight super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config) @unittest.skip( - "Changing the device of AQT tensor with .to() does not work. Needs to be discussed with TorchAO team." + "Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation " + "is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure." ) def test_torch_compile_with_group_offload_leaf(self): + # If we run group offloading without compilation, we will see: + # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match. + # When running with compilation, the error ends up being different: + # Dynamo failed to run FX node with fake tensors: call_function (*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16, + # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu') + # Looks like something that will have to be looked into upstream. # for linear layers, weight.tensor_impl shows cuda... 
         # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
         super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)

From 2c608d1ab41c990a18362c061282efa28d45410f Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Jun 2025 22:57:24 +0200
Subject: [PATCH 4/5] update

---
 tests/quantization/bnb/test_4bit.py            |  9 ++++---
 tests/quantization/bnb/test_mixed_int8.py      |  6 ++---
 .../quantization/test_torch_compile_utils.py   | 27 +++----------------
 tests/quantization/torchao/test_torchao.py     | 22 ++++++++-------
 4 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index c6d59e8b71ed..63dbdf2c56a7 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -45,7 +45,6 @@
     require_peft_backend,
     require_torch,
     require_torch_accelerator,
-    require_torch_version_greater,
     require_transformers_version_greater,
     slow,
     torch_device,
@@ -861,7 +860,7 @@ def test_fp4_double_safe(self):
         self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
 
 
-@require_torch_version_greater("2.7.1")
+# @require_torch_version_greater("2.7.1")
 class Bnb4BitCompileTests(QuantCompileTests):
     quantization_config = PipelineQuantizationConfig(
         quant_backend="bitsandbytes_8bit",
@@ -880,5 +879,7 @@ def test_torch_compile(self):
     def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
 
-    def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
+    def test_torch_compile_with_group_offload_leaf(self):
+        super()._test_torch_compile_with_group_offload_leaf(
+            quantization_config=self.quantization_config, use_stream=True
+        )
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index fc4d6127fef9..1d72ad486392 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -844,7 +844,7 @@ def test_torch_compile_with_cpu_offload(self):
         )
 
     @pytest.mark.xfail(reason="Test fails because of an offloading problem from Accelerate with confusion in hooks.")
-    def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload_leaf_stream(
-            quantization_config=self.quantization_config, torch_dtype=torch.float16
+    def test_torch_compile_with_group_offload_leaf(self):
+        super()._test_torch_compile_with_group_offload_leaf(
+            quantization_config=self.quantization_config, torch_dtype=torch.float16, use_stream=True
         )
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 1205d0baf93e..7a5af6c1f860 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -64,7 +64,9 @@ def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=
         # small resolutions to ensure speedy execution.
         pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
 
-    def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch_dtype=torch.bfloat16):
+    def _test_torch_compile_with_group_offload_leaf(
+        self, quantization_config, torch_dtype=torch.bfloat16, *, use_stream: bool = False
+    ):
         torch._dynamo.config.cache_size_limit = 10000
 
         pipe = self._init_pipeline(quantization_config, torch_dtype)
@@ -73,28 +75,7 @@ def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch
             "offload_device": torch.device("cpu"),
             "offload_type": "leaf_level",
             "num_blocks_per_group": 1,
-            "use_stream": False,
-        }
-        pipe.transformer.enable_group_offload(**group_offload_kwargs)
-        pipe.transformer.compile()
-        for name, component in pipe.components.items():
-            if name != "transformer" and isinstance(component, torch.nn.Module):
-                if torch.device(component.device).type == "cpu":
-                    component.to("cuda")
-
-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
-
-    def _test_torch_compile_with_group_offload_leaf_stream(self, quantization_config, torch_dtype=torch.bfloat16):
-        torch._dynamo.config.cache_size_limit = 10000
-
-        pipe = self._init_pipeline(quantization_config, torch_dtype)
-        group_offload_kwargs = {
-            "onload_device": torch.device("cuda"),
-            "offload_device": torch.device("cpu"),
-            "offload_type": "leaf_level",
-            "use_stream": True,
+            "use_stream": use_stream,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
         pipe.transformer.compile()
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index e708fbbbb3ae..af5de359c878 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -19,6 +19,7 @@
 from typing import List
 
 import numpy as np
+from parameterized import parameterized
 from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
 
 from diffusers import (
@@ -648,10 +649,17 @@ def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
 
     @unittest.skip(
-        "Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation "
-        "is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure."
+        """
+        For `use_stream=False`:
+        - Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation
+        is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure.
+        For `use_stream=True`:
+        Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
+        """
     )
+    @parameterized.expand([False, True])
     def test_torch_compile_with_group_offload_leaf(self):
+        # For use_stream=False:
         # If we run group offloading without compilation, we will see:
         # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
         # When running with compilation, the error ends up being different:
         # Dynamo failed to run FX node with fake tensors: call_function (*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16,
         # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu')
         # Looks like something that will have to be looked into upstream.
         # for linear layers, weight.tensor_impl shows cuda... but:
         # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
-        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
 
-    @unittest.skip(
-        "Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO."
-    )
-    def test_torch_compile_with_group_offload_leaf_stream(self):
-        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
+
+        # For use_stream=True:
+        # # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
+        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
 
 
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners

From acd86ed69936b2a80b5f6709060ff1d2117999a4 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Mon, 16 Jun 2025 22:59:40 +0200
Subject: [PATCH 5/5] update

---
 tests/quantization/bnb/test_4bit.py            | 3 ++-
 tests/quantization/test_torch_compile_utils.py | 1 -
 tests/quantization/torchao/test_torchao.py     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 63dbdf2c56a7..b94dc4698026 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -45,6 +45,7 @@
     require_peft_backend,
     require_torch,
     require_torch_accelerator,
+    require_torch_version_greater,
    require_transformers_version_greater,
     slow,
     torch_device,
@@ -860,7 +861,7 @@ def test_fp4_double_safe(self):
         self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
 
 
-# @require_torch_version_greater("2.7.1")
+@require_torch_version_greater("2.7.1")
 class Bnb4BitCompileTests(QuantCompileTests):
     quantization_config = PipelineQuantizationConfig(
         quant_backend="bitsandbytes_8bit",
diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py
index 7a5af6c1f860..386554bf65cc 100644
--- a/tests/quantization/test_torch_compile_utils.py
+++ b/tests/quantization/test_torch_compile_utils.py
@@ -74,7 +74,6 @@ def _test_torch_compile_with_group_offload_leaf(
             "onload_device": torch.device("cuda"),
             "offload_device": torch.device("cpu"),
             "offload_type": "leaf_level",
-            "num_blocks_per_group": 1,
             "use_stream": use_stream,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index af5de359c878..6b4bb7779d8c 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -670,7 +670,7 @@ def test_torch_compile_with_group_offload_leaf(self):
         # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
 
         # For use_stream=True:
-        # # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
+        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={}
         super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
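
For orientation, the new TorchAoCompileTest only declares a class-level PipelineQuantizationConfig; the checkpoint it is applied to comes from the shared _init_pipeline helper, which is not part of these patches. A minimal sketch of how such a config is typically consumed, assuming DiffusionPipeline.from_pretrained accepts it directly and using a Flux checkpoint purely as a placeholder:

import torch
from diffusers import DiffusionPipeline, TorchAoConfig
from diffusers.quantizers import PipelineQuantizationConfig

# Mirror the class-level config from TorchAoCompileTest: only the transformer
# is quantized, the text encoders and VAE stay in the requested torch_dtype.
quantization_config = PipelineQuantizationConfig(
    quant_mapping={
        "transformer": TorchAoConfig(quant_type="int8_weight_only"),
    },
)

# Placeholder checkpoint; the test suite builds its own pipeline elsewhere.
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
).to("cuda")

# Small resolution and few steps, matching the tests' speedy settings.
pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)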
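The consolidated helper _test_torch_compile_with_group_offload_leaf boils down to four steps: enable leaf-level group offloading on the transformer, compile it, move the remaining CPU-resident components to CUDA, and run the pipeline twice so the second pass hits the compiled graph. A standalone sketch of that flow, again with a placeholder checkpoint standing in for whatever _init_pipeline constructs:

import torch
from diffusers import DiffusionPipeline

# Large cache limit, as in the test helper, to avoid recompilation churn.
torch._dynamo.config.cache_size_limit = 10000

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)

# Leaf-level group offloading keeps each supported leaf module on CPU and
# onloads it to CUDA on demand; use_stream=True would overlap the transfers
# with compute on a side CUDA stream.
pipe.transformer.enable_group_offload(
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="leaf_level",
    use_stream=False,
)
pipe.transformer.compile()

# Group offloading only manages the transformer, so any other nn.Module
# components still sitting on CPU are moved to CUDA up front.
for name, component in pipe.components.items():
    if name != "transformer" and isinstance(component, torch.nn.Module):
        if torch.device(component.device).type == "cpu":
            component.to("cuda")

# First pass triggers compilation, second pass exercises the cached graph.
for _ in range(2):
    pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)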