From 0ea501b9c25e81930c2b5d20065d6cba1243adb3 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Mon, 5 Sep 2022 12:04:07 -0300 Subject: [PATCH 01/25] add accelerate to load models with smaller memory footprint --- src/diffusers/configuration_utils.py | 18 ++++++++---- src/diffusers/modeling_utils.py | 43 +++++++++++++++++++++------- tests/test_models_unet.py | 7 +++++ 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 053ccd6429e0..b90c5c994589 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -24,6 +24,7 @@ from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError +import accelerate from requests import HTTPError from . import __version__ @@ -90,11 +91,18 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool @classmethod def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs): - config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) - - init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) - - model = cls(**init_dict) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", None) + device_map = kwargs.pop("device_map", None) + if low_cpu_mem_usage: + with accelerate.init_empty_weights(): + config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) + model = cls(**init_dict) + + else: + config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) + model = cls(**init_dict) if return_unused_kwargs: return model, unused_kwargs diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index ec501e2ae1f8..91de97d6b7c0 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -22,6 +22,7 @@ from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError +import accelerate from requests import HTTPError from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging @@ -317,6 +318,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P from_auto_class = kwargs.pop("_from_auto", False) torch_dtype = kwargs.pop("torch_dtype", None) subfolder = kwargs.pop("subfolder", None) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", None) + device_map = kwargs.pop("device_map", None) user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} @@ -333,6 +336,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P use_auth_token=use_auth_token, revision=revision, subfolder=subfolder, + low_cpu_mem_usage=low_cpu_mem_usage, + device_map=device_map, **kwargs, ) @@ -415,25 +420,41 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ) # restore default dtype - state_dict = load_state_dict(model_file) - model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model( - model, - state_dict, - model_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=ignore_mismatched_sizes, - ) - # 
Set model in evaluation mode to deactivate DropOut modules by default
-        model.eval()
+        if low_cpu_mem_usage:
+            accelerate.load_checkpoint_and_dispatch(
+                model,
+                model_file,
+                device_map
+            )
+            loading_info = {
+                "missing_keys": [],
+                "unexpected_keys": [],
+                "mismatched_keys": [],
+                "error_msgs": [],
+            }
+
+        else:
+
+            state_dict = load_state_dict(model_file)
+            model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
+                model,
+                state_dict,
+                model_file,
+                pretrained_model_name_or_path,
+                ignore_mismatched_sizes=ignore_mismatched_sizes,
+            )

-        if output_loading_info:
             loading_info = {
                 "missing_keys": missing_keys,
                 "unexpected_keys": unexpected_keys,
                 "mismatched_keys": mismatched_keys,
                 "error_msgs": error_msgs,
             }
+
+        # Set model in evaluation mode to deactivate DropOut modules by default
+        model.eval()
+        if output_loading_info:
             return model, loading_info

         return model
diff --git a/tests/test_models_unet.py b/tests/test_models_unet.py
index c574a0092e3c..2da4d103744f 100644
--- a/tests/test_models_unet.py
+++ b/tests/test_models_unet.py
@@ -133,6 +133,13 @@ def test_from_pretrained_hub(self):

         assert image is not None, "Make sure output is not None"

+    def test_from_pretrained_accelerate(self):
+        model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=True, device_map="auto")
+        model.to(torch_device)
+        image = model(**self.dummy_input).sample
+
+        assert image is not None, "Make sure output is not None"
+
     def test_output_pretrained(self):
         model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update")
         model.eval()

From 7631dd68da13f886d87ba3e6a0e21224abc4475d Mon Sep 17 00:00:00 2001
From: Pi Esposito
Date: Mon, 12 Sep 2022 11:39:33 -0300
Subject: [PATCH 02/25] remove low_cpu_mem_usage as it is redundant

---
 setup.py                             | 1 +
 src/diffusers/configuration_utils.py | 5 ++---
 src/diffusers/modeling_utils.py      | 4 +---
 tests/test_models_unet.py            | 2 +-
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 7b71bd70d470..a6bf120db47e 100644
--- a/setup.py
+++ b/setup.py
@@ -97,6 +97,7 @@
     "tensorboard",
     "torch>=1.4",
     "transformers>=4.21.0",
+    "accelerate>=0.12.0"
 ]

 # this is a lookup table with items like:
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index b90c5c994589..0720ba586939 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -91,9 +91,8 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool

     @classmethod
     def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs):
-        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", None)
-        device_map = kwargs.pop("device_map", None)
-        if low_cpu_mem_usage:
+        device_map = kwargs.pop("low_cpu_mem_usage", None)
+        if device_map is not None:
             with accelerate.init_empty_weights():
                 config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
                 init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs)
                 model = cls(**init_dict)
diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index 91de97d6b7c0..325160bbdc9f 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -318,7 +318,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         from_auto_class = kwargs.pop("_from_auto", False)
         torch_dtype = kwargs.pop("torch_dtype", None)
         subfolder = kwargs.pop("subfolder", 
None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", None) device_map = kwargs.pop("device_map", None) user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} @@ -336,7 +335,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P use_auth_token=use_auth_token, revision=revision, subfolder=subfolder, - low_cpu_mem_usage=low_cpu_mem_usage, device_map=device_map, **kwargs, ) @@ -421,7 +419,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # restore default dtype - if low_cpu_mem_usage: + if device_map is not None: accelerate.load_checkpoint_and_dispatch( model, model_file, diff --git a/tests/test_models_unet.py b/tests/test_models_unet.py index 2da4d103744f..477307cfb324 100644 --- a/tests/test_models_unet.py +++ b/tests/test_models_unet.py @@ -134,7 +134,7 @@ def test_from_pretrained_hub(self): assert image is not None, "Make sure output is not None" def test_from_pretrained_accelerate(self): - model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=True, device_map="auto") + model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto") model.to(torch_device) image = model(**self.dummy_input).sample From 8592e23c3a1f50775308be0c49214e5872a8d3ea Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Fri, 16 Sep 2022 12:26:23 -0300 Subject: [PATCH 03/25] move accelerate init weights context to modelling utils --- src/diffusers/configuration_utils.py | 14 ++------- src/diffusers/modeling_utils.py | 45 +++++++++++++++++++--------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 4aeacca75edb..0b48fac039e4 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -154,17 +154,9 @@ def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], ret """ - device_map = kwargs.pop("low_cpu_mem_usage", None) - if device_map is not None: - with accelerate.init_empty_weights(): - config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) - init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) - model = cls(**init_dict) - - else: - config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) - init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) - model = cls(**init_dict) + config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) + model = cls(**init_dict) if return_unused_kwargs: return model, unused_kwargs diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index 81d73a8be36c..07e3cdf66346 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -266,20 +266,37 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # Load config if we don't provide a configuration config_path = pretrained_model_name_or_path - model, unused_kwargs = cls.from_config( - config_path, - cache_dir=cache_dir, - return_unused_kwargs=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - 
device_map=device_map, - **kwargs, - ) + if device_map == "auto": + with accelerate.init_empty_weights(): + model, unused_kwargs = cls.from_config( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + device_map=device_map, + **kwargs, + ) + else: + model, unused_kwargs = cls.from_config( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + device_map=device_map, + **kwargs, + ) if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): raise ValueError( From 76b8e4a0a21957a275eefa3863ee39215f0f5f6c Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Fri, 16 Sep 2022 12:44:31 -0300 Subject: [PATCH 04/25] add test to ensure results are the same when loading with accelerate --- tests/test_models_unet.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_models_unet.py b/tests/test_models_unet.py index 8ea25560c80c..13a2bda73d44 100644 --- a/tests/test_models_unet.py +++ b/tests/test_models_unet.py @@ -15,6 +15,7 @@ import math import unittest +import gc import torch @@ -140,6 +141,35 @@ def test_from_pretrained_accelerate(self): assert image is not None, "Make sure output is not None" + def test_from_pretrained_accelerate_wont_change_results(self): + model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto") + model_accelerate.to(torch_device) + model_accelerate.eval() + + noise = torch.randn( + 1, + model_accelerate.config.in_channels, + model_accelerate.config.sample_size, + model_accelerate.config.sample_size, + generator=torch.manual_seed(0), + ) + noise = noise.to(torch_device) + time_step = torch.tensor([10] * noise.shape[0]).to(torch_device) + + arr_accelerate = model_accelerate(noise, time_step)["sample"] + + # two models don't need to stay in the device at the same time + del model_accelerate + torch.cuda.empty_cache() + gc.collect() + + model_normal_load, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) + model_normal_load.to(torch_device) + model_normal_load.eval() + arr_normal_load = model_normal_load(noise, time_step)["sample"] + + assert torch.allclose(arr_accelerate["sample"], arr_normal_load, rtol=1e-3) + def test_output_pretrained(self): model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update") model.eval() From dd7f9b9cc66c0bd46562849a453c7b4b37d56766 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Fri, 16 Sep 2022 12:59:01 -0300 Subject: [PATCH 05/25] add tests to ensure ram usage gets lower when using accelerate --- tests/test_models_unet.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_models_unet.py b/tests/test_models_unet.py index 13a2bda73d44..d12b3017f3af 100644 --- a/tests/test_models_unet.py +++ b/tests/test_models_unet.py @@ -16,6 +16,7 @@ import math import unittest import gc +import tracemalloc import torch @@ -134,6 +135,7 @@ def test_from_pretrained_hub(self): assert image is not None, "Make sure output is not None" + @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") def 
test_from_pretrained_accelerate(self): model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto") model.to(torch_device) @@ -141,6 +143,7 @@ def test_from_pretrained_accelerate(self): assert image is not None, "Make sure output is not None" + @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") def test_from_pretrained_accelerate_wont_change_results(self): model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto") model_accelerate.to(torch_device) @@ -170,6 +173,31 @@ def test_from_pretrained_accelerate_wont_change_results(self): assert torch.allclose(arr_accelerate["sample"], arr_normal_load, rtol=1e-3) + @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") + def test_memory_footprint_gets_reduced(self): + torch.cuda.empty_cache() + gc.collect() + + tracemalloc.start() + model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto") + model_accelerate.to(torch_device) + model_accelerate.eval() + _, peak_accelerate = tracemalloc.get_traced_memory() + + + del model_accelerate + torch.cuda.empty_cache() + gc.collect() + + model_normal_load, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) + model_normal_load.to(torch_device) + model_normal_load.eval() + _, peak_normal = tracemalloc.get_traced_memory() + + tracemalloc.stop() + + assert peak_accelerate < peak_normal + def test_output_pretrained(self): model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update") model.eval() From ec5f7aa7583db1e6f8195bdc3afb2be928e33380 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Fri, 16 Sep 2022 13:07:03 -0300 Subject: [PATCH 06/25] move accelerate logic to single snippet under modelling utils and remove it from configuration utils --- src/diffusers/configuration_utils.py | 1 - src/diffusers/modeling_utils.py | 82 ++++++++++++++-------------- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 0b48fac039e4..c6082a50c2d1 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -24,7 +24,6 @@ from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError -import accelerate from requests import HTTPError from . 
import __version__ diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index 07e3cdf66346..54044670c0cc 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -266,46 +266,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # Load config if we don't provide a configuration config_path = pretrained_model_name_or_path - if device_map == "auto": - with accelerate.init_empty_weights(): - model, unused_kwargs = cls.from_config( - config_path, - cache_dir=cache_dir, - return_unused_kwargs=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - device_map=device_map, - **kwargs, - ) - else: - model, unused_kwargs = cls.from_config( - config_path, - cache_dir=cache_dir, - return_unused_kwargs=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - device_map=device_map, - **kwargs, - ) - - if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): - raise ValueError( - f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}." - ) - elif torch_dtype is not None: - model = model.to(torch_dtype) - model.register_to_config(_name_or_path=pretrained_model_name_or_path) # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # Load model pretrained_model_name_or_path = str(pretrained_model_name_or_path) @@ -378,20 +339,50 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # restore default dtype - if device_map is not None: + if device_map == "auto": + with accelerate.init_empty_weights(): + model, unused_kwargs = cls.from_config( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + device_map=device_map, + **kwargs, + ) + accelerate.load_checkpoint_and_dispatch( model, model_file, device_map ) + loading_info = { "missing_keys": [], "unexpected_keys": [], "mismatched_keys": [], "error_msgs": [], } - else: + model, unused_kwargs = cls.from_config( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + device_map=device_map, + **kwargs, + ) state_dict = load_state_dict(model_file) model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model( @@ -409,6 +400,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P "error_msgs": error_msgs, } + if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}." 
+            )
+        elif torch_dtype is not None:
+            model = model.to(torch_dtype)
+
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+
         # Set model in evaluation mode to deactivate DropOut modules by default
         model.eval()
         if output_loading_info:

From 8392e3ff65e4cfa6f912ef72fe7afcd71c2aeb91 Mon Sep 17 00:00:00 2001
From: Pi Esposito
Date: Fri, 16 Sep 2022 15:27:24 -0300
Subject: [PATCH 07/25] format code to pass quality check

---
 src/diffusers/modeling_utils.py | 12 ++++--------
 tests/test_models_unet.py       | 21 +++++++++++++--------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index c4e5799e11c7..53c0eedcdad0 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -359,12 +359,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 device_map=device_map,
                 **kwargs,
             )
-
-            accelerate.load_checkpoint_and_dispatch(
-                model,
-                model_file,
-                device_map
-            )
+
+            accelerate.load_checkpoint_and_dispatch(model, model_file, device_map)

             loading_info = {
                 "missing_keys": [],
@@ -387,7 +383,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 device_map=device_map,
                 **kwargs,
             )
-
+
             state_dict = load_state_dict(model_file)
             model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
                 model,
@@ -412,7 +408,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 model = model.to(torch_dtype)

         model.register_to_config(_name_or_path=pretrained_model_name_or_path)
-
+
         # Set model in evaluation mode to deactivate DropOut modules by default
         model.eval()
         if output_loading_info:
diff --git a/tests/test_models_unet.py b/tests/test_models_unet.py
index 012ba126a2ba..82e03630cdcb 100644
--- a/tests/test_models_unet.py
+++ b/tests/test_models_unet.py
@@ -137,7 +137,9 @@ def test_from_pretrained_hub(self):

     @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
     def test_from_pretrained_accelerate(self):
-        model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto")
+        model, _ = UNet2DModel.from_pretrained(
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
+        )
         model.to(torch_device)
         image = model(**self.dummy_input).sample

@@ -145,7 +147,9 @@ def test_from_pretrained_accelerate(self):
     @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
     def test_from_pretrained_accelerate_wont_change_results(self):
-        model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto")
+        model_accelerate, _ = UNet2DModel.from_pretrained(
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
+        )
         model_accelerate.to(torch_device)
         model_accelerate.eval()

@@ -170,20 +174,21 @@ def test_from_pretrained_accelerate_wont_change_results(self):
         model_normal_load.to(torch_device)
         model_normal_load.eval()
         arr_normal_load = model_normal_load(noise, time_step)["sample"]
-
+
         assert torch.allclose(arr_accelerate["sample"], arr_normal_load, rtol=1e-3)

     @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
     def test_memory_footprint_gets_reduced(self):
         torch.cuda.empty_cache()
         gc.collect()
-
+
         tracemalloc.start()
-        model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto")
+        model_accelerate, _ = UNet2DModel.from_pretrained(
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
+        )
         model_accelerate.to(torch_device)
         model_accelerate.eval()
-        _, peak_accelerate = tracemalloc.get_traced_memory()
-
+        _, peak_accelerate = tracemalloc.get_traced_memory()

         del model_accelerate
         torch.cuda.empty_cache()
@@ -192,7 +197,7 @@ def test_memory_footprint_gets_reduced(self):
         model_normal_load, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
         model_normal_load.to(torch_device)
         model_normal_load.eval()
-        _, peak_normal = tracemalloc.get_traced_memory()
+        _, peak_normal = tracemalloc.get_traced_memory()

         tracemalloc.stop()

From 615054affbfb6cf22a13bedd65879908c8421ee0 Mon Sep 17 00:00:00 2001
From: Pi Esposito
Date: Fri, 16 Sep 2022 15:30:37 -0300
Subject: [PATCH 08/25] fix imports with isort

---
 src/diffusers/modeling_utils.py | 2 +-
 tests/test_models_unet.py       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index 53c0eedcdad0..e2f55e5fd1d1 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -20,9 +20,9 @@
 import torch
 from torch import Tensor, device

+import accelerate
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
-import accelerate
 from requests import HTTPError

 from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging
diff --git a/tests/test_models_unet.py b/tests/test_models_unet.py
index 82e03630cdcb..fe49916a7146 100644
--- a/tests/test_models_unet.py
+++ b/tests/test_models_unet.py
@@ -13,10 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math -import unittest import gc +import math import tracemalloc +import unittest import torch From 75c08a94625f45b8f5f1b707608eeb799112ca07 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Fri, 16 Sep 2022 16:19:46 -0300 Subject: [PATCH 09/25] add accelerate to test extra deps --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7f4a8034317a..cafc6b0ec909 100644 --- a/setup.py +++ b/setup.py @@ -175,7 +175,7 @@ def run(self): extras["quality"] = ["black==22.8", "isort>=5.5.4", "flake8>=3.8.3", "hf-doc-builder"] extras["docs"] = ["hf-doc-builder"] extras["training"] = ["accelerate", "datasets", "tensorboard", "modelcards"] -extras["test"] = ["datasets", "onnxruntime", "pytest", "pytest-timeout", "pytest-xdist", "scipy", "transformers"] +extras["test"] = ["datasets", "onnxruntime", "pytest", "pytest-timeout", "pytest-xdist", "scipy", "transformers", "accelerate"] extras["torch"] = deps_list("torch") if os.name == "nt": # windows From 6189b86f06574b320c02a330c5f9543f12f3b53e Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Wed, 21 Sep 2022 11:25:08 -0300 Subject: [PATCH 10/25] only import accelerate if device_map is set to auto --- src/diffusers/modeling_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index e2f55e5fd1d1..dcc59c90578b 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -20,10 +20,10 @@ import torch from torch import Tensor, device -import accelerate from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from requests import HTTPError +from transformers.utils import is_accelerate_available from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging @@ -344,6 +344,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # restore default dtype if device_map == "auto": + if is_accelerate_available(): + import accelerate + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + with accelerate.init_empty_weights(): model, unused_kwargs = cls.from_config( config_path, From bc510615c7aa68c26c1fa5050fdc23b8cda6cc9b Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Thu, 22 Sep 2022 11:08:31 -0300 Subject: [PATCH 11/25] move accelerate availability check to diffusers import utils --- src/diffusers/modeling_utils.py | 2 +- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/import_utils.py | 9 +++++++++ tests/test_models_unet.py | 2 +- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index 7cb9ee7f4f0f..bd9c4fe34ab3 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -23,7 +23,7 @@ from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from requests import HTTPError -from transformers.utils import is_accelerate_available +from diffusers.utils import is_accelerate_available from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, WEIGHTS_NAME, logging diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index b63dbd2b285c..32ab54079046 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -32,6 +32,7 @@ is_transformers_available, 
is_unidecode_available, requires_backends, + is_accelerate_available ) from .logging import get_logger from .outputs import BaseOutput diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index de344d074da0..f5a08a916eea 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -159,6 +159,12 @@ except importlib_metadata.PackageNotFoundError: _scipy_available = False +_accelerate_available = importlib.util.find_spec("accelerate") is not None +try: + _accelerate_version = importlib_metadata.version("accelerate") + logger.debug(f"Successfully imported accelerate version {_accelerate_version}") +except importlib_metadata.PackageNotFoundError: + _accelerate_available = False def is_torch_available(): return _torch_available @@ -195,6 +201,9 @@ def is_onnx_available(): def is_scipy_available(): return _scipy_available +def is_accelerate_available(): + return _accelerate_available + # docstyle-ignore FLAX_IMPORT_ERROR = """ diff --git a/tests/test_models_unet.py b/tests/test_models_unet.py index cd4eb3fb782d..59af3832b7a0 100644 --- a/tests/test_models_unet.py +++ b/tests/test_models_unet.py @@ -175,7 +175,7 @@ def test_from_pretrained_accelerate_wont_change_results(self): model_normal_load.eval() arr_normal_load = model_normal_load(noise, time_step)["sample"] - assert torch.allclose(arr_accelerate["sample"], arr_normal_load, rtol=1e-3) + assert torch.allclose(arr_accelerate, arr_normal_load, rtol=1e-3) @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") def test_memory_footprint_gets_reduced(self): From e020d73359ce3765fa83156c4b40f2e8bb3049d6 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Thu, 22 Sep 2022 11:17:53 -0300 Subject: [PATCH 12/25] format code --- src/diffusers/modeling_utils.py | 2 +- src/diffusers/utils/__init__.py | 2 +- src/diffusers/utils/import_utils.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index c7d4cd6f5e9e..4d609043d731 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -21,10 +21,10 @@ import torch from torch import Tensor, device +from diffusers.utils import is_accelerate_available from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from requests import HTTPError -from diffusers.utils import is_accelerate_available from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, WEIGHTS_NAME, logging diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 32ab54079046..9c428b6d7852 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -22,6 +22,7 @@ USE_TF, USE_TORCH, DummyObject, + is_accelerate_available, is_flax_available, is_inflect_available, is_modelcards_available, @@ -32,7 +33,6 @@ is_transformers_available, is_unidecode_available, requires_backends, - is_accelerate_available ) from .logging import get_logger from .outputs import BaseOutput diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f5a08a916eea..b2aabee70c92 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -166,6 +166,7 @@ except importlib_metadata.PackageNotFoundError: _accelerate_available = False + def is_torch_available(): return _torch_available @@ -201,6 +202,7 @@ def is_onnx_available(): def is_scipy_available(): 
return _scipy_available + def is_accelerate_available(): return _accelerate_available From 6206595cce392f088cb918919d05333796eff167 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Tue, 4 Oct 2022 21:27:17 -0300 Subject: [PATCH 13/25] add device map to pipeline abstraction --- src/diffusers/pipeline_utils.py | 10 +++++++++ tests/test_pipelines.py | 39 ++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index fb8801bc959a..b447db839bf0 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -283,6 +283,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P torch_dtype = kwargs.pop("torch_dtype", None) provider = kwargs.pop("provider", None) sess_options = kwargs.pop("sess_options", None) + device_map = kwargs.pop("device_map", None) # 1. Download the checkpoints and configs # use snapshot download here to get it working from from_pretrained @@ -401,6 +402,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P loading_kwargs["provider"] = provider loading_kwargs["sess_options"] = sess_options + if library_name == "diffusers": + loading_kwargs["device_map"] = device_map + + # if using transformers and class obj has no _no_split modules, using device map will break loading + elif library_name == "transformers": + if getattr(class_obj, "_no_split_modules", None) is not None: + loading_kwargs["device_map"] = device_map + + # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 78a22ec3138b..4f4d37e54041 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -17,6 +17,7 @@ import os import random import tempfile +import tracemalloc import unittest import numpy as np @@ -473,7 +474,7 @@ def test_stable_diffusion_k_lms(self): expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - + def test_stable_diffusion_attention_chunk(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator unet = self.dummy_cond_unet @@ -1608,3 +1609,39 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: pipe(prompt=prompt, num_inference_steps=5, guidance_scale=7.5, callback=test_callback_fn, callback_steps=1) assert test_callback_fn.has_been_called assert number_of_steps == 6 + + # @slow + @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") + def test_stable_diffusion_accelerate_load_works(self): + model_id = "CompVis/stable-diffusion-v1-4" + _ = StableDiffusionPipeline.from_pretrained( + model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" + ).to(torch_device) + + @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") + def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): + pipeline_id = "CompVis/stable-diffusion-v1-4" + + torch.cuda.empty_cache() + gc.collect() + + tracemalloc.start() + pipeline_accelerate = StableDiffusionPipeline.from_pretrained( + pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" + ) + 
pipeline_accelerate.to(torch_device) + _, peak_accelerate = tracemalloc.get_traced_memory() + + del pipeline_accelerate + torch.cuda.empty_cache() + gc.collect() + + pipeline_normal_load = StableDiffusionPipeline.from_pretrained( + pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" + ) + pipeline_normal_load.to(torch_device) + _, peak_normal = tracemalloc.get_traced_memory() + + tracemalloc.stop() + + assert peak_accelerate < peak_normal \ No newline at end of file From 8912f53f0c5db153491c7e5b53b3788cb1135501 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Tue, 4 Oct 2022 21:28:13 -0300 Subject: [PATCH 14/25] lint it to pass PR quality check --- src/diffusers/pipeline_utils.py | 1 - tests/test_pipelines.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index b447db839bf0..0e65f392a880 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -410,7 +410,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P if getattr(class_obj, "_no_split_modules", None) is not None: loading_kwargs["device_map"] = device_map - # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 4f4d37e54041..eb6b97b9048c 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -474,7 +474,7 @@ def test_stable_diffusion_k_lms(self): expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - + def test_stable_diffusion_attention_chunk(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator unet = self.dummy_cond_unet @@ -1644,4 +1644,4 @@ def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): tracemalloc.stop() - assert peak_accelerate < peak_normal \ No newline at end of file + assert peak_accelerate < peak_normal From fd8829f228387ffbf8b67181861a36c2cffe3207 Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Tue, 4 Oct 2022 21:47:55 -0300 Subject: [PATCH 15/25] fix class check to use accelerate when using diffusers ModelMixin subclasses --- src/diffusers/pipeline_utils.py | 2 +- tests/test_pipelines.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 0e65f392a880..9110b0791831 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -402,7 +402,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P loading_kwargs["provider"] = provider loading_kwargs["sess_options"] = sess_options - if library_name == "diffusers": + if issubclass(class_obj, diffusers.ModelMixin): loading_kwargs["device_map"] = device_map # if using transformers and class obj has no _no_split modules, using device map will break loading diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index eb6b97b9048c..7b0c227187d5 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -1626,21 +1626,22 @@ def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): gc.collect() tracemalloc.start() - pipeline_accelerate = 
StableDiffusionPipeline.from_pretrained( - pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" + pipeline_normal_load = StableDiffusionPipeline.from_pretrained( + pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True ) - pipeline_accelerate.to(torch_device) - _, peak_accelerate = tracemalloc.get_traced_memory() + pipeline_normal_load.to(torch_device) + _, peak_normal = tracemalloc.get_traced_memory() + tracemalloc.stop() - del pipeline_accelerate + del pipeline_normal_load torch.cuda.empty_cache() gc.collect() - pipeline_normal_load = StableDiffusionPipeline.from_pretrained( + tracemalloc.start() + _ = StableDiffusionPipeline.from_pretrained( pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" ) - pipeline_normal_load.to(torch_device) - _, peak_normal = tracemalloc.get_traced_memory() + _, peak_accelerate = tracemalloc.get_traced_memory() tracemalloc.stop() From 8132cfd4ab9c4154d4422335ed138dc3328d6fad Mon Sep 17 00:00:00 2001 From: Pi Esposito Date: Wed, 5 Oct 2022 09:27:58 -0300 Subject: [PATCH 16/25] use low_cpu_mem_usage in transformers if device_map is not available --- src/diffusers/pipeline_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 9110b0791831..787eb88897ff 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -406,9 +406,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P loading_kwargs["device_map"] = device_map # if using transformers and class obj has no _no_split modules, using device map will break loading + # so we just use low cpu memory usage to reduce ram usage on cpu elif library_name == "transformers": if getattr(class_obj, "_no_split_modules", None) is not None: loading_kwargs["device_map"] = device_map + else: + loading_kwargs["low_cpu_mem_usage"] = True # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): From 1fd2ea418dc644d5ec21152809e6f9ca37cf6e8b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 14:27:12 +0000 Subject: [PATCH 17/25] NoModuleLayer --- src/diffusers/pipeline_utils.py | 16 ++++++---------- .../pipelines/stable_diffusion/safety_checker.py | 2 ++ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 787eb88897ff..85cf88c674b1 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -31,7 +31,11 @@ from .configuration_utils import ConfigMixin from .schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from .utils import CONFIG_NAME, DIFFUSERS_CACHE, ONNX_WEIGHTS_NAME, WEIGHTS_NAME, BaseOutput, logging +from .utils import CONFIG_NAME, DIFFUSERS_CACHE, ONNX_WEIGHTS_NAME, WEIGHTS_NAME, BaseOutput, logging, is_transformers_available + + +if is_transformers_available(): + from transformers import PreTrainedModel INDEX_FILE = "diffusion_pytorch_model.bin" @@ -402,17 +406,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P loading_kwargs["provider"] = provider loading_kwargs["sess_options"] = sess_options - if issubclass(class_obj, diffusers.ModelMixin): + if issubclass(class_obj, diffusers.ModelMixin) or is_transformers_available() and issubclass(class_obj, PreTrainedModel): loading_kwargs["device_map"] = device_map - # if using transformers and class obj has no _no_split modules, 
using device map will break loading - # so we just use low cpu memory usage to reduce ram usage on cpu - elif library_name == "transformers": - if getattr(class_obj, "_no_split_modules", None) is not None: - loading_kwargs["device_map"] = device_map - else: - loading_kwargs["low_cpu_mem_usage"] = True - # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 773a7d4b2107..ea2b821b2da9 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -19,6 +19,8 @@ def cosine_distance(image_embeds, text_embeds): class StableDiffusionSafetyChecker(PreTrainedModel): config_class = CLIPConfig + _no_split_modules = ["CLIPEncoderLayer"] + def __init__(self, config: CLIPConfig): super().__init__(config) From c2d9a84c6bf878ec689c7a1c153146ac6d06aa09 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 14:36:27 +0000 Subject: [PATCH 18/25] comment out tests --- tests/test_pipelines.py | 70 ++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index e722fd848e0e..2751a3bb1c97 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -1642,38 +1642,38 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: assert number_of_steps == 6 # @slow - @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") - def test_stable_diffusion_accelerate_load_works(self): - model_id = "CompVis/stable-diffusion-v1-4" - _ = StableDiffusionPipeline.from_pretrained( - model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" - ).to(torch_device) - - @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") - def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): - pipeline_id = "CompVis/stable-diffusion-v1-4" - - torch.cuda.empty_cache() - gc.collect() - - tracemalloc.start() - pipeline_normal_load = StableDiffusionPipeline.from_pretrained( - pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True - ) - pipeline_normal_load.to(torch_device) - _, peak_normal = tracemalloc.get_traced_memory() - tracemalloc.stop() - - del pipeline_normal_load - torch.cuda.empty_cache() - gc.collect() - - tracemalloc.start() - _ = StableDiffusionPipeline.from_pretrained( - pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" - ) - _, peak_accelerate = tracemalloc.get_traced_memory() - - tracemalloc.stop() - - assert peak_accelerate < peak_normal +# @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") +# def test_stable_diffusion_accelerate_load_works(self): +# model_id = "CompVis/stable-diffusion-v1-4" +# _ = StableDiffusionPipeline.from_pretrained( +# model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" +# ).to(torch_device) +# +# @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") +# def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): +# pipeline_id = "CompVis/stable-diffusion-v1-4" +# +# torch.cuda.empty_cache() +# gc.collect() +# +# tracemalloc.start() +# 
pipeline_normal_load = StableDiffusionPipeline.from_pretrained( +# pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True +# ) +# pipeline_normal_load.to(torch_device) +# _, peak_normal = tracemalloc.get_traced_memory() +# tracemalloc.stop() +# +# del pipeline_normal_load +# torch.cuda.empty_cache() +# gc.collect() +# +# tracemalloc.start() +# _ = StableDiffusionPipeline.from_pretrained( +# pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" +# ) +# _, peak_accelerate = tracemalloc.get_traced_memory() +# +# tracemalloc.stop() +# +# assert peak_accelerate < peak_normal From a7bb7f81858724d0b7ec1e200bbcfbd2008621ba Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 14:36:47 +0000 Subject: [PATCH 19/25] up --- src/diffusers/pipeline_utils.py | 16 ++++++++++++++-- tests/test_pipelines.py | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 85cf88c674b1..4945003c23be 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -31,7 +31,15 @@ from .configuration_utils import ConfigMixin from .schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from .utils import CONFIG_NAME, DIFFUSERS_CACHE, ONNX_WEIGHTS_NAME, WEIGHTS_NAME, BaseOutput, logging, is_transformers_available +from .utils import ( + CONFIG_NAME, + DIFFUSERS_CACHE, + ONNX_WEIGHTS_NAME, + WEIGHTS_NAME, + BaseOutput, + is_transformers_available, + logging, +) if is_transformers_available(): @@ -406,7 +414,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P loading_kwargs["provider"] = provider loading_kwargs["sess_options"] = sess_options - if issubclass(class_obj, diffusers.ModelMixin) or is_transformers_available() and issubclass(class_obj, PreTrainedModel): + if ( + issubclass(class_obj, diffusers.ModelMixin) + or is_transformers_available() + and issubclass(class_obj, PreTrainedModel) + ): loading_kwargs["device_map"] = device_map # check if the module is in a subdirectory diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 2751a3bb1c97..9515a2791765 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -1642,6 +1642,8 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: assert number_of_steps == 6 # @slow + + # @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") # def test_stable_diffusion_accelerate_load_works(self): # model_id = "CompVis/stable-diffusion-v1-4" From 0a9bcd9af0a59c3011c63d0fcf4b821084fe8d04 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 14:39:56 +0000 Subject: [PATCH 20/25] uP --- tests/test_pipelines.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 9515a2791765..d1ce9ecc47f0 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -17,7 +17,6 @@ import os import random import tempfile -import tracemalloc import unittest import numpy as np From 6d0bbbaca7fed682063f26abd6ac2788d855ab54 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 16:20:48 +0000 Subject: [PATCH 21/25] finish --- src/diffusers/pipelines/stable_diffusion/safety_checker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index ea2b821b2da9..11d2b13273c2 100644 
--- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -20,6 +20,7 @@ class StableDiffusionSafetyChecker(PreTrainedModel): config_class = CLIPConfig _no_split_modules = ["CLIPEncoderLayer"] + base_model_prefix = "vision_model" def __init__(self, config: CLIPConfig): super().__init__(config) @@ -30,8 +31,8 @@ def __init__(self, config: CLIPConfig): self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False) self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False) - self.register_buffer("concept_embeds_weights", torch.ones(17)) - self.register_buffer("special_care_embeds_weights", torch.ones(3)) + self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False) + self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False) @torch.no_grad() def forward(self, clip_input, images): From e5990335e92c0ff0e8e6bbdbfd2942c09788fcef Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 18:22:38 +0200 Subject: [PATCH 22/25] Update src/diffusers/pipelines/stable_diffusion/safety_checker.py --- src/diffusers/pipelines/stable_diffusion/safety_checker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 11d2b13273c2..3984171f57db 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -20,7 +20,6 @@ class StableDiffusionSafetyChecker(PreTrainedModel): config_class = CLIPConfig _no_split_modules = ["CLIPEncoderLayer"] - base_model_prefix = "vision_model" def __init__(self, config: CLIPConfig): super().__init__(config) From dfaabd6a3fbaab274949d65167a2db448543876f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 16:36:45 +0000 Subject: [PATCH 23/25] finish --- tests/test_pipelines.py | 87 ++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index d1ce9ecc47f0..147006794273 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -17,12 +17,15 @@ import os import random import tempfile +import tracemalloc import unittest import numpy as np import torch +import accelerate import PIL +import transformers from diffusers import ( AutoencoderKL, DDIMPipeline, @@ -49,6 +52,7 @@ from diffusers.pipeline_utils import DiffusionPipeline from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, load_image, slow, torch_device +from packaging import version from PIL import Image from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer @@ -1640,41 +1644,52 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: assert test_callback_fn.has_been_called assert number_of_steps == 6 - # @slow + @slow + @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") + def test_stable_diffusion_accelerate_load_works(self): + if version.parse(version.parse(transformers.__version__).base_version) < "4.23": + return + if version.parse(version.parse(accelerate.__version__).base_version) < "0.14": + return -# @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") -# def 
test_stable_diffusion_accelerate_load_works(self): -# model_id = "CompVis/stable-diffusion-v1-4" -# _ = StableDiffusionPipeline.from_pretrained( -# model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" -# ).to(torch_device) -# -# @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") -# def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): -# pipeline_id = "CompVis/stable-diffusion-v1-4" -# -# torch.cuda.empty_cache() -# gc.collect() -# -# tracemalloc.start() -# pipeline_normal_load = StableDiffusionPipeline.from_pretrained( -# pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True -# ) -# pipeline_normal_load.to(torch_device) -# _, peak_normal = tracemalloc.get_traced_memory() -# tracemalloc.stop() -# -# del pipeline_normal_load -# torch.cuda.empty_cache() -# gc.collect() -# -# tracemalloc.start() -# _ = StableDiffusionPipeline.from_pretrained( -# pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" -# ) -# _, peak_accelerate = tracemalloc.get_traced_memory() -# -# tracemalloc.stop() -# -# assert peak_accelerate < peak_normal + model_id = "CompVis/stable-diffusion-v1-4" + _ = StableDiffusionPipeline.from_pretrained( + model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" + ).to(torch_device) + + @slow + @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") + def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): + if version.parse(version.parse(transformers.__version__).base_version) < "4.23": + return + + if version.parse(version.parse(accelerate.__version__).base_version) < "0.14": + return + + pipeline_id = "CompVis/stable-diffusion-v1-4" + + torch.cuda.empty_cache() + gc.collect() + + tracemalloc.start() + pipeline_normal_load = StableDiffusionPipeline.from_pretrained( + pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True + ) + pipeline_normal_load.to(torch_device) + _, peak_normal = tracemalloc.get_traced_memory() + tracemalloc.stop() + + del pipeline_normal_load + torch.cuda.empty_cache() + gc.collect() + + tracemalloc.start() + _ = StableDiffusionPipeline.from_pretrained( + pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto" + ) + _, peak_accelerate = tracemalloc.get_traced_memory() + + tracemalloc.stop() + + assert peak_accelerate < peak_normal From bbc05c5e55ccd5707be4ef55c8ab4af1c501bdaf Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 7 Oct 2022 16:43:45 +0000 Subject: [PATCH 24/25] uP --- tests/test_pipelines.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 147006794273..0027149406db 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -1647,10 +1647,10 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: @slow @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU") def test_stable_diffusion_accelerate_load_works(self): - if version.parse(version.parse(transformers.__version__).base_version) < "4.23": + if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"): return - if version.parse(version.parse(accelerate.__version__).base_version) < "0.14": + if version.parse(version.parse(accelerate.__version__).base_version) < version.parse("0.14"): return model_id = 
"CompVis/stable-diffusion-v1-4" @@ -1661,10 +1661,10 @@ def test_stable_diffusion_accelerate_load_works(self): @slow @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU") def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self): - if version.parse(version.parse(transformers.__version__).base_version) < "4.23": + if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"): return - if version.parse(version.parse(accelerate.__version__).base_version) < "0.14": + if version.parse(version.parse(accelerate.__version__).base_version) < version.parse("0.14"): return pipeline_id = "CompVis/stable-diffusion-v1-4" From dcf7e0fc82342ecfe76b0aadcc9d1eb25e317d4c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 10 Oct 2022 15:12:54 +0200 Subject: [PATCH 25/25] make style --- tests/test_pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 55129352c680..30beb033fca7 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -52,8 +52,8 @@ from diffusers.pipeline_utils import DiffusionPipeline from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, load_image, slow, torch_device -from packaging import version from diffusers.utils.testing_utils import get_tests_dir +from packaging import version from PIL import Image from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer