2 changes: 1 addition & 1 deletion src/diffusers/models/attention.py
@@ -69,7 +69,7 @@ def __init__(
         self.value = nn.Linear(channels, channels)

         self.rescale_output_factor = rescale_output_factor
-        self.proj_attn = nn.Linear(channels, channels, 1)
+        self.proj_attn = nn.Linear(channels, channels, bias=True)

         self._use_memory_efficient_attention_xformers = False
         self._attention_op = None
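Side note on the `proj_attn` change above: the third positional parameter of `nn.Linear` is `bias`, so the old call `nn.Linear(channels, channels, 1)` already created a bias term (the `1` is simply truthy); writing `bias=True` only makes the intent explicit. A minimal sketch using plain `torch` (no diffusers imports) that checks the two spellings behave the same:

```python
import torch
from torch import nn

# nn.Linear(in_features, out_features, bias=True) -- the third positional
# argument is `bias`, and `1` is truthy, so both layers get a bias term.
implicit = nn.Linear(8, 8, 1)
explicit = nn.Linear(8, 8, bias=True)

assert implicit.bias is not None
assert explicit.bias is not None
assert implicit.weight.shape == explicit.weight.shape == torch.Size([8, 8])
```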
16 changes: 8 additions & 8 deletions src/diffusers/models/controlnet.py
@@ -344,7 +344,7 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
             `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
                 The instantiated processor class or a dictionary of processor classes that will be set as the processor
                 of **all** `Attention` layers.
-            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.:
+            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.:

         """
         count = len(self.attn_processors.keys())
@@ -379,34 +379,34 @@ def set_attention_slice(self, slice_size):
         Args:
             slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                 When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
+                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
                 provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                 must be a multiple of `slice_size`.
         """
         sliceable_head_dims = []

-        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
             if hasattr(module, "set_attention_slice"):
                 sliceable_head_dims.append(module.sliceable_head_dim)

             for child in module.children():
-                fn_recursive_retrieve_slicable_dims(child)
+                fn_recursive_retrieve_sliceable_dims(child)

         # retrieve number of attention layers
         for module in self.children():
-            fn_recursive_retrieve_slicable_dims(module)
+            fn_recursive_retrieve_sliceable_dims(module)

-        num_slicable_layers = len(sliceable_head_dims)
+        num_sliceable_layers = len(sliceable_head_dims)

         if slice_size == "auto":
             # half the attention head size is usually a good trade-off between
             # speed and memory
             slice_size = [dim // 2 for dim in sliceable_head_dims]
         elif slice_size == "max":
             # make smallest slice possible
-            slice_size = num_slicable_layers * [1]
+            slice_size = num_sliceable_layers * [1]

-        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size

         if len(slice_size) != len(sliceable_head_dims):
             raise ValueError(
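For reference, a short usage sketch of the `set_attention_slice` API documented above; the checkpoint name is only an example, and any ControlNet weights work the same way:

```python
import torch
from diffusers import ControlNetModel

# example checkpoint; substitute any ControlNet weights you have available
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
)

controlnet.set_attention_slice("auto")  # each sliceable layer attends in two steps (head_dim // 2)
controlnet.set_attention_slice("max")   # one slice at a time: lowest memory, slowest
controlnet.set_attention_slice(2)       # attention_head_dim must be a multiple of 2
```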
4 changes: 2 additions & 2 deletions src/diffusers/models/modeling_utils.py
@@ -575,7 +575,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 raise ValueError(
                     f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
                     f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
-                    " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomely initialize"
+                    " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
                     " those weights or else make sure your checkpoint file is correct."
                 )

@@ -591,7 +591,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     set_module_tensor_to_device(model, param_name, param_device, value=param)
             else: # else let accelerate handle loading and dispatching.
                 # Load weights and dispatch according to the device_map
-                # by deafult the device_map is None and the weights are loaded on the CPU
+                # by default the device_map is None and the weights are loaded on the CPU
                 accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype)

             loading_info = {
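The error message fixed above also points at the recovery path. A rough sketch of how a caller would opt out of the accelerate fast-loading path so that missing weights are randomly initialized; the model class and Hub repo are illustrative:

```python
from diffusers import UNet2DConditionModel

# With low_cpu_mem_usage=True (the default when accelerate is installed), a
# checkpoint that lacks some expected keys raises the ValueError above.
# Falling back to regular loading lets those weights be randomly initialized.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example repo on the Hub
    subfolder="unet",
    low_cpu_mem_usage=False,
    device_map=None,
)
```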
2 changes: 1 addition & 1 deletion src/diffusers/models/resnet.py
@@ -418,7 +418,7 @@ class ResnetBlock2D(nn.Module):
         time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
             By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
             "ada_group" for a stronger conditioning with scale and shift.
-        kernal (`torch.FloatTensor`, optional, default to None): FIR filter, see
+        kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
             [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
         output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
         use_in_shortcut (`bool`, *optional*, default to `True`):
6 changes: 3 additions & 3 deletions src/diffusers/models/transformer_2d.py
@@ -105,7 +105,7 @@ def __init__(
         self.attention_head_dim = attention_head_dim
         inner_dim = num_attention_heads * attention_head_dim

-        # 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
         # Define whether input is continuous or discrete depending on configuration
         self.is_input_continuous = (in_channels is not None) and (patch_size is None)
         self.is_input_vectorized = num_vector_embeds is not None
@@ -198,7 +198,7 @@ def __init__(
         # 4. Define output layers
         self.out_channels = in_channels if out_channels is None else out_channels
         if self.is_input_continuous:
-            # TODO: should use out_channels for continous projections
+            # TODO: should use out_channels for continuous projections
             if use_linear_projection:
                 self.proj_out = nn.Linear(inner_dim, in_channels)
             else:
@@ -223,7 +223,7 @@ def forward(
         """
         Args:
             hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
-                When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
+                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
                 hidden_states
             encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                 Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
2 changes: 1 addition & 1 deletion src/diffusers/models/unet_1d.py
@@ -59,7 +59,7 @@ class UNet1DModel(ModelMixin, ConfigMixin):
             obj:`(32, 32, 64)`): Tuple of block output channels.
         mid_block_type (`str`, *optional*, defaults to "UNetMidBlock1D"): block type for middle of UNet.
         out_block_type (`str`, *optional*, defaults to `None`): optional output processing of UNet.
-        act_fn (`str`, *optional*, defaults to None): optional activitation function in UNet blocks.
+        act_fn (`str`, *optional*, defaults to None): optional activation function in UNet blocks.
         norm_num_groups (`int`, *optional*, defaults to 8): group norm member count in UNet blocks.
         layers_per_block (`int`, *optional*, defaults to 1): added number of layers in a UNet block.
         downsample_each_block (`int`, *optional*, defaults to False:
2 changes: 1 addition & 1 deletion src/diffusers/models/unet_1d_blocks.py
@@ -331,7 +331,7 @@ def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
         self.key = nn.Linear(self.channels, self.channels)
         self.value = nn.Linear(self.channels, self.channels)

-        self.proj_attn = nn.Linear(self.channels, self.channels, 1)
+        self.proj_attn = nn.Linear(self.channels, self.channels, bias=True)

         self.dropout = nn.Dropout(dropout_rate, inplace=True)

2 changes: 1 addition & 1 deletion src/diffusers/models/unet_2d_blocks.py
@@ -2684,7 +2684,7 @@ def __init__(
             dropout=dropout,
             bias=attention_bias,
             cross_attention_dim=None,
-            cross_attention_norm=None,
+            cross_attention_norm=False,
         )

         # 2. Cross-Attn
20 changes: 10 additions & 10 deletions src/diffusers/models/unet_2d_condition.py
@@ -197,7 +197,7 @@ def __init__(
             timestep_input_dim = block_out_channels[0]
         else:
             raise ValueError(
-                f"{time_embedding_type} does not exist. Pleaes make sure to use one of `fourier` or `positional`."
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
             )

         self.time_embedding = TimestepEmbedding(
@@ -391,7 +391,7 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
             `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
                 The instantiated processor class or a dictionary of processor classes that will be set as the processor
                 of **all** `Attention` layers.
-            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.:
+            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.:

         """
         count = len(self.attn_processors.keys())
@@ -425,34 +425,34 @@ def set_attention_slice(self, slice_size):
         Args:
             slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                 When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
+                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
                 provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                 must be a multiple of `slice_size`.
         """
         sliceable_head_dims = []

-        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
             if hasattr(module, "set_attention_slice"):
                 sliceable_head_dims.append(module.sliceable_head_dim)

             for child in module.children():
-                fn_recursive_retrieve_slicable_dims(child)
+                fn_recursive_retrieve_sliceable_dims(child)

         # retrieve number of attention layers
         for module in self.children():
-            fn_recursive_retrieve_slicable_dims(module)
+            fn_recursive_retrieve_sliceable_dims(module)

-        num_slicable_layers = len(sliceable_head_dims)
+        num_sliceable_layers = len(sliceable_head_dims)

         if slice_size == "auto":
             # half the attention head size is usually a good trade-off between
             # speed and memory
             slice_size = [dim // 2 for dim in sliceable_head_dims]
         elif slice_size == "max":
             # make smallest slice possible
-            slice_size = num_slicable_layers * [1]
+            slice_size = num_sliceable_layers * [1]

-        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size

         if len(slice_size) != len(sliceable_head_dims):
             raise ValueError(
@@ -515,7 +515,7 @@ def forward(
                 returning a tuple, the first element is the sample tensor.
         """
         # By default samples have to be AT least a multiple of the overall upsampling factor.
-        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
         # However, the upsampling interpolation output size can be forced to fit any upsampling size
         # on the fly if necessary.
         default_overall_up_factor = 2**self.num_upsamplers
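As the docstring above notes, `set_attn_processor` also accepts a dict keyed by the module path of each attention processor. A sketch of that usage, assuming a diffusers version where `AttnProcessor` is importable from `diffusers.models.attention_processor`; the Hub repo is an example:

```python
from diffusers import UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessor

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"  # example repo
)

# `attn_processors` maps paths such as
# "down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor"
# to processor instances; set_attn_processor expects the same keys when given a dict.
unet.set_attn_processor({name: AttnProcessor() for name in unet.attn_processors})
```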
2 changes: 1 addition & 1 deletion src/diffusers/pipelines/pipeline_utils.py
@@ -1351,7 +1351,7 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
         Args:
             slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
                 When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
+                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
                 provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                 must be a multiple of `slice_size`.
         """
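A typical call site for `enable_attention_slicing`, matching the semantics documented above; the checkpoint name is an example and the fp16 variant shown assumes a CUDA device:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16  # example repo
).to("cuda")

# "auto" halves each attention head dim; "max" or an int trades speed for lower peak memory
pipe.enable_attention_slicing("auto")
image = pipe("an astronaut riding a horse").images[0]
```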
20 changes: 10 additions & 10 deletions src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -287,7 +287,7 @@ def __init__(
             timestep_input_dim = block_out_channels[0]
         else:
             raise ValueError(
-                f"{time_embedding_type} does not exist. Pleaes make sure to use one of `fourier` or `positional`."
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
             )

         self.time_embedding = TimestepEmbedding(
@@ -481,7 +481,7 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
             `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
                 The instantiated processor class or a dictionary of processor classes that will be set as the processor
                 of **all** `Attention` layers.
-            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.:
+            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.:

         """
         count = len(self.attn_processors.keys())
@@ -515,34 +515,34 @@ def set_attention_slice(self, slice_size):
         Args:
             slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                 When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
+                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
                 provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                 must be a multiple of `slice_size`.
         """
         sliceable_head_dims = []

-        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
             if hasattr(module, "set_attention_slice"):
                 sliceable_head_dims.append(module.sliceable_head_dim)

             for child in module.children():
-                fn_recursive_retrieve_slicable_dims(child)
+                fn_recursive_retrieve_sliceable_dims(child)

         # retrieve number of attention layers
         for module in self.children():
-            fn_recursive_retrieve_slicable_dims(module)
+            fn_recursive_retrieve_sliceable_dims(module)

-        num_slicable_layers = len(sliceable_head_dims)
+        num_sliceable_layers = len(sliceable_head_dims)

         if slice_size == "auto":
             # half the attention head size is usually a good trade-off between
             # speed and memory
             slice_size = [dim // 2 for dim in sliceable_head_dims]
         elif slice_size == "max":
             # make smallest slice possible
-            slice_size = num_slicable_layers * [1]
+            slice_size = num_sliceable_layers * [1]

-        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size

         if len(slice_size) != len(sliceable_head_dims):
             raise ValueError(
@@ -605,7 +605,7 @@ def forward(
                 returning a tuple, the first element is the sample tensor.
         """
         # By default samples have to be AT least a multiple of the overall upsampling factor.
-        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
         # However, the upsampling interpolation output size can be forced to fit any upsampling size
         # on the fly if necessary.
         default_overall_up_factor = 2**self.num_upsamplers
10 changes: 5 additions & 5 deletions tests/models/test_models_unet_2d_condition.py
@@ -223,23 +223,23 @@ def test_model_attention_slicing(self):
             output = model(**inputs_dict)
         assert output is not None

-    def test_model_slicable_head_dim(self):
+    def test_model_sliceable_head_dim(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

         init_dict["attention_head_dim"] = (8, 16)

         model = self.model_class(**init_dict)

-        def check_slicable_dim_attr(module: torch.nn.Module):
+        def check_sliceable_dim_attr(module: torch.nn.Module):
             if hasattr(module, "set_attention_slice"):
                 assert isinstance(module.sliceable_head_dim, int)

             for child in module.children():
-                check_slicable_dim_attr(child)
+                check_sliceable_dim_attr(child)

         # retrieve number of attention layers
         for module in model.children():
-            check_slicable_dim_attr(module)
+            check_sliceable_dim_attr(module)

     def test_special_attn_proc(self):
         class AttnEasyProc(torch.nn.Module):
@@ -658,7 +658,7 @@ def test_set_attention_slice_list(self):
         torch.cuda.reset_max_memory_allocated()
         torch.cuda.reset_peak_memory_stats()

-        # there are 32 slicable layers
+        # there are 32 sliceable layers
         slice_list = 16 * [2, 3]
         unet = self.get_unet_model()
         unet.set_attention_slice(slice_list)