From f85d908b0428c389b58a65aba883795168cbdb42 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 26 Oct 2022 18:22:05 +0200 Subject: [PATCH 001/131] initial TokenEncoder and ContinuousEncoder --- .../spectrogram_diffusion/__init__.py | 2 + .../spectrogram_diffusion/modules.py | 148 ++++++++++++++++++ .../pipeline_spectrogram_diffusion.py | 0 3 files changed, 150 insertions(+) create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/__init__.py create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/modules.py create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py new file mode 100644 index 000000000000..53377210e7b3 --- /dev/null +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from .modules import TokenEncoder, ContinuousEncoder diff --git a/src/diffusers/pipelines/spectrogram_diffusion/modules.py b/src/diffusers/pipelines/spectrogram_diffusion/modules.py new file mode 100644 index 000000000000..b4ea211e4741 --- /dev/null +++ b/src/diffusers/pipelines/spectrogram_diffusion/modules.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn + +from transformers.models.t5.modeling_t5 import T5LayerNorm, T5Block + + +class TokenEncoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.token_embedder = nn.Embedding( + config.vocab_size, + config.d_model, + _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), + ) + + self.position_encoding = nn.Embedding( + config.max_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList([]) + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = 
torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"])) + + self.position_encoding = nn.Embedding( + config.targets_context_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList([]) + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def get_sequence_length(self, sequence): + # Return the first index where a 0 occurs. + length = torch.argmax(sequence == 0) + + # If argmax returns 0, that means that either + # 1) No 0s were found, and the sequence length is the full length of the array + # 2) There's padding immediately at the beginning, indicating that the array + # is all padding and the sequence length is 0. 
+ return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + seq_lens = self.get_sequence_length(encoder_inputs_mask) + input_positions = torch.roll(input_positions, seq_lens, dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py new file mode 100644 index 000000000000..e69de29bb2d1 From e02541020431a1fe8e6ca57480907726b53b2700 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 28 Oct 2022 18:28:36 +0200 Subject: [PATCH 002/131] initial modules --- .../spectrogram_diffusion/__init__.py | 2 +- .../spectrogram_diffusion/modules.py | 148 ----- .../pipeline_spectrogram_diffusion.py | 618 ++++++++++++++++++ 3 files changed, 619 insertions(+), 149 deletions(-) delete mode 100644 src/diffusers/pipelines/spectrogram_diffusion/modules.py diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 53377210e7b3..fb094f2380ca 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from .modules import TokenEncoder, ContinuousEncoder +from .pipeline_spectrogram_diffusion import ContinuousEncoder, Decoder, TokenEncoder diff --git a/src/diffusers/pipelines/spectrogram_diffusion/modules.py b/src/diffusers/pipelines/spectrogram_diffusion/modules.py deleted file mode 100644 index b4ea211e4741..000000000000 --- a/src/diffusers/pipelines/spectrogram_diffusion/modules.py +++ /dev/null @@ -1,148 +0,0 @@ -import torch -import torch.nn as nn - -from transformers.models.t5.modeling_t5 import T5LayerNorm, T5Block - - -class TokenEncoder(nn.Module): - def __init__(self, config, weights): - super().__init__() - - self.token_embedder = nn.Embedding( - config.vocab_size, - config.d_model, - _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), - ) - - self.position_encoding = nn.Embedding( - config.max_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_encoder_decoder = False - self.encoders = nn.ModuleList([]) - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - 
lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def forward(self, encoder_input_tokens, encoder_inputs_mask): - x = self.token_embedder(encoder_input_tokens) - - seq_length = encoder_input_tokens.shape[1] - inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) - x += self.position_encoding(inputs_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class ContinuousEncoder(nn.Module): - def __init__(self, config, weights): - super().__init__() - - self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) - self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"])) - - self.position_encoding = nn.Embedding( - config.targets_context_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_encoder_decoder = False - self.encoders = nn.ModuleList([]) - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def get_sequence_length(self, sequence): - # Return the first index where a 0 occurs. 
- length = torch.argmax(sequence == 0) - - # If argmax returns 0, that means that either - # 1) No 0s were found, and the sequence length is the full length of the array - # 2) There's padding immediately at the beginning, indicating that the array - # is all padding and the sequence length is 0. - return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) - - def forward(self, encoder_inputs, encoder_inputs_mask): - x = self.input_proj(encoder_inputs) - - # terminal relative positional encodings - max_positions = encoder_inputs.shape[1] - input_positions = torch.arange(max_positions, device=encoder_inputs.device) - seq_lens = self.get_sequence_length(encoder_inputs_mask) - input_positions = torch.roll(input_positions, seq_lens, dims=0) - x += self.position_encoding(input_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index e69de29bb2d1..43240b206a52 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -0,0 +1,618 @@ +import torch +import torch.nn as nn + +from diffusers.models.embeddings import get_timestep_embedding +from transformers.models.t5.modeling_t5 import T5Block, T5Config, T5LayerCrossAttention, T5LayerFF, T5LayerNorm + + +class FiLMLayer(nn.Module): + def __init__(self, in_features, out_features): + super().__init__() + self.scale_bias = nn.Linear(in_features, out_features * 2) + + def forward(self, x, conditioning_emb): + scale_bias = self.scale_bias(conditioning_emb) + scale, bias = torch.chunk(scale_bias, 2, -1) + return x * (scale + 1.0) + bias + + +class T5LayerSelfAttentionCond(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer_norm = T5LayerNorm(config.d_model) + self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + # pre_self_attention_layer_norm + normed_hidden_states = self.layer_norm(hidden_states) + + if conditioning_emb is not None: + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) + + # Self-attention block + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross 
attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class TokenEncoder(nn.Module): + def __init__(self, config: T5Config, weights): + super().__init__() + + self.token_embedder = nn.Embedding( + config.vocab_size, + config.d_model, + _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), + ) + + self.position_encoding = nn.Embedding( + config.max_length, + config.d_model, + 
_weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) + + self.position_encoding = nn.Embedding( + config.targets_context_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = 
nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def get_sequence_length(self, sequence): + # Return the first index where a 0 occurs. + length = torch.argmax(sequence == 0) + + # If argmax returns 0, that means that either + # 1) No 0s were found, and the sequence length is the full length of the array + # 2) There's padding immediately at the beginning, indicating that the array + # is all padding and the sequence length is 0. + return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + seq_lens = self.get_sequence_length(encoder_inputs_mask) + input_positions = torch.roll(input_positions, seq_lens, dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class Decoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.conditioning_emb = nn.Sequential( + nn.Linear(config.d_model, config.d_model * 4, bias=False), + nn.SiLU(), + nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), + nn.SiLU(), + ) + self.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) + self.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) + + self.position_encoding = nn.Embedding( + config.targets_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.continuous_inputs_projection = nn.Linear( + config.input_dims, + config.d_model, + ) + self.continuous_inputs_projection.weight = nn.Parameter( + torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) + ) + + self.dropout = nn.Dropout(p=config.dropout_rate) + + self.decoders = nn.ModuleList() + config.is_decoder = True + config.is_encoder_decoder = False + for lyr_num in range(config.num_decoder_layers): + # FiLM conditional T5 decoder + lyr = DecoderLayer(config) + ly_weight = weights[f"layers_{lyr_num}"] + + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + ) + + lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) + ) + + attention_weights = ly_weight["self_attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + 
lyr.layer[0].SelfAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + + attention_weights = ly_weight["MultiHeadDotProductAttention_0"] + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[1].EncDecAttention.k.weight = nn.Parameter( + torch.FloatTensor(attention_weights["key"]["kernel"].T) + ) + lyr.layer[1].EncDecAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[1].EncDecAttention.o.weight = nn.Parameter( + torch.FloatTensor(attention_weights["out"]["kernel"].T) + ) + + lyr.layer[1].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + ) + + lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + lyr.layer[3].scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) + ) + + lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + + self.decoders.append(lyr) + + self.decoder_norm = T5LayerNorm(config.d_model) + self.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + + self.post_dropout = nn.Dropout(p=config.dropout_rate) + self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) + self.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + + self.max_decoder_noise_time = config.max_decoder_noise_time + self.emb_dim = condig.d_model + + def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): + mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) + + return mask.unsqueeze(-3) + + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + batch, _, _ = decoder_input_tokens.shape + assert decoder_noise_time.shape == (batch,) + + # decoder_noise_time is in [0, 1), so rescale to expected timing range. + conditioning_emb = get_timestep_embedding( + decoder_noise_time * self.max_decoder_noise_time, + embedding_dim=self.emb_dim, + max_period=self.max_decoder_noise_time, + ) + + conditioning_emb = self.conditioning_emb(conditioning_emb) + + assert conditioning_emb.shape == (batch, 1, self.emb_dim * 4) + + seq_length = decoder_input_tokens.shape[1] + + # If we want to use relative positions for audio context, we can just offset + # this sequence by the length of encodings_and_masks. + decoder_positions = torch.broadcast_to( + torch.arange(seq_length, device=decoder_input_tokens.device), + (batch, seq_length), + ) + + position_encodings = self.position_encoding(decoder_positions) + + # decoder: No padding present. + decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) + + # Translate encoding masks to encoder-decoder masks. 
+ encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] + + inputs = self.continuous_inputs_projection(decoder_input_tokens) + + inputs += position_encodings + + inputs = self.dropout(inputs) + y = inputs + + for lyr in self.decoders: + y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb) + + y = self.decoder_norm(y) + y = self.post_dropout(y) + + spec_out = self.spec_out(y) + return spec_out From e88dc6fe6c9f76a0df44e52774da36d94a4e35a6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 28 Oct 2022 18:29:41 +0200 Subject: [PATCH 003/131] added ContinuousContextTransformer --- .../spectrogram_diffusion/__init__.py | 2 +- .../pipeline_spectrogram_diffusion.py | 52 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index fb094f2380ca..a404e61c1217 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import ContinuousEncoder, Decoder, TokenEncoder +from .pipeline_spectrogram_diffusion import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 43240b206a52..7f3faa8c8b3b 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -616,3 +616,55 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) spec_out = self.spec_out(y) return spec_out + + +class ContinuousContextTransformer(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.token_encoder = TokenEncoder(config=config, weights=weights) + self.continuous_encoder = ContinuousEncoder(config=config, weights=weights) + self.decoder = Decoder(config=config, weights=weights) + + def encode(self, input_tokens, continuous_inputs, continuous_mask): + tokens_mask = input_tokens > 0 + tokens_encoded, tokens_mask = self.token_encoder( + encoder_input_tokens=input_tokens, + encoder_inputs_mask=tokens_mask, + ) + + continuous_encoded, continuous_mask = self.continuous_encoder( + encoder_inputs=continuous_inputs, + encoder_inputs_mask=continuous_mask, + ) + + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] + + def decode(self, encodings_and_masks, input_tokens, noise_time): + logits = self.decoder( + encodings_and_masks=encodings_and_masks, + decoder_input_tokens=input_tokens, + decoder_noise_time=noise_time, + ) + return logits + + def forward( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + decoder_noise_time, + ): + + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + return self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=decoder_noise_time, + ) From 59e2111fbd9c07357e07b57e87dec32d9bc7f992 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 2 Nov 2022 15:15:21 +0300 Subject: [PATCH 004/131] fix copy paste error --- 
.../pipeline_spectrogram_diffusion.py | 145 ++---------------- 1 file changed, 12 insertions(+), 133 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 7f3faa8c8b3b..82764042b81d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -2,7 +2,14 @@ import torch.nn as nn from diffusers.models.embeddings import get_timestep_embedding -from transformers.models.t5.modeling_t5 import T5Block, T5Config, T5LayerCrossAttention, T5LayerFF, T5LayerNorm +from transformers.models.t5.modeling_t5 import ( + T5Attention, + T5Block, + T5Config, + T5LayerCrossAttention, + T5LayerFF, + T5LayerNorm, +) class FiLMLayer(nn.Module): @@ -184,134 +191,6 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class DecoderLayer(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer = nn.ModuleList() - - # cond self attention: layer 0 - self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) - - # cross attention: layer 1 - self.layer.append(T5LayerCrossAttention(config)) - - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - - if past_key_value is not None: - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - conditioning_emb=conditioning_emb, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - if encoder_hidden_states is not None: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - class TokenEncoder(nn.Module): def __init__(self, config: T5Config, weights): super().__init__() @@ -376,7 +255,7 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): x = self.dropout_pre(x) for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) + x = lyr(x, encoder_inputs_mask)[0] x = self.layer_norm(x) @@ -457,7 +336,7 @@ def 
forward(self, encoder_inputs, encoder_inputs_mask): x = self.dropout_pre(x) for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) + x = lyr(x, encoder_inputs_mask)[0] x = self.layer_norm(x) @@ -562,7 +441,7 @@ def __init__(self, config, weights): self.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) self.max_decoder_noise_time = config.max_decoder_noise_time - self.emb_dim = condig.d_model + self.emb_dim = config.d_model def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) @@ -609,7 +488,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) y = inputs for lyr in self.decoders: - y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb) + y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb)[0] y = self.decoder_norm(y) y = self.post_dropout(y) From ab829233087ba7e45fed12211521e48160c8a975 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 2 Nov 2022 21:57:32 +0300 Subject: [PATCH 005/131] use numpy for get_sequence_length --- .../pipeline_spectrogram_diffusion.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 82764042b81d..5179c0953b5f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,3 +1,5 @@ +import numpy as np + import torch import torch.nn as nn @@ -315,13 +317,13 @@ def __init__(self, config, weights): def get_sequence_length(self, sequence): # Return the first index where a 0 occurs. - length = torch.argmax(sequence == 0) + length = np.argmax(sequence == 0) # If argmax returns 0, that means that either # 1) No 0s were found, and the sequence length is the full length of the array # 2) There's padding immediately at the beginning, indicating that the array # is all padding and the sequence length is 0. 
- return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) + return np.where(length == 0 and sequence[0] != 0, sequence.shape[0], length).tolist() def forward(self, encoder_inputs, encoder_inputs_mask): x = self.input_proj(encoder_inputs) @@ -329,7 +331,7 @@ def forward(self, encoder_inputs, encoder_inputs_mask): # terminal relative positional encodings max_positions = encoder_inputs.shape[1] input_positions = torch.arange(max_positions, device=encoder_inputs.device) - seq_lens = self.get_sequence_length(encoder_inputs_mask) + seq_lens = self.get_sequence_length(encoder_inputs_mask.cpu().numpy()) input_positions = torch.roll(input_positions, seq_lens, dims=0) x += self.position_encoding(input_positions) From cdc6ec7eef389ea906d4c97e4e1b609bd815dc84 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 13:50:11 +0300 Subject: [PATCH 006/131] initial terminal relative positional encodings --- .../pipeline_spectrogram_diffusion.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 5179c0953b5f..042342a27261 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -315,24 +315,15 @@ def __init__(self, config, weights): self.dropout_post = nn.Dropout(p=config.dropout_rate) - def get_sequence_length(self, sequence): - # Return the first index where a 0 occurs. - length = np.argmax(sequence == 0) - - # If argmax returns 0, that means that either - # 1) No 0s were found, and the sequence length is the full length of the array - # 2) There's padding immediately at the beginning, indicating that the array - # is all padding and the sequence length is 0. 
- return np.where(length == 0 and sequence[0] != 0, sequence.shape[0], length).tolist() - def forward(self, encoder_inputs, encoder_inputs_mask): x = self.input_proj(encoder_inputs) # terminal relative positional encodings max_positions = encoder_inputs.shape[1] input_positions = torch.arange(max_positions, device=encoder_inputs.device) - seq_lens = self.get_sequence_length(encoder_inputs_mask.cpu().numpy()) - input_positions = torch.roll(input_positions, seq_lens, dims=0) + + seq_lens = encoder_inputs_mask.sum(-1) + input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) x += self.position_encoding(input_positions) x = self.dropout_pre(x) From c55fb5bb4a6945cd39f6f98738995689923a9605 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 14:22:44 +0300 Subject: [PATCH 007/131] fix weights keys --- .../pipeline_spectrogram_diffusion.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 042342a27261..52185f661a93 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -212,6 +212,7 @@ def __init__(self, config: T5Config, weights): self.dropout_pre = nn.Dropout(p=config.dropout_rate) + config.is_decoder = False config.is_encoder_decoder = False self.encoders = nn.ModuleList() for lyr_num in range(config.num_layers): @@ -280,6 +281,7 @@ def __init__(self, config, weights): self.dropout_pre = nn.Dropout(p=config.dropout_rate) + config.is_decoder = False config.is_encoder_decoder = False self.encoders = nn.ModuleList() for lyr_num in range(config.num_layers): @@ -366,9 +368,9 @@ def __init__(self, config, weights): self.dropout = nn.Dropout(p=config.dropout_rate) - self.decoders = nn.ModuleList() config.is_decoder = True config.is_encoder_decoder = False + self.decoders = nn.ModuleList() for lyr_num in range(config.num_decoder_layers): # FiLM conditional T5 decoder lyr = DecoderLayer(config) @@ -477,9 +479,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) inputs += position_encodings - inputs = self.dropout(inputs) - y = inputs - + y = self.dropout(inputs) for lyr in self.decoders: y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb)[0] @@ -494,9 +494,9 @@ class ContinuousContextTransformer(nn.Module): def __init__(self, config, weights): super().__init__() - self.token_encoder = TokenEncoder(config=config, weights=weights) - self.continuous_encoder = ContinuousEncoder(config=config, weights=weights) - self.decoder = Decoder(config=config, weights=weights) + self.token_encoder = TokenEncoder(config=config, weights=weights["token_encoder"]) + self.continuous_encoder = ContinuousEncoder(config=config, weights=weights["continuous_encoder"]) + self.decoder = Decoder(config=config, weights=weights["decoder"]) def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 From af673745d91ad31af510a91823fce588efac4ebe Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 16:16:38 +0300 Subject: [PATCH 008/131] fix assert --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py 
b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 52185f661a93..c760fbcfa6a1 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -456,7 +456,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) conditioning_emb = self.conditioning_emb(conditioning_emb) - assert conditioning_emb.shape == (batch, 1, self.emb_dim * 4) + assert conditioning_emb.shape == (batch, self.emb_dim * 4) seq_length = decoder_input_tokens.shape[1] From ef43fe0a0c893e78b92d61bad771c812e701288d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 16:54:23 +0300 Subject: [PATCH 009/131] cross attend style: concat encodings --- .../pipeline_spectrogram_diffusion.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c760fbcfa6a1..c589d2ce8194 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -481,7 +481,16 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) y = self.dropout(inputs) for lyr in self.decoders: - y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb)[0] + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + + y = lyr( + y, + conditioning_emb=conditioning_emb, + encoder_hidden_states=encoded, + encoder_attention_mask=encoder_decoder_mask, + )[0] y = self.decoder_norm(y) y = self.post_dropout(y) From 6de0cfb163c2b8a89e7b7bd6513e2c9b0cb8a267 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 19:21:40 +0300 Subject: [PATCH 010/131] make style --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c589d2ce8194..baa10984636c 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,5 +1,4 @@ import numpy as np - import torch import torch.nn as nn From 5546c121dad5998956c51629c12866f693ed1384 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 21:51:18 +0300 Subject: [PATCH 011/131] concat once --- .../pipeline_spectrogram_diffusion.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index baa10984636c..3e3c4feb16d8 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -479,11 +479,11 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) inputs += position_encodings y = self.dropout(inputs) - for lyr in self.decoders: - # cross attend style: concat encodings - encoded = 
torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) - encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + for lyr in self.decoders: y = lyr( y, conditioning_emb=conditioning_emb, From 8b32df3462ba1e5072dd914e297f7870d7159175 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 10:11:39 +0300 Subject: [PATCH 012/131] fix formatting --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 3e3c4feb16d8..48116424cde3 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -439,7 +439,6 @@ def __init__(self, config, weights): def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) - return mask.unsqueeze(-3) def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): From c69a3b902de252a9391217b0be5cf98b657228a1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 11:03:32 +0300 Subject: [PATCH 013/131] Initial SpectrogramPipeline --- .../pipeline_spectrogram_diffusion.py | 73 ++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 48116424cde3..83c24fefbe54 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,8 +1,9 @@ -import numpy as np +from typing import Optional +import math + import torch import torch.nn as nn -from diffusers.models.embeddings import get_timestep_embedding from transformers.models.t5.modeling_t5 import ( T5Attention, T5Block, @@ -12,6 +13,10 @@ T5LayerNorm, ) +from ...models.embeddings import get_timestep_embedding +from ...pipeline_utils import DiffusionPipeline +from ...schedulers import DDPMScheduler + class FiLMLayer(nn.Module): def __init__(self, in_features, out_features): @@ -547,3 +552,67 @@ def forward( input_tokens=decoder_input_tokens, noise_time=decoder_noise_time, ) + + +class SpectrogramPipeline(DiffusionPipeline): + def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: + super().__init__() + + # From MELGAN + self.min_value = math.log(1e-5) # Matches MelGAN training. + self.max_value = 4.0 # Largest value for most examples. + + self.register_modules(cont_context_trans=cont_context_trans, scheduler=scheduler) + + def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): + """Linearly scale features to network outputs range.""" + min_out, max_out = output_range + if clip: + features = torch.clip(features, self.min_value, self.max_value) + # Scale to [0, 1]. + zero_one = (features - self.min_value) / (self.max_value - self.min_value) + # Scale to [min_out, max_out]. 
+ return zero_one * (max_out - min_out) + min_out + + @torch.no_grad() + def __call__( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + generator: Optional[torch.Generator] = None, + num_inference_steps: int = 1000, + return_dict: bool = True, + predict_epsilon: bool = True, + **kwargs, + ): + target_shape = encoder_continuous_inputs.shape + encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) + + encodings_and_masks = self.cont_context_trans.encode( + encoder_input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + # Sample gaussian noise to begin loop + x = torch.randn(target_shape, generator=generator) + x = x.to(self.device) + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + + for t in self.progress_bar(self.scheduler.timesteps): + output = self.cont_context_trans.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=t, + ) + + # 2. compute previous output: x_t -> x_t-1 + x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample + + decode = self.scale_to_features(x, input_range=[-1.0, 1.0]) + + return decode From f7254db3f17031abf9c8b77217d049d5a0aa773f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 11:07:41 +0300 Subject: [PATCH 014/131] fix input_tokens --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 83c24fefbe54..5bdf0c341ad0 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -580,7 +580,6 @@ def __call__( encoder_input_tokens, encoder_continuous_inputs, encoder_continuous_mask, - decoder_input_tokens, generator: Optional[torch.Generator] = None, num_inference_steps: int = 1000, return_dict: bool = True, @@ -606,7 +605,7 @@ def __call__( for t in self.progress_bar(self.scheduler.timesteps): output = self.cont_context_trans.decode( encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, + input_tokens=x, noise_time=t, ) From 133d155b343caff53fe449a903301dbf3012eeb2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 11:36:08 +0300 Subject: [PATCH 015/131] make style --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 5bdf0c341ad0..e12a1b0b8ceb 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,5 +1,5 @@ -from typing import Optional import math +from typing import Optional import torch import torch.nn as nn @@ -105,7 +105,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 @@ -540,7 +539,6 @@ def forward( decoder_input_tokens, decoder_noise_time, ): - 
encodings_and_masks = self.encode( input_tokens=encoder_input_tokens, continuous_inputs=encoder_continuous_inputs, From aa2323f06e9348ab3f46517392ec0cb2e05c8478 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 16:41:00 +0100 Subject: [PATCH 016/131] added mel output --- src/diffusers/pipeline_utils.py | 14 ++++++++++++++ .../pipeline_spectrogram_diffusion.py | 16 +++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 36c2d5b888ef..40a7924dad90 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -109,6 +109,20 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray +@dataclass +class MelPipelineOutput(BaseOutput): + """ + Output class for Mel pipelines. + + Args: + mels (`np.ndarray`) + List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the + denoised mel samples of the diffusion pipeline. + """ + + mels: np.ndarray + + class DiffusionPipeline(ConfigMixin): r""" Base class for all models. diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index e12a1b0b8ceb..c888819daae5 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -13,8 +13,10 @@ T5LayerNorm, ) +from ...configuration_utils import ConfigMixin, register_to_config +from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding -from ...pipeline_utils import DiffusionPipeline +from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler @@ -196,7 +198,8 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class TokenEncoder(nn.Module): +class TokenEncoder(ModelMixin, ConfigMixin): + @register_to_config def __init__(self, config: T5Config, weights): super().__init__() @@ -558,7 +561,7 @@ def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: # From MELGAN self.min_value = math.log(1e-5) # Matches MelGAN training. - self.max_value = 4.0 # Largest value for most examples. + self.max_value = 4.0 # Largest value for most examples self.register_modules(cont_context_trans=cont_context_trans, scheduler=scheduler) @@ -610,6 +613,9 @@ def __call__( # 2. 
compute previous output: x_t -> x_t-1 x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample - decode = self.scale_to_features(x, input_range=[-1.0, 1.0]) + mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) - return decode + if not return_dict: + return (mel,) + + return MelPipelineOutput(mels=mel) From c154878fd2888f755b52cc9110aab6dbfc26e8ef Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:04:45 +0100 Subject: [PATCH 017/131] ignore weights for config --- .../pipeline_spectrogram_diffusion.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c888819daae5..34d58aa706ad 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -199,6 +199,8 @@ def forward( class TokenEncoder(ModelMixin, ConfigMixin): + ignore_for_config = ["weights"] + @register_to_config def __init__(self, config: T5Config, weights): super().__init__() @@ -271,7 +273,8 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class ContinuousEncoder(nn.Module): +class ContinuousEncoder(ModelMixin, ConfigMixin): + @register_to_config def __init__(self, config, weights): super().__init__() @@ -344,8 +347,11 @@ def forward(self, encoder_inputs, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class Decoder(nn.Module): - def __init__(self, config, weights): +class Decoder(ModelMixin, ConfigMixin): + ignore_for_config = ["weights"] + + @register_to_config + def __init__(self, config: T5Config, weights): super().__init__() self.conditioning_emb = nn.Sequential( @@ -504,8 +510,11 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) return spec_out -class ContinuousContextTransformer(nn.Module): - def __init__(self, config, weights): +class ContinuousContextTransformer(ModelMixin, ConfigMixin): + ignore_for_config = ["weights"] + + @register_to_config + def __init__(self, config: T5Config, weights): super().__init__() self.token_encoder = TokenEncoder(config=config, weights=weights["token_encoder"]) From 63f69b6cb99cf21e51ef5db41cc1dd9aa682cc30 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:09:13 +0100 Subject: [PATCH 018/131] move mel to numpy --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 34d58aa706ad..0dd239e4a69f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -623,6 +623,7 @@ def __call__( x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) + mel = mel.cpu().numpy() if not return_dict: return (mel,) From 9808d06642b94ea4e6aba780437abef07b6133c8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:16:19 +0100 Subject: [PATCH 019/131] import pipeline --- src/diffusers/pipelines/__init__.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index bb3440b2bfbc..7f474a8b9774 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -10,6 +10,7 @@ from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline from .stochastic_karras_ve import KarrasVePipeline + from .spectrogram_diffusion import SpectrogramDiffusionPipeline else: from ..utils.dummy_pt_objects import * # noqa F403 From 49d95c0ce3cba464cdefd7262d358d85d04611de Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:23:55 +0100 Subject: [PATCH 020/131] fix class names and import --- src/diffusers/pipelines/spectrogram_diffusion/__init__.py | 8 +++++++- .../pipeline_spectrogram_diffusion.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index a404e61c1217..85230f5e95d0 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,8 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder +from .pipeline_spectrogram_diffusion import ( + SpectrogramDiffusionPipeline, + ContinuousContextTransformer, + ContinuousEncoder, + Decoder, + TokenEncoder, +) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 0dd239e4a69f..ad3c7d0a545a 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -564,7 +564,7 @@ def forward( ) -class SpectrogramPipeline(DiffusionPipeline): +class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: super().__init__() From ce4a6582ccc9130cb4c0486f7ed4d8545e914869 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 09:20:39 +0100 Subject: [PATCH 021/131] moved models to models folder --- src/diffusers/models/__init__.py | 2 + src/diffusers/models/film.py | 27 + src/diffusers/models/t5_attention.py | 424 +++++++++++++ src/diffusers/pipeline_utils.py | 4 +- src/diffusers/pipelines/__init__.py | 2 +- .../spectrogram_diffusion/__init__.py | 8 +- .../pipeline_spectrogram_diffusion.py | 558 +----------------- 7 files changed, 458 insertions(+), 567 deletions(-) create mode 100644 src/diffusers/models/film.py create mode 100644 src/diffusers/models/t5_attention.py diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 5b101d169148..c3d524eddebc 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -17,6 +17,8 @@ if is_torch_available(): from .attention import Transformer2DModel + from .film import FiLMLayer + from .t5_attention import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder from .unet_1d import UNet1DModel from .unet_2d import UNet2DModel from .unet_2d_condition import UNet2DConditionModel diff --git a/src/diffusers/models/film.py b/src/diffusers/models/film.py new file mode 100644 index 000000000000..8936fd32276d --- /dev/null +++ b/src/diffusers/models/film.py @@ -0,0 +1,27 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn + + +class FiLMLayer(nn.Module): + def __init__(self, in_features, out_features): + super().__init__() + self.scale_bias = nn.Linear(in_features, out_features * 2) + + def forward(self, x, conditioning_emb): + scale_bias = self.scale_bias(conditioning_emb) + scale, bias = torch.chunk(scale_bias, 2, -1) + return x * (scale + 1.0) + bias diff --git a/src/diffusers/models/t5_attention.py b/src/diffusers/models/t5_attention.py new file mode 100644 index 000000000000..7d588a3113c4 --- /dev/null +++ b/src/diffusers/models/t5_attention.py @@ -0,0 +1,424 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn + +from transformers.models.t5.modeling_t5 import ( + T5Attention, + T5Block, + T5Config, + T5LayerCrossAttention, + T5LayerFF, + T5LayerNorm, +) + +from ..configuration_utils import ConfigMixin, register_to_config +from ..modeling_utils import ModelMixin +from .embeddings import get_timestep_embedding +from .film import FiLMLayer + + +class T5LayerSelfAttentionCond(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer_norm = T5LayerNorm(config.d_model) + self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + # pre_self_attention_layer_norm + normed_hidden_states = self.layer_norm(hidden_states) + + if conditioning_emb is not None: + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) + + # Self-attention block + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] + return outputs + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond 
self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class TokenEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config: T5Config): + super().__init__() + + self.token_embedder = nn.Embedding(config.vocab_size, config.d_model) + + self.position_encoding = nn.Embedding(config.max_length, config.d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_decoder = False + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config): + super().__init__() + + self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + + self.position_encoding = nn.Embedding(config.targets_context_length, config.d_model) + self.position_encoding.weight.requires_grad = False + + 
self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_decoder = False + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + + seq_lens = encoder_inputs_mask.sum(-1) + input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class Decoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config: T5Config): + super().__init__() + + self.conditioning_emb = nn.Sequential( + nn.Linear(config.d_model, config.d_model * 4, bias=False), + nn.SiLU(), + nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), + nn.SiLU(), + ) + + self.position_encoding = nn.Embedding(config.targets_length, config.d_model) + self.position_encoding.weight.requires_grad = False + + self.continuous_inputs_projection = nn.Linear(config.input_dims, config.d_model) + + self.dropout = nn.Dropout(p=config.dropout_rate) + + config.is_decoder = True + config.is_encoder_decoder = False + self.decoders = nn.ModuleList() + for lyr_num in range(config.num_decoder_layers): + # FiLM conditional T5 decoder + lyr = DecoderLayer(config) + self.decoders.append(lyr) + + self.decoder_norm = T5LayerNorm(config.d_model) + + self.post_dropout = nn.Dropout(p=config.dropout_rate) + self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) + + self.max_decoder_noise_time = config.max_decoder_noise_time + self.emb_dim = config.d_model + + def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): + mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) + return mask.unsqueeze(-3) + + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + batch, _, _ = decoder_input_tokens.shape + assert decoder_noise_time.shape == (batch,) + + # decoder_noise_time is in [0, 1), so rescale to expected timing range. + conditioning_emb = get_timestep_embedding( + decoder_noise_time * self.max_decoder_noise_time, + embedding_dim=self.emb_dim, + max_period=self.max_decoder_noise_time, + ) + + conditioning_emb = self.conditioning_emb(conditioning_emb) + + assert conditioning_emb.shape == (batch, self.emb_dim * 4) + + seq_length = decoder_input_tokens.shape[1] + + # If we want to use relative positions for audio context, we can just offset + # this sequence by the length of encodings_and_masks. + decoder_positions = torch.broadcast_to( + torch.arange(seq_length, device=decoder_input_tokens.device), + (batch, seq_length), + ) + + position_encodings = self.position_encoding(decoder_positions) + + # decoder: No padding present. + decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) + + # Translate encoding masks to encoder-decoder masks. 
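# Not part of the patch: a small, runnable shape check for the mask translation above and the
# "concat encodings" cross-attention that follows. Batch/length/width values are arbitrary
# placeholders, not the model's real configuration.
import torch

batch, d_model = 2, 16
target_len, tokens_len, context_len = 5, 7, 3

decoder_mask = torch.ones(batch, target_len)
tokens = (torch.randn(batch, tokens_len, d_model), torch.ones(batch, tokens_len))
continuous = (torch.randn(batch, context_len, d_model), torch.ones(batch, context_len))

def encoder_decoder_mask(query_input, key_input):
    # outer product of two (batch, length) masks, plus a broadcastable head axis
    return torch.mul(query_input.unsqueeze(-1), key_input.unsqueeze(-2)).unsqueeze(-3)

pairs = [(x, encoder_decoder_mask(decoder_mask, y)) for x, y in (tokens, continuous)]
encoded = torch.cat([x for x, _ in pairs], dim=1)       # torch.Size([2, 10, 16])
encdec_mask = torch.cat([m for _, m in pairs], dim=-1)  # torch.Size([2, 1, 5, 10])
print(encoded.shape, encdec_mask.shape)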
+ encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] + + inputs = self.continuous_inputs_projection(decoder_input_tokens) + + inputs += position_encodings + + y = self.dropout(inputs) + + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + for lyr in self.decoders: + y = lyr( + y, + conditioning_emb=conditioning_emb, + encoder_hidden_states=encoded, + encoder_attention_mask=encoder_decoder_mask, + )[0] + + y = self.decoder_norm(y) + y = self.post_dropout(y) + + spec_out = self.spec_out(y) + return spec_out + + +class ContinuousContextTransformer(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config: T5Config): + super().__init__() + + self.token_encoder = TokenEncoder(config=config) + self.continuous_encoder = ContinuousEncoder(config=config) + self.decoder = Decoder(config=config) + + def encode(self, input_tokens, continuous_inputs, continuous_mask): + tokens_mask = input_tokens > 0 + tokens_encoded, tokens_mask = self.token_encoder( + encoder_input_tokens=input_tokens, + encoder_inputs_mask=tokens_mask, + ) + + continuous_encoded, continuous_mask = self.continuous_encoder( + encoder_inputs=continuous_inputs, + encoder_inputs_mask=continuous_mask, + ) + + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] + + def decode(self, encodings_and_masks, input_tokens, noise_time): + logits = self.decoder( + encodings_and_masks=encodings_and_masks, + decoder_input_tokens=input_tokens, + decoder_noise_time=noise_time, + ) + return logits + + def forward( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + decoder_noise_time, + ): + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + return self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=decoder_noise_time, + ) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 40a7924dad90..ca08c1b36ce3 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -116,8 +116,8 @@ class MelPipelineOutput(BaseOutput): Args: mels (`np.ndarray`) - List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the - denoised mel samples of the diffusion pipeline. + List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel + samples of the diffusion pipeline. 
""" mels: np.ndarray diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 7f474a8b9774..4b63ab8af277 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -9,8 +9,8 @@ from .pndm import PNDMPipeline from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline - from .stochastic_karras_ve import KarrasVePipeline from .spectrogram_diffusion import SpectrogramDiffusionPipeline + from .stochastic_karras_ve import KarrasVePipeline else: from ..utils.dummy_pt_objects import * # noqa F403 diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 85230f5e95d0..de37e892a7db 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,8 +1,2 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import ( - SpectrogramDiffusionPipeline, - ContinuousContextTransformer, - ContinuousEncoder, - Decoder, - TokenEncoder, -) +from .pipeline_spectrogram_diffusion import SpectrogramDiffusionPipeline diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index ad3c7d0a545a..69f1f092febf 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -2,568 +2,12 @@ from typing import Optional import torch -import torch.nn as nn -from transformers.models.t5.modeling_t5 import ( - T5Attention, - T5Block, - T5Config, - T5LayerCrossAttention, - T5LayerFF, - T5LayerNorm, -) - -from ...configuration_utils import ConfigMixin, register_to_config -from ...modeling_utils import ModelMixin -from ...models.embeddings import get_timestep_embedding +from ...models.t5_attention import ContinuousContextTransformer from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler -class FiLMLayer(nn.Module): - def __init__(self, in_features, out_features): - super().__init__() - self.scale_bias = nn.Linear(in_features, out_features * 2) - - def forward(self, x, conditioning_emb): - scale_bias = self.scale_bias(conditioning_emb) - scale, bias = torch.chunk(scale_bias, 2, -1) - return x * (scale + 1.0) + bias - - -class T5LayerSelfAttentionCond(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer_norm = T5LayerNorm(config.d_model) - self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) - self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - ): - # pre_self_attention_layer_norm - normed_hidden_states = self.layer_norm(hidden_states) - - if conditioning_emb is not None: - normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) - - # Self-attention block - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - 
output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them - return outputs - - -class DecoderLayer(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer = nn.ModuleList() - - # cond self attention: layer 0 - self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) - - # cross attention: layer 1 - self.layer.append(T5LayerCrossAttention(config)) - - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - if past_key_value is not None: - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - conditioning_emb=conditioning_emb, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - if encoder_hidden_states is not None: - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -class TokenEncoder(ModelMixin, ConfigMixin): - ignore_for_config = ["weights"] - - @register_to_config - def __init__(self, config: T5Config, weights): - super().__init__() - - self.token_embedder = nn.Embedding( - config.vocab_size, - config.d_model, - _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), - ) - - self.position_encoding = nn.Embedding( - config.max_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_decoder = False - config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - 
torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def forward(self, encoder_input_tokens, encoder_inputs_mask): - x = self.token_embedder(encoder_input_tokens) - - seq_length = encoder_input_tokens.shape[1] - inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) - x += self.position_encoding(inputs_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class ContinuousEncoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, config, weights): - super().__init__() - - self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) - self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) - - self.position_encoding = nn.Embedding( - config.targets_context_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_decoder = False - config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def forward(self, encoder_inputs, encoder_inputs_mask): - x = self.input_proj(encoder_inputs) - - # terminal relative positional encodings - max_positions = encoder_inputs.shape[1] - input_positions = torch.arange(max_positions, device=encoder_inputs.device) - - seq_lens = encoder_inputs_mask.sum(-1) - input_positions = 
torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) - x += self.position_encoding(input_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class Decoder(ModelMixin, ConfigMixin): - ignore_for_config = ["weights"] - - @register_to_config - def __init__(self, config: T5Config, weights): - super().__init__() - - self.conditioning_emb = nn.Sequential( - nn.Linear(config.d_model, config.d_model * 4, bias=False), - nn.SiLU(), - nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), - nn.SiLU(), - ) - self.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) - self.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) - - self.position_encoding = nn.Embedding( - config.targets_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.continuous_inputs_projection = nn.Linear( - config.input_dims, - config.d_model, - ) - self.continuous_inputs_projection.weight = nn.Parameter( - torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) - ) - - self.dropout = nn.Dropout(p=config.dropout_rate) - - config.is_decoder = True - config.is_encoder_decoder = False - self.decoders = nn.ModuleList() - for lyr_num in range(config.num_decoder_layers): - # FiLM conditional T5 decoder - lyr = DecoderLayer(config) - ly_weight = weights[f"layers_{lyr_num}"] - - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) - ) - - lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) - ) - - attention_weights = ly_weight["self_attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - - attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[1].EncDecAttention.k.weight = nn.Parameter( - torch.FloatTensor(attention_weights["key"]["kernel"].T) - ) - lyr.layer[1].EncDecAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[1].EncDecAttention.o.weight = nn.Parameter( - torch.FloatTensor(attention_weights["out"]["kernel"].T) - ) - - lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) - ) - - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - lyr.layer[3].scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) - ) - - lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - 
lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - - self.decoders.append(lyr) - - self.decoder_norm = T5LayerNorm(config.d_model) - self.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) - - self.post_dropout = nn.Dropout(p=config.dropout_rate) - self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) - self.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) - - self.max_decoder_noise_time = config.max_decoder_noise_time - self.emb_dim = config.d_model - - def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): - mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) - return mask.unsqueeze(-3) - - def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): - batch, _, _ = decoder_input_tokens.shape - assert decoder_noise_time.shape == (batch,) - - # decoder_noise_time is in [0, 1), so rescale to expected timing range. - conditioning_emb = get_timestep_embedding( - decoder_noise_time * self.max_decoder_noise_time, - embedding_dim=self.emb_dim, - max_period=self.max_decoder_noise_time, - ) - - conditioning_emb = self.conditioning_emb(conditioning_emb) - - assert conditioning_emb.shape == (batch, self.emb_dim * 4) - - seq_length = decoder_input_tokens.shape[1] - - # If we want to use relative positions for audio context, we can just offset - # this sequence by the length of encodings_and_masks. - decoder_positions = torch.broadcast_to( - torch.arange(seq_length, device=decoder_input_tokens.device), - (batch, seq_length), - ) - - position_encodings = self.position_encoding(decoder_positions) - - # decoder: No padding present. - decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) - - # Translate encoding masks to encoder-decoder masks. 
- encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - - inputs = self.continuous_inputs_projection(decoder_input_tokens) - - inputs += position_encodings - - y = self.dropout(inputs) - - # cross attend style: concat encodings - encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) - encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) - for lyr in self.decoders: - y = lyr( - y, - conditioning_emb=conditioning_emb, - encoder_hidden_states=encoded, - encoder_attention_mask=encoder_decoder_mask, - )[0] - - y = self.decoder_norm(y) - y = self.post_dropout(y) - - spec_out = self.spec_out(y) - return spec_out - - -class ContinuousContextTransformer(ModelMixin, ConfigMixin): - ignore_for_config = ["weights"] - - @register_to_config - def __init__(self, config: T5Config, weights): - super().__init__() - - self.token_encoder = TokenEncoder(config=config, weights=weights["token_encoder"]) - self.continuous_encoder = ContinuousEncoder(config=config, weights=weights["continuous_encoder"]) - self.decoder = Decoder(config=config, weights=weights["decoder"]) - - def encode(self, input_tokens, continuous_inputs, continuous_mask): - tokens_mask = input_tokens > 0 - tokens_encoded, tokens_mask = self.token_encoder( - encoder_input_tokens=input_tokens, - encoder_inputs_mask=tokens_mask, - ) - - continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs, - encoder_inputs_mask=continuous_mask, - ) - - return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] - - def decode(self, encodings_and_masks, input_tokens, noise_time): - logits = self.decoder( - encodings_and_masks=encodings_and_masks, - decoder_input_tokens=input_tokens, - decoder_noise_time=noise_time, - ) - return logits - - def forward( - self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, - decoder_input_tokens, - decoder_noise_time, - ): - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, - ) - - return self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, - noise_time=decoder_noise_time, - ) - - class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: super().__init__() From b3caf357cbfd9eb199eb05232e2f8b0a73555084 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 09:54:08 +0100 Subject: [PATCH 022/131] import ContinuousContextTransformer and SpectrogramDiffusionPipeline --- src/diffusers/__init__.py | 11 ++++++- .../dummy_torch_and_accelerate_objects.py | 30 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 22b6589973a0..02521e8b6fd9 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -26,7 +26,15 @@ if is_torch_available(): from .modeling_utils import ModelMixin - from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel + from .models import ( + AutoencoderKL, + ContinuousContextTransformer, + Transformer2DModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + VQModel, + ) from .optimization import ( get_constant_schedule, get_constant_schedule_with_warmup, @@ -46,6 +54,7 @@ PNDMPipeline, RePaintPipeline, 
ScoreSdeVePipeline, + SpectrogramDiffusionPipeline, ) from .schedulers import ( DDIMScheduler, diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py index 335e3ca24d2a..e2a2046969fe 100644 --- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py +++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py @@ -34,6 +34,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) +class ContinuousContextTransformer(metaclass=DummyObject): + _backends = ["torch", "accelerate"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "accelerate"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + class Transformer2DModel(metaclass=DummyObject): _backends = ["torch", "accelerate"] @@ -272,6 +287,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) +class SpectrogramDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "accelerate"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "accelerate"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + class DDIMScheduler(metaclass=DummyObject): _backends = ["torch", "accelerate"] From 593e2aa070621a54ef27b4b258d2670df539cdd1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 09:57:57 +0100 Subject: [PATCH 023/131] initial spec diffusion converstion script --- .../convert_music_spectrogram_to_diffusers.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 scripts/convert_music_spectrogram_to_diffusers.py diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py new file mode 100644 index 000000000000..1b2bdd2fb3bd --- /dev/null +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import argparse +import os + +import jax +import tensorflow as tf + +from t5x import checkpoints +from music_spectrogram_diffusion import inference + +from transformers import T5Config + +from diffusers import DDPMScheduler, ContinuousContextTransformer, SpectrogramDiffusionPipeline + +MODEL = "base_with_context" + + +def main(args): + t5_checkpoint = checkpoints.load_t5x_checkpoint(args.checkpoint_path) + + gin_overrides = [ + "from __gin__ import dynamic_registration", + "from music_spectrogram_diffusion.models.diffusion import diffusion_utils", + "diffusion_utils.ClassifierFreeGuidanceConfig.eval_condition_weight = 2.0", + "diffusion_utils.DiffusionConfig.classifier_free_guidance = @diffusion_utils.ClassifierFreeGuidanceConfig()", + ] + + gin_file = os.path.join(args.checkpoint_path, "..", "config.gin") + gin_config = inference.parse_training_gin_file(gin_file, gin_overrides) + synth_model = inference.InferenceModel(args.checkpoint_path, gin_config) + + t5config = T5Config( + vocab_size=synth_model.model.module.config.vocab_size, + max_length=synth_model.sequence_length["inputs"], + input_dims=synth_model.audio_codec.n_dims, + targets_context_length=synth_model.sequence_length["targets_context"], + targets_length=synth_model.sequence_length["targets"], + 
d_model=synth_model.model.module.config.emb_dim, + num_heads=synth_model.model.module.config.num_heads, + num_layers=synth_model.model.module.config.num_encoder_layers, + num_decoder_layers=synth_model.model.module.config.num_decoder_layers, + d_kv=synth_model.model.module.config.head_dim, + d_ff=synth_model.model.module.config.mlp_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + feed_forward_proj=synth_model.model.module.config.mlp_activations[0], + is_gated_act=True, + max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, + ) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") + model = ContinuousContextTransformer(config=t5config) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") + parser.add_argument( + "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." + ) + parser.add_argument( + "--checkpoint_path", + default=f"{MODEL}/checkpoint_500000", + type=str, + required=True, + help="Path to the original jax model checkpoint.", + ) + args = parser.parse_args() + + main(args) From c7077995a4bbf171d03ab13ebde5569fdd2fd383 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 11:02:51 +0100 Subject: [PATCH 024/131] renamed config to t5config --- .../convert_music_spectrogram_to_diffusers.py | 14 ++-- src/diffusers/models/t5_attention.py | 83 +++++++++---------- 2 files changed, 47 insertions(+), 50 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 1b2bdd2fb3bd..6810a9bfad6a 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -4,10 +4,10 @@ import jax import tensorflow as tf +import torch from t5x import checkpoints from music_spectrogram_diffusion import inference - from transformers import T5Config from diffusers import DDPMScheduler, ContinuousContextTransformer, SpectrogramDiffusionPipeline @@ -49,21 +49,21 @@ def main(args): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") - model = ContinuousContextTransformer(config=t5config) + model = ContinuousContextTransformer(t5config=t5config) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") - parser.add_argument( - "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." - ) + # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") + # parser.add_argument( + # "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." 
+ # ) parser.add_argument( "--checkpoint_path", default=f"{MODEL}/checkpoint_500000", type=str, - required=True, + required=False, help="Path to the original jax model checkpoint.", ) args = parser.parse_args() diff --git a/src/diffusers/models/t5_attention.py b/src/diffusers/models/t5_attention.py index 7d588a3113c4..cb4617bf188c 100644 --- a/src/diffusers/models/t5_attention.py +++ b/src/diffusers/models/t5_attention.py @@ -199,25 +199,25 @@ def forward( class TokenEncoder(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config: T5Config): + def __init__(self, t5config: T5Config): super().__init__() - self.token_embedder = nn.Embedding(config.vocab_size, config.d_model) + self.token_embedder = nn.Embedding(t5config.vocab_size, t5config.d_model) - self.position_encoding = nn.Embedding(config.max_length, config.d_model) + self.position_encoding = nn.Embedding(t5config.max_length, t5config.d_model) self.position_encoding.weight.requires_grad = False - self.dropout_pre = nn.Dropout(p=config.dropout_rate) + self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - config.is_decoder = False - config.is_encoder_decoder = False + t5config.is_decoder = False + t5config.is_encoder_decoder = False self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) + for lyr_num in range(t5config.num_layers): + lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.dropout_post = nn.Dropout(p=config.dropout_rate) + self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) + self.dropout_post = nn.Dropout(p=t5config.dropout_rate) def forward(self, encoder_input_tokens, encoder_inputs_mask): x = self.token_embedder(encoder_input_tokens) @@ -237,25 +237,25 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): class ContinuousEncoder(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config): + def __init__(self, t5config): super().__init__() - self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + self.input_proj = nn.Linear(t5config.input_dims, t5config.d_model, bias=False) - self.position_encoding = nn.Embedding(config.targets_context_length, config.d_model) + self.position_encoding = nn.Embedding(t5config.targets_context_length, t5config.d_model) self.position_encoding.weight.requires_grad = False - self.dropout_pre = nn.Dropout(p=config.dropout_rate) + self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - config.is_decoder = False - config.is_encoder_decoder = False + t5config.is_decoder = False + t5config.is_encoder_decoder = False self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) + for lyr_num in range(t5config.num_layers): + lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.dropout_post = nn.Dropout(p=config.dropout_rate) + self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) + self.dropout_post = nn.Dropout(p=t5config.dropout_rate) def forward(self, encoder_inputs, encoder_inputs_mask): x = self.input_proj(encoder_inputs) @@ -279,38 +279,35 @@ def forward(self, encoder_inputs, encoder_inputs_mask): class Decoder(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config: T5Config): + def __init__(self, t5config: T5Config): super().__init__() self.conditioning_emb = nn.Sequential( - nn.Linear(config.d_model, config.d_model * 4, bias=False), + nn.Linear(t5config.d_model, t5config.d_model * 4, 
bias=False), nn.SiLU(), - nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), + nn.Linear(t5config.d_model * 4, t5config.d_model * 4, bias=False), nn.SiLU(), ) - self.position_encoding = nn.Embedding(config.targets_length, config.d_model) + self.position_encoding = nn.Embedding(t5config.targets_length, t5config.d_model) self.position_encoding.weight.requires_grad = False - self.continuous_inputs_projection = nn.Linear(config.input_dims, config.d_model) + self.continuous_inputs_projection = nn.Linear(t5config.input_dims, t5config.d_model) - self.dropout = nn.Dropout(p=config.dropout_rate) + self.dropout = nn.Dropout(p=t5config.dropout_rate) - config.is_decoder = True - config.is_encoder_decoder = False + t5config.is_decoder = True + t5config.is_encoder_decoder = False self.decoders = nn.ModuleList() - for lyr_num in range(config.num_decoder_layers): + for lyr_num in range(t5config.num_decoder_layers): # FiLM conditional T5 decoder - lyr = DecoderLayer(config) + lyr = DecoderLayer(t5config) self.decoders.append(lyr) - self.decoder_norm = T5LayerNorm(config.d_model) + self.decoder_norm = T5LayerNorm(t5config.d_model) - self.post_dropout = nn.Dropout(p=config.dropout_rate) - self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) - - self.max_decoder_noise_time = config.max_decoder_noise_time - self.emb_dim = config.d_model + self.post_dropout = nn.Dropout(p=t5config.dropout_rate) + self.spec_out = nn.Linear(t5config.d_model, t5config.input_dims, bias=False) def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) @@ -322,14 +319,14 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) # decoder_noise_time is in [0, 1), so rescale to expected timing range. 
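# For intuition: the rescaled noise time below is turned into a sinusoidal embedding
# before being fed to the conditioning MLP. A self-contained sketch of that technique
# (not the diffusers get_timestep_embedding implementation; the 2000.0 rescale factor
# and the 768 width are assumed values for illustration only):
import math
import torch

def sinusoidal_embedding(timesteps: torch.Tensor, embedding_dim: int, max_period: float = 10000.0) -> torch.Tensor:
    half_dim = embedding_dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half_dim, dtype=torch.float32) / half_dim)
    angles = timesteps.float()[:, None] * freqs[None, :]
    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

noise_time = torch.rand(4)                                          # batch of times in [0, 1)
emb = sinusoidal_embedding(noise_time * 2000.0, embedding_dim=768)  # rescaled by an assumed max_decoder_noise_time
print(emb.shape)                                                    # torch.Size([4, 768])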
conditioning_emb = get_timestep_embedding( - decoder_noise_time * self.max_decoder_noise_time, - embedding_dim=self.emb_dim, + decoder_noise_time * self.config.t5config.max_decoder_noise_time, + embedding_dim=self.config.t5config.d_model, max_period=self.max_decoder_noise_time, ) conditioning_emb = self.conditioning_emb(conditioning_emb) - assert conditioning_emb.shape == (batch, self.emb_dim * 4) + assert conditioning_emb.shape == (batch, self.config.t5config.d_model * 4) seq_length = decoder_input_tokens.shape[1] @@ -374,12 +371,12 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) class ContinuousContextTransformer(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config: T5Config): + def __init__(self, t5config: T5Config): super().__init__() - self.token_encoder = TokenEncoder(config=config) - self.continuous_encoder = ContinuousEncoder(config=config) - self.decoder = Decoder(config=config) + self.token_encoder = TokenEncoder(t5config=t5config) + self.continuous_encoder = ContinuousEncoder(t5config=t5config) + self.decoder = Decoder(t5config=t5config) def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 From 55bb6ddcd8c08cbf7cbcdf322a9bdeb7a6d7a915 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 9 Nov 2022 19:12:52 +0100 Subject: [PATCH 025/131] added weight loading --- .../convert_music_spectrogram_to_diffusers.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 6810a9bfad6a..7386a0fdcfbd 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -5,6 +5,7 @@ import jax import tensorflow as tf import torch +import torch.nn as nn from t5x import checkpoints from music_spectrogram_diffusion import inference @@ -15,6 +16,117 @@ MODEL = "base_with_context" +def load_token_encoder(weights, model): + model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) + model.position_encoding.weight = nn.Parameter( + torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False + ) + for lyr_num, lyr in enumerate(model.encoders): + ly_weight = weights[f"layers_{lyr_num}"] + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + +def load_continuous_encoder(weights, model): + 
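# Why the repeated `.T` in these loader functions: a T5X/Flax Dense kernel is stored as
# (in_features, out_features), whereas torch.nn.Linear.weight is (out_features,
# in_features), so each kernel is transposed when copied over. Self-contained check
# with made-up shapes (illustrative only):
import numpy as np
import torch
import torch.nn as nn

kernel = np.random.randn(128, 512).astype(np.float32)    # Flax-style (in, out)
proj = nn.Linear(128, 512, bias=False)
proj.weight = nn.Parameter(torch.FloatTensor(kernel.T))  # torch wants (out, in)

x = torch.randn(2, 128)
reference = x @ torch.from_numpy(kernel)                 # the same projection, done by hand
assert torch.allclose(proj(x), reference, atol=1e-4)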
model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) + + model.position_encoding.weight = nn.Parameter( + torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False + ) + + for lyr_num, lyr in enumerate(model.encoders): + ly_weight = weights[f"layers_{lyr_num}"] + attention_weights = ly_weight["attention"] + + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + +def load_decoder(weights, model): + model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) + model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) + + model.position_encoding.weight = nn.Parameter( + torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False + ) + + model.continuous_inputs_projection.weight = nn.Parameter( + torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) + ) + + for lyr_num, lyr in enumerate(model.decoders): + ly_weight = weights[f"layers_{lyr_num}"] + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + ) + + lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) + ) + + attention_weights = ly_weight["self_attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + + attention_weights = ly_weight["MultiHeadDotProductAttention_0"] + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + + lyr.layer[1].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + ) + + lyr.layer[2].weight = 
nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + lyr.layer[3].scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) + ) + + lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + + model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + + model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + + +def load_checkpoint(t5_checkpoint, model): + load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) + load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) + load_decoder(t5_checkpoint["decoder"], model.decoder) + return model + + def main(args): t5_checkpoint = checkpoints.load_t5x_checkpoint(args.checkpoint_path) @@ -49,10 +161,17 @@ def main(args): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") + model = ContinuousContextTransformer(t5config=t5config) + model = load_checkpoint(t5_checkpoint["target"], model).to(device) + + pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) + pipe.save_pretrained(args.output_path) if __name__ == "__main__": + jax.config.update("jax_platform_name", "cpu") + parser = argparse.ArgumentParser() # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") From 7cb32d7da4ca91630536f162be3a77a307919bfc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 10:27:37 +0100 Subject: [PATCH 026/131] use arguments instead of t5config --- .../convert_music_spectrogram_to_diffusers.py | 32 +- src/diffusers/__init__.py | 10 +- src/diffusers/models/__init__.py | 2 - src/diffusers/models/film.py | 27 - src/diffusers/models/t5_attention.py | 421 -------------- .../spectrogram_diffusion/__init__.py | 2 +- .../pipeline_spectrogram_diffusion.py | 525 +++++++++++++++++- .../dummy_torch_and_accelerate_objects.py | 15 - 8 files changed, 542 insertions(+), 492 deletions(-) delete mode 100644 src/diffusers/models/film.py delete mode 100644 src/diffusers/models/t5_attention.py diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 7386a0fdcfbd..52d7785fff62 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -2,16 +2,16 @@ import argparse import os -import jax -import tensorflow as tf import torch import torch.nn as nn -from t5x import checkpoints +import jax +import tensorflow as tf +from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline +from diffusers.pipelines.spectrogram_diffusion import ContinuousContextTransformer from music_spectrogram_diffusion import inference -from transformers import T5Config +from t5x import checkpoints -from diffusers import DDPMScheduler, ContinuousContextTransformer, SpectrogramDiffusionPipeline MODEL = "base_with_context" @@ -141,7 +141,10 @@ def main(args): gin_config = inference.parse_training_gin_file(gin_file, gin_overrides) synth_model = inference.InferenceModel(args.checkpoint_path, gin_config) - t5config = T5Config( + device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") + scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") + + model = ContinuousContextTransformer( vocab_size=synth_model.model.module.config.vocab_size, max_length=synth_model.sequence_length["inputs"], input_dims=synth_model.audio_codec.n_dims, @@ -149,23 +152,20 @@ def main(args): targets_length=synth_model.sequence_length["targets"], d_model=synth_model.model.module.config.emb_dim, num_heads=synth_model.model.module.config.num_heads, - num_layers=synth_model.model.module.config.num_encoder_layers, + num_encoder_layers=synth_model.model.module.config.num_encoder_layers, num_decoder_layers=synth_model.model.module.config.num_decoder_layers, d_kv=synth_model.model.module.config.head_dim, d_ff=synth_model.model.module.config.mlp_dim, dropout_rate=synth_model.model.module.config.dropout_rate, - feed_forward_proj=synth_model.model.module.config.mlp_activations[0], - is_gated_act=True, + feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") - - model = ContinuousContextTransformer(t5config=t5config) - model = load_checkpoint(t5_checkpoint["target"], model).to(device) + model = load_checkpoint(t5_checkpoint["target"], model) pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) + import pdb + + pdb.set_trace() pipe.save_pretrained(args.output_path) @@ -174,7 +174,7 @@ def main(args): parser = argparse.ArgumentParser() - # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") + # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") # parser.add_argument( # "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." # ) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 02521e8b6fd9..6c69aa31bc82 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -26,15 +26,7 @@ if is_torch_available(): from .modeling_utils import ModelMixin - from .models import ( - AutoencoderKL, - ContinuousContextTransformer, - Transformer2DModel, - UNet1DModel, - UNet2DConditionModel, - UNet2DModel, - VQModel, - ) + from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel from .optimization import ( get_constant_schedule, get_constant_schedule_with_warmup, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index c3d524eddebc..5b101d169148 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -17,8 +17,6 @@ if is_torch_available(): from .attention import Transformer2DModel - from .film import FiLMLayer - from .t5_attention import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder from .unet_1d import UNet1DModel from .unet_2d import UNet2DModel from .unet_2d_condition import UNet2DConditionModel diff --git a/src/diffusers/models/film.py b/src/diffusers/models/film.py deleted file mode 100644 index 8936fd32276d..000000000000 --- a/src/diffusers/models/film.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. 
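# The FiLM layer deleted from models/film.py below (and re-created inside
# pipeline_spectrogram_diffusion.py in this same patch) performs feature-wise linear
# modulation: the conditioning embedding predicts a per-channel scale and bias for the
# hidden states. Tiny usage sketch with made-up sizes (d_model=8, conditioning width=32):
import torch
import torch.nn as nn

class FiLMSketch(nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.scale_bias = nn.Linear(in_features, out_features * 2)

    def forward(self, x, conditioning_emb):
        scale, bias = torch.chunk(self.scale_bias(conditioning_emb), 2, dim=-1)
        return x * (scale + 1.0) + bias

film = FiLMSketch(in_features=32, out_features=8)
hidden = torch.randn(2, 16, 8)          # (batch, sequence, d_model)
cond = torch.randn(2, 1, 32)            # broadcast across the sequence dimension
print(film(hidden, cond).shape)         # torch.Size([2, 16, 8])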
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn - - -class FiLMLayer(nn.Module): - def __init__(self, in_features, out_features): - super().__init__() - self.scale_bias = nn.Linear(in_features, out_features * 2) - - def forward(self, x, conditioning_emb): - scale_bias = self.scale_bias(conditioning_emb) - scale, bias = torch.chunk(scale_bias, 2, -1) - return x * (scale + 1.0) + bias diff --git a/src/diffusers/models/t5_attention.py b/src/diffusers/models/t5_attention.py deleted file mode 100644 index cb4617bf188c..000000000000 --- a/src/diffusers/models/t5_attention.py +++ /dev/null @@ -1,421 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn - -from transformers.models.t5.modeling_t5 import ( - T5Attention, - T5Block, - T5Config, - T5LayerCrossAttention, - T5LayerFF, - T5LayerNorm, -) - -from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin -from .embeddings import get_timestep_embedding -from .film import FiLMLayer - - -class T5LayerSelfAttentionCond(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer_norm = T5LayerNorm(config.d_model) - self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) - self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - ): - # pre_self_attention_layer_norm - normed_hidden_states = self.layer_norm(hidden_states) - - if conditioning_emb is not None: - normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) - - # Self-attention block - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] - return outputs - - -class DecoderLayer(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer = nn.ModuleList() - - # 
cond self attention: layer 0 - self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) - - # cross attention: layer 1 - self.layer.append(T5LayerCrossAttention(config)) - - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - if past_key_value is not None: - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - conditioning_emb=conditioning_emb, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - if encoder_hidden_states is not None: - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -class TokenEncoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config: T5Config): - super().__init__() - - self.token_embedder = nn.Embedding(t5config.vocab_size, t5config.d_model) - - self.position_encoding = nn.Embedding(t5config.max_length, t5config.d_model) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - - t5config.is_decoder = False - t5config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(t5config.num_layers): - lyr = T5Block(t5config) - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) - self.dropout_post = nn.Dropout(p=t5config.dropout_rate) - - def forward(self, encoder_input_tokens, encoder_inputs_mask): - x = self.token_embedder(encoder_input_tokens) - - seq_length = encoder_input_tokens.shape[1] - inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) - x += self.position_encoding(inputs_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class ContinuousEncoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config): - super().__init__() - - self.input_proj = nn.Linear(t5config.input_dims, t5config.d_model, bias=False) - - self.position_encoding = nn.Embedding(t5config.targets_context_length, t5config.d_model) - 
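# Both the t5_attention modules being removed here and the pipeline modules that replace
# them register their constructor arguments with @register_to_config and later read them
# back through `self.config`. A toy version of that pattern (not the diffusers
# implementation; keyword arguments only, for illustration):
import types

def register_to_config_sketch(init):
    def wrapper(self, *args, **kwargs):
        self.config = types.SimpleNamespace(**kwargs)  # stash kwargs for later lookup
        return init(self, *args, **kwargs)
    return wrapper

class TinyModule:
    @register_to_config_sketch
    def __init__(self, d_model: int = 768, dropout_rate: float = 0.1):
        self.proj_width = d_model * 4

m = TinyModule(d_model=512, dropout_rate=0.0)
print(m.config.d_model)  # 512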
self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - - t5config.is_decoder = False - t5config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(t5config.num_layers): - lyr = T5Block(t5config) - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) - self.dropout_post = nn.Dropout(p=t5config.dropout_rate) - - def forward(self, encoder_inputs, encoder_inputs_mask): - x = self.input_proj(encoder_inputs) - - # terminal relative positional encodings - max_positions = encoder_inputs.shape[1] - input_positions = torch.arange(max_positions, device=encoder_inputs.device) - - seq_lens = encoder_inputs_mask.sum(-1) - input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) - x += self.position_encoding(input_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class Decoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config: T5Config): - super().__init__() - - self.conditioning_emb = nn.Sequential( - nn.Linear(t5config.d_model, t5config.d_model * 4, bias=False), - nn.SiLU(), - nn.Linear(t5config.d_model * 4, t5config.d_model * 4, bias=False), - nn.SiLU(), - ) - - self.position_encoding = nn.Embedding(t5config.targets_length, t5config.d_model) - self.position_encoding.weight.requires_grad = False - - self.continuous_inputs_projection = nn.Linear(t5config.input_dims, t5config.d_model) - - self.dropout = nn.Dropout(p=t5config.dropout_rate) - - t5config.is_decoder = True - t5config.is_encoder_decoder = False - self.decoders = nn.ModuleList() - for lyr_num in range(t5config.num_decoder_layers): - # FiLM conditional T5 decoder - lyr = DecoderLayer(t5config) - self.decoders.append(lyr) - - self.decoder_norm = T5LayerNorm(t5config.d_model) - - self.post_dropout = nn.Dropout(p=t5config.dropout_rate) - self.spec_out = nn.Linear(t5config.d_model, t5config.input_dims, bias=False) - - def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): - mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) - return mask.unsqueeze(-3) - - def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): - batch, _, _ = decoder_input_tokens.shape - assert decoder_noise_time.shape == (batch,) - - # decoder_noise_time is in [0, 1), so rescale to expected timing range. - conditioning_emb = get_timestep_embedding( - decoder_noise_time * self.config.t5config.max_decoder_noise_time, - embedding_dim=self.config.t5config.d_model, - max_period=self.max_decoder_noise_time, - ) - - conditioning_emb = self.conditioning_emb(conditioning_emb) - - assert conditioning_emb.shape == (batch, self.config.t5config.d_model * 4) - - seq_length = decoder_input_tokens.shape[1] - - # If we want to use relative positions for audio context, we can just offset - # this sequence by the length of encodings_and_masks. - decoder_positions = torch.broadcast_to( - torch.arange(seq_length, device=decoder_input_tokens.device), - (batch, seq_length), - ) - - position_encodings = self.position_encoding(decoder_positions) - - # decoder: No padding present. - decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) - - # Translate encoding masks to encoder-decoder masks. 
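# The encoder-decoder mask built below is just an outer product of the decoder query
# mask and each encoder key mask, with an extra axis for the attention heads. Small
# self-contained check (batch=1, 3 decoder steps, 4 encoder tokens, last one padding):
import torch

query_mask = torch.ones(1, 3)                       # decoder side: no padding
key_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])     # encoder side: last token is padding
mask = (query_mask.unsqueeze(-1) * key_mask.unsqueeze(-2)).unsqueeze(-3)
print(mask.shape)   # torch.Size([1, 1, 3, 4])  ->  (batch, heads, queries, keys)
print(mask[0, 0])   # each decoder step may attend to the first three encoder tokens only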
- encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - - inputs = self.continuous_inputs_projection(decoder_input_tokens) - - inputs += position_encodings - - y = self.dropout(inputs) - - # cross attend style: concat encodings - encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) - encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) - for lyr in self.decoders: - y = lyr( - y, - conditioning_emb=conditioning_emb, - encoder_hidden_states=encoded, - encoder_attention_mask=encoder_decoder_mask, - )[0] - - y = self.decoder_norm(y) - y = self.post_dropout(y) - - spec_out = self.spec_out(y) - return spec_out - - -class ContinuousContextTransformer(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config: T5Config): - super().__init__() - - self.token_encoder = TokenEncoder(t5config=t5config) - self.continuous_encoder = ContinuousEncoder(t5config=t5config) - self.decoder = Decoder(t5config=t5config) - - def encode(self, input_tokens, continuous_inputs, continuous_mask): - tokens_mask = input_tokens > 0 - tokens_encoded, tokens_mask = self.token_encoder( - encoder_input_tokens=input_tokens, - encoder_inputs_mask=tokens_mask, - ) - - continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs, - encoder_inputs_mask=continuous_mask, - ) - - return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] - - def decode(self, encodings_and_masks, input_tokens, noise_time): - logits = self.decoder( - encodings_and_masks=encodings_and_masks, - decoder_input_tokens=input_tokens, - decoder_noise_time=noise_time, - ) - return logits - - def forward( - self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, - decoder_input_tokens, - decoder_noise_time, - ): - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, - ) - - return self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, - noise_time=decoder_noise_time, - ) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index de37e892a7db..850f9f7fba6d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import SpectrogramDiffusionPipeline +from .pipeline_spectrogram_diffusion import ContinuousContextTransformer, SpectrogramDiffusionPipeline diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 69f1f092febf..35ada88c9e9e 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -2,12 +2,535 @@ from typing import Optional import torch +import torch.nn as nn -from ...models.t5_attention import ContinuousContextTransformer +from transformers.models.t5.modeling_t5 import ( + T5Attention, + T5Block, + T5Config, + T5LayerCrossAttention, + T5LayerFF, + T5LayerNorm, +) + +from ...configuration_utils import ConfigMixin, register_to_config +from ...modeling_utils import ModelMixin +from ...models.embeddings import get_timestep_embedding from 
...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler +class FiLMLayer(nn.Module): + def __init__(self, in_features, out_features): + super().__init__() + self.scale_bias = nn.Linear(in_features, out_features * 2) + + def forward(self, x, conditioning_emb): + scale_bias = self.scale_bias(conditioning_emb) + scale, bias = torch.chunk(scale_bias, 2, -1) + return x * (scale + 1.0) + bias + + +class T5LayerSelfAttentionCond(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer_norm = T5LayerNorm(config.d_model) + self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + # pre_self_attention_layer_norm + normed_hidden_states = self.layer_norm(hidden_states) + + if conditioning_emb is not None: + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) + + # Self-attention block + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] + return outputs + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class TokenEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + max_length: int, + vocab_size: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + ): + super().__init__() + + self.token_embedder = nn.Embedding(vocab_size, d_model) + + 
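# The per-module T5Config objects below are built with feed_forward_proj="gated-gelu",
# which is why the checkpoint carries separate wi_0 / wi_1 kernels: one projection is
# passed through the activation and gates the other before the output projection.
# Minimal sketch of that feed-forward shape (not the transformers T5LayerFF code):
import torch
import torch.nn as nn

class GatedGeluFF(nn.Module):
    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.wi_0 = nn.Linear(d_model, d_ff, bias=False)   # activated branch
        self.wi_1 = nn.Linear(d_model, d_ff, bias=False)   # gating branch
        self.wo = nn.Linear(d_ff, d_model, bias=False)
        self.act = nn.GELU()

    def forward(self, x):
        return self.wo(self.act(self.wi_0(x)) * self.wi_1(x))

print(GatedGeluFF(d_model=8, d_ff=32)(torch.randn(2, 5, 8)).shape)  # torch.Size([2, 5, 8])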
self.position_encoding = nn.Embedding(max_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + vocab_size=vocab_size, + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + dropout_rate=dropout_rate, + feed_forward_proj=feed_forward_proj, + is_decoder=False, + is_encoder_decoder=False, + ) + + self.encoders = nn.ModuleList() + for lyr_num in range(num_layers): + lyr = T5Block(t5config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.dropout_post = nn.Dropout(p=dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + input_dims: int, + targets_context_length: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + ): + super().__init__() + + self.input_proj = nn.Linear(input_dims, d_model, bias=False) + + self.position_encoding = nn.Embedding(targets_context_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + feed_forward_proj=feed_forward_proj, + dropout_rate=dropout_rate, + is_decoder=False, + is_encoder_decoder=False, + ) + self.encoders = nn.ModuleList() + for lyr_num in range(num_layers): + lyr = T5Block(t5config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.dropout_post = nn.Dropout(p=dropout_rate) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + + seq_lens = encoder_inputs_mask.sum(-1) + input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class Decoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + input_dims: int, + targets_length: int, + max_decoder_noise_time: float, + d_model: int, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + dropout_rate: float, + feed_forward_proj: str, + ): + super().__init__() + + self.conditioning_emb = nn.Sequential( + nn.Linear(d_model, d_model * 4, bias=False), + nn.SiLU(), + nn.Linear(d_model * 4, d_model * 4, bias=False), + nn.SiLU(), + ) + + self.position_encoding = nn.Embedding(targets_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.continuous_inputs_projection = nn.Linear(input_dims, d_model) + + self.dropout = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + 
feed_forward_proj=feed_forward_proj, + dropout_rate=dropout_rate, + is_decoder=True, + is_encoder_decoder=False, + ) + self.decoders = nn.ModuleList() + for lyr_num in range(num_layers): + # FiLM conditional T5 decoder + lyr = DecoderLayer(t5config) + self.decoders.append(lyr) + + self.decoder_norm = T5LayerNorm(d_model) + + self.post_dropout = nn.Dropout(p=dropout_rate) + self.spec_out = nn.Linear(d_model, input_dims, bias=False) + + def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): + mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) + return mask.unsqueeze(-3) + + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + batch, _, _ = decoder_input_tokens.shape + assert decoder_noise_time.shape == (batch,) + + # decoder_noise_time is in [0, 1), so rescale to expected timing range. + conditioning_emb = get_timestep_embedding( + decoder_noise_time * self.config.max_decoder_noise_time, + embedding_dim=self.config.d_model, + max_period=self.config.max_decoder_noise_time, + ) + + conditioning_emb = self.conditioning_emb(conditioning_emb) + + assert conditioning_emb.shape == (batch, self.config.d_model * 4) + + seq_length = decoder_input_tokens.shape[1] + + # If we want to use relative positions for audio context, we can just offset + # this sequence by the length of encodings_and_masks. + decoder_positions = torch.broadcast_to( + torch.arange(seq_length, device=decoder_input_tokens.device), + (batch, seq_length), + ) + + position_encodings = self.position_encoding(decoder_positions) + + # decoder: No padding present. + decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) + + # Translate encoding masks to encoder-decoder masks. + encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] + + inputs = self.continuous_inputs_projection(decoder_input_tokens) + + inputs += position_encodings + + y = self.dropout(inputs) + + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + for lyr in self.decoders: + y = lyr( + y, + conditioning_emb=conditioning_emb, + encoder_hidden_states=encoded, + encoder_attention_mask=encoder_decoder_mask, + )[0] + + y = self.decoder_norm(y) + y = self.post_dropout(y) + + spec_out = self.spec_out(y) + return spec_out + + +class ContinuousContextTransformer(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + input_dims: int, + max_length: int, + targets_context_length: int, + targets_length: int, + max_decoder_noise_time: float, + vocab_size: int, + d_model: int, + dropout_rate: float, + num_encoder_layers: int, + num_decoder_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str = "gated-gelu", + ): + super().__init__() + + self.token_encoder = TokenEncoder( + max_length=max_length, + vocab_size=vocab_size, + d_model=d_model, + dropout_rate=dropout_rate, + num_layers=num_encoder_layers, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + feed_forward_proj=feed_forward_proj, + ) + + self.continuous_encoder = ContinuousEncoder( + input_dims=input_dims, + targets_context_length=targets_context_length, + d_model=d_model, + dropout_rate=dropout_rate, + num_layers=num_encoder_layers, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + feed_forward_proj=feed_forward_proj, + ) + + self.decoder = Decoder( + 
input_dims=input_dims, + targets_length=targets_length, + max_decoder_noise_time=max_decoder_noise_time, + d_model=d_model, + num_layers=num_decoder_layers, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + dropout_rate=dropout_rate, + feed_forward_proj=feed_forward_proj, + ) + + def encode(self, input_tokens, continuous_inputs, continuous_mask): + tokens_mask = input_tokens > 0 + tokens_encoded, tokens_mask = self.token_encoder( + encoder_input_tokens=input_tokens, + encoder_inputs_mask=tokens_mask, + ) + + continuous_encoded, continuous_mask = self.continuous_encoder( + encoder_inputs=continuous_inputs, + encoder_inputs_mask=continuous_mask, + ) + + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] + + def decode(self, encodings_and_masks, input_tokens, noise_time): + logits = self.decoder( + encodings_and_masks=encodings_and_masks, + decoder_input_tokens=input_tokens, + decoder_noise_time=noise_time, + ) + return logits + + def forward( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + decoder_noise_time, + ): + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + return self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=decoder_noise_time, + ) + + class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: super().__init__() diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py index e2a2046969fe..21b7c2a4d7b9 100644 --- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py +++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py @@ -34,21 +34,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) -class ContinuousContextTransformer(metaclass=DummyObject): - _backends = ["torch", "accelerate"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "accelerate"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - class Transformer2DModel(metaclass=DummyObject): _backends = ["torch", "accelerate"] From 0251747a3660c7c488bbffeea3bfaad1fb78ded4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 11:09:27 +0100 Subject: [PATCH 027/131] broadcast noise time to batch dim --- .../pipeline_spectrogram_diffusion.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 35ada88c9e9e..64f185af7640 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -503,10 +503,19 @@ def encode(self, input_tokens, continuous_inputs, continuous_mask): return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] def decode(self, encodings_and_masks, input_tokens, noise_time): + timesteps = noise_time + if not torch.is_tensor(timesteps): + timesteps = torch.tensor([timesteps], dtype=torch.long, 
device=input_tokens.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(input_tokens.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps * torch.ones(input_tokens.shape[0], dtype=timesteps.dtype, device=timesteps.device) + logits = self.decoder( encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, - decoder_noise_time=noise_time, + decoder_noise_time=timesteps, ) return logits From 8a54f88a8e1747a2c87b7694c858fcbc3f2527c7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 14:41:28 +0100 Subject: [PATCH 028/131] fix call --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 64f185af7640..9f8bb1124fd5 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -576,7 +576,7 @@ def __call__( encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) encodings_and_masks = self.cont_context_trans.encode( - encoder_input_tokens=encoder_input_tokens, + input_tokens=encoder_input_tokens, continuous_inputs=encoder_continuous_inputs, continuous_mask=encoder_continuous_mask, ) From b6373b896eb228c4b18c76d7ac2dd1d29add800a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 14:53:15 +0100 Subject: [PATCH 029/131] added scale_to_features --- .../pipeline_spectrogram_diffusion.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 9f8bb1124fd5..e61f093b811b 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -560,6 +560,15 @@ def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): # Scale to [min_out, max_out]. return zero_one * (max_out - min_out) + min_out + def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): + """Invert by linearly scaling network outputs to features range.""" + min_out, max_out = input_range + outputs = torch.clip(outputs, min_out, max_out) if clip else outputs + # Scale to [0, 1]. + zero_one = (outputs - min_out) / (max_out - min_out) + # Scale to [self.min_value, self.max_value]. 
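# scale_features and scale_to_features are inverse affine maps between the spectrogram
# feature range [min_value, max_value] and the network's working range (here [-1, 1]).
# Round-trip sketch with assumed bounds (the real self.min_value / self.max_value are
# defined elsewhere on the pipeline and are not shown in this file):
import torch

min_value, max_value = -100.0, 10.0                  # illustrative bounds only
features = torch.tensor([-100.0, -45.0, 10.0])

zero_one = (features - min_value) / (max_value - min_value)
scaled = zero_one * 2.0 - 1.0                        # [min_value, max_value] -> [-1, 1]
restored = (scaled + 1.0) / 2.0 * (max_value - min_value) + min_value

print(scaled)                                        # tensor([-1.,  0.,  1.])
print(torch.allclose(restored, features))            # True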
+ return zero_one * (self.max_value - self.min_value) + self.min_value + @torch.no_grad() def __call__( self, From 5fb437d91854cb771aab0263a5e2b2a5d2d04561 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 16:21:57 +0100 Subject: [PATCH 030/131] fix weights --- .../convert_music_spectrogram_to_diffusers.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 52d7785fff62..479986237ab0 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -2,11 +2,11 @@ import argparse import os +import numpy as np + import torch import torch.nn as nn -import jax -import tensorflow as tf from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import ContinuousContextTransformer from music_spectrogram_diffusion import inference @@ -33,12 +33,14 @@ def load_token_encoder(weights, model): ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + return model + def load_continuous_encoder(weights, model): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) @@ -60,12 +62,14 @@ def load_continuous_encoder(weights, model): ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + return model + def load_decoder(weights, model): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) @@ -112,18 +116,21 @@ def load_decoder(weights, model): ) lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + return model + def load_checkpoint(t5_checkpoint, model): - load_token_encoder(t5_checkpoint["token_encoder"], 
model.token_encoder) - load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) - load_decoder(t5_checkpoint["decoder"], model.decoder) + model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) + + model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) + model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) return model @@ -163,15 +170,10 @@ def main(args): model = load_checkpoint(t5_checkpoint["target"], model) pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) - import pdb - - pdb.set_trace() - pipe.save_pretrained(args.output_path) + pipe.save_pretrained("kashif") if __name__ == "__main__": - jax.config.update("jax_platform_name", "cpu") - parser = argparse.ArgumentParser() # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") From 5591f21ddc49e3b54a55819a5a1f3c2845c36bfa Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 17:04:47 +0100 Subject: [PATCH 031/131] transpose laynorm weight --- .../convert_music_spectrogram_to_diffusers.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 479986237ab0..582665c1de54 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -29,15 +29,15 @@ def load_token_encoder(weights, model): lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) return model @@ -58,15 +58,15 @@ def load_continuous_encoder(weights, model): lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - 
lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) return model @@ -86,7 +86,7 @@ def load_decoder(weights, model): for lyr_num, lyr in enumerate(model.decoders): ly_weight = weights[f"layers_{lyr_num}"] lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"].T) ) lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( @@ -106,10 +106,10 @@ def load_decoder(weights, model): lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"].T) ) - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) lyr.layer[3].scale_bias.weight = nn.Parameter( torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) @@ -119,7 +119,7 @@ def load_decoder(weights, model): lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"].T)) model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) From 21b7ea226e6b5a6d8deea71926475e26f3e7cde5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 14 Nov 2022 16:27:59 +0100 Subject: [PATCH 032/131] scale is a vector --- .../convert_music_spectrogram_to_diffusers.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 582665c1de54..dfef4484c5b9 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -23,22 +23,23 @@ def load_token_encoder(weights, model): ) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + attention_weights = ly_weight["attention"] lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) - ) + + lyr.layer[1].layer_norm.weight = 
nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) - - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) return model @@ -58,15 +59,15 @@ def load_continuous_encoder(weights, model): lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) return model @@ -86,7 +87,7 @@ def load_decoder(weights, model): for lyr_num, lyr in enumerate(model.decoders): ly_weight = weights[f"layers_{lyr_num}"] lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"].T) + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) ) lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( @@ -106,10 +107,10 @@ def load_decoder(weights, model): lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"].T) + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) ) - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) + lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) lyr.layer[3].scale_bias.weight = nn.Parameter( torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) @@ -119,7 +120,7 @@ def load_decoder(weights, model): lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"].T)) + model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) model.spec_out.weight = 
nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) @@ -167,7 +168,7 @@ def main(args): feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - model = load_checkpoint(t5_checkpoint["target"], model) + model = load_checkpoint(t5_checkpoint["target"], model).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) pipe.save_pretrained("kashif") From 87ee8a34cf992c7a38a70512eecb02efadbad8d0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 10:41:09 +0100 Subject: [PATCH 033/131] scale the query outputs --- .../convert_music_spectrogram_to_diffusers.py | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index dfef4484c5b9..c9f056d7d17b 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -16,7 +16,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model): +def load_token_encoder(weights, model, depth_scaling): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -28,7 +28,9 @@ def load_token_encoder(weights, model): ) attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -43,7 +45,7 @@ def load_token_encoder(weights, model): return model -def load_continuous_encoder(weights, model): +def load_continuous_encoder(weights, model, depth_scaling): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) model.position_encoding.weight = nn.Parameter( @@ -54,7 +56,9 @@ def load_continuous_encoder(weights, model): ly_weight = weights[f"layers_{lyr_num}"] attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -72,7 +76,7 @@ def load_continuous_encoder(weights, model): return model -def load_decoder(weights, model): +def load_decoder(weights, model, depth_scaling): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) @@ -95,13 +99,17 @@ def load_decoder(weights, model): ) attention_weights = 
ly_weight["self_attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].EncDecAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -127,11 +135,13 @@ def load_decoder(weights, model): return model -def load_checkpoint(t5_checkpoint, model): - model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) +def load_checkpoint(t5_checkpoint, model, depth_scaling): + model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder, depth_scaling) - model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) - model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) + model.continuous_encoder = load_continuous_encoder( + t5_checkpoint["continuous_encoder"], model.continuous_encoder, depth_scaling + ) + model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder, depth_scaling) return model @@ -168,7 +178,9 @@ def main(args): feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - model = load_checkpoint(t5_checkpoint["target"], model).eval() + model = load_checkpoint( + t5_checkpoint["target"], model, depth_scaling=synth_model.model.module.config.head_dim**-0.5 + ).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) pipe.save_pretrained("kashif") From 6deafab654b5dc4691592155a3546b2b2dceb354 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 10:53:00 +0100 Subject: [PATCH 034/131] added comment --- scripts/convert_music_spectrogram_to_diffusers.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index c9f056d7d17b..4f4e2b802d0f 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -16,7 +16,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model, depth_scaling): +def load_token_encoder(weights, model, depth_scaling=1.0): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -45,7 +45,7 @@ def load_token_encoder(weights, model, depth_scaling): return model -def load_continuous_encoder(weights, 
model, depth_scaling): +def load_continuous_encoder(weights, model, depth_scaling=1.0): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) model.position_encoding.weight = nn.Parameter( @@ -76,7 +76,7 @@ def load_continuous_encoder(weights, model, depth_scaling): return model -def load_decoder(weights, model, depth_scaling): +def load_decoder(weights, model, depth_scaling=1.0): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) @@ -135,7 +135,7 @@ def load_decoder(weights, model, depth_scaling): return model -def load_checkpoint(t5_checkpoint, model, depth_scaling): +def load_checkpoint(t5_checkpoint, model, depth_scaling=1.0): model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder, depth_scaling) model.continuous_encoder = load_continuous_encoder( @@ -178,6 +178,10 @@ def main(args): feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) + + # NOTE: T5 does not explicitly rescale the attention logits by + # 1/sqrt(depth_kq)! This is folded into the initializers of the + # linear transformations, which is equivalent under Adafactor. model = load_checkpoint( t5_checkpoint["target"], model, depth_scaling=synth_model.model.module.config.head_dim**-0.5 ).eval() From 8830c2bb72397f4558f5d7124003c228358c29f6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 14:51:46 +0100 Subject: [PATCH 035/131] undo scaling --- scripts/convert_music_spectrogram_to_diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 4f4e2b802d0f..9b68da20664d 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -182,9 +182,7 @@ def main(args): # NOTE: T5 does not explicitly rescale the attention logits by # 1/sqrt(depth_kq)! This is folded into the initializers of the # linear transformations, which is equivalent under Adafactor. 
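# A minimal sketch of the equivalence the NOTE above describes (toy shapes, not part
# of the conversion script): multiplying the query kernel by head_dim**-0.5 at load
# time yields the same attention logits as applying the 1/sqrt(d_kv) scale inside the
# attention op, so the scale should live in exactly one of the two places.
import torch

d_model, d_kv = 8, 4
x = torch.randn(2, 3, d_model)   # (batch, seq, d_model)
wq = torch.randn(d_model, d_kv)  # toy query/key kernels
wk = torch.randn(d_model, d_kv)

logits_scaled_in_attention = (x @ wq) @ (x @ wk).transpose(-1, -2) * d_kv**-0.5
logits_scaled_in_weights = (x @ (wq * d_kv**-0.5)) @ (x @ wk).transpose(-1, -2)
assert torch.allclose(logits_scaled_in_attention, logits_scaled_in_weights, atol=1e-5)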
- model = load_checkpoint( - t5_checkpoint["target"], model, depth_scaling=synth_model.model.module.config.head_dim**-0.5 - ).eval() + model = load_checkpoint(t5_checkpoint["target"], model).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) pipe.save_pretrained("kashif") From 3b9e822eda59ba29dec97c31aaf6790bd471bc42 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 15:11:59 +0100 Subject: [PATCH 036/131] undo depth_scaling --- .../convert_music_spectrogram_to_diffusers.py | 46 +++++++------------ 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 9b68da20664d..d3fad9380b4f 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -16,7 +16,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model, depth_scaling=1.0): +def load_token_encoder(weights, model): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -28,9 +28,7 @@ def load_token_encoder(weights, model, depth_scaling=1.0): ) attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -45,7 +43,7 @@ def load_token_encoder(weights, model, depth_scaling=1.0): return model -def load_continuous_encoder(weights, model, depth_scaling=1.0): +def load_continuous_encoder(weights, model): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) model.position_encoding.weight = nn.Parameter( @@ -56,9 +54,7 @@ def load_continuous_encoder(weights, model, depth_scaling=1.0): ly_weight = weights[f"layers_{lyr_num}"] attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -76,7 +72,7 @@ def load_continuous_encoder(weights, model, depth_scaling=1.0): return model -def load_decoder(weights, model, depth_scaling=1.0): +def load_decoder(weights, model): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) @@ -99,17 +95,13 @@ def load_decoder(weights, model, depth_scaling=1.0): ) attention_weights = ly_weight["self_attention"] - 
lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -135,13 +127,11 @@ def load_decoder(weights, model, depth_scaling=1.0): return model -def load_checkpoint(t5_checkpoint, model, depth_scaling=1.0): - model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder, depth_scaling) +def load_checkpoint(t5_checkpoint, model): + model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) - model.continuous_encoder = load_continuous_encoder( - t5_checkpoint["continuous_encoder"], model.continuous_encoder, depth_scaling - ) - model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder, depth_scaling) + model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) + model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) return model @@ -179,22 +169,20 @@ def main(args): max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - # NOTE: T5 does not explicitly rescale the attention logits by - # 1/sqrt(depth_kq)! This is folded into the initializers of the - # linear transformations, which is equivalent under Adafactor. model = load_checkpoint(t5_checkpoint["target"], model).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) - pipe.save_pretrained("kashif") + if args.save: + pipe.save_pretrained(args.output_path) if __name__ == "__main__": parser = argparse.ArgumentParser() - # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") - # parser.add_argument( - # "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." - # ) + parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") + parser.add_argument( + "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." 
+ ) parser.add_argument( "--checkpoint_path", default=f"{MODEL}/checkpoint_500000", From 9328701ccb7797b1d88307d0779e2e0d07ce1908 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 21:00:10 +0100 Subject: [PATCH 037/131] inital get_extended_attention_mask --- .../pipeline_spectrogram_diffusion.py | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index e61f093b811b..bd2db8e9bce7 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -12,6 +12,8 @@ T5LayerFF, T5LayerNorm, ) +from transformers.modeling_utils import ModuleUtilsMixin + from ...configuration_utils import ConfigMixin, register_to_config from ...modeling_utils import ModelMixin @@ -71,10 +73,11 @@ def forward( return outputs -class DecoderLayer(nn.Module): +class DecoderLayer(nn.Module, ModuleUtilsMixin): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.layer = nn.ModuleList() + self.config = config # cond self attention: layer 0 self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) @@ -122,10 +125,16 @@ def forward( else: self_attn_past_key_value, cross_attn_past_key_value = None, None + input_shape = (hidden_states.shape[0], hidden_states.shape[1]) + if attention_mask is None: + attention_mask = torch.ones(input_shape[0], input_shape[1], device=hidden_states.device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + self_attention_outputs = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, - attention_mask=attention_mask, + attention_mask=extended_attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=self_attn_past_key_value, @@ -148,10 +157,13 @@ def forward( else: query_length = None + input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) + extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, + attention_mask=extended_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, @@ -198,7 +210,7 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class TokenEncoder(ModelMixin, ConfigMixin): +class TokenEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -211,6 +223,7 @@ def __init__( d_kv: int, d_ff: int, feed_forward_proj: str, + is_decoder: bool = False, ): super().__init__() @@ -229,7 +242,7 @@ def __init__( d_ff=d_ff, dropout_rate=dropout_rate, feed_forward_proj=feed_forward_proj, - is_decoder=False, + is_decoder=is_decoder, is_encoder_decoder=False, ) @@ -250,14 +263,18 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): x = self.dropout_pre(x) + # inverted the attention mask + input_shape = encoder_input_tokens.size() + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) + for lyr 
in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] + x = lyr(x, extended_attention_mask)[0] x = self.layer_norm(x) return self.dropout_post(x), encoder_inputs_mask -class ContinuousEncoder(ModelMixin, ConfigMixin): +class ContinuousEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -270,6 +287,7 @@ def __init__( d_kv: int, d_ff: int, feed_forward_proj: str, + is_decoder: bool = False, ): super().__init__() @@ -287,7 +305,7 @@ def __init__( d_ff=d_ff, feed_forward_proj=feed_forward_proj, dropout_rate=dropout_rate, - is_decoder=False, + is_decoder=is_decoder, is_encoder_decoder=False, ) self.encoders = nn.ModuleList() @@ -311,8 +329,12 @@ def forward(self, encoder_inputs, encoder_inputs_mask): x = self.dropout_pre(x) + # inverted the attention mask + input_shape = encoder_inputs.size() + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) + for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] + x = lyr(x, extended_attention_mask)[0] x = self.layer_norm(x) return self.dropout_post(x), encoder_inputs_mask From f86a785f4aee6b3d19ef92152977bb007a6ec9cf Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 20 Nov 2022 21:49:35 +0100 Subject: [PATCH 038/131] attention_mask is none in self-attention --- .../pipeline_spectrogram_diffusion.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index bd2db8e9bce7..9d4f540152b8 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -125,16 +125,10 @@ def forward( else: self_attn_past_key_value, cross_attn_past_key_value = None, None - input_shape = (hidden_states.shape[0], hidden_states.shape[1]) - if attention_mask is None: - attention_mask = torch.ones(input_shape[0], input_shape[1], device=hidden_states.device) - - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - self_attention_outputs = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, - attention_mask=extended_attention_mask, + attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=self_attn_past_key_value, @@ -157,13 +151,13 @@ def forward( else: query_length = None - input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) - extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) - - cross_attention_outputs = self.layer[1]( + input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) + extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + + cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=extended_attention_mask, + attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, @@ -434,9 +428,24 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) y = self.dropout(inputs) + import pdb + + pdb.set_trace() + # cross attend style: concat encodings encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) encoder_decoder_mask = 
torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + + # import pdb + + # pdb.set_trace() + + # # inverted the attention mask + # input_shape = encoded.size() + # extended_attention_mask = self.get_extended_attention_mask(encoder_decoder_mask, input_shape) + + encoder_decoder_mask = torch.where(encoder_decoder_mask > 0, 0, -1e10) + for lyr in self.decoders: y = lyr( y, From 9905492af78ae1cb4ec74aa2c4a6c3421f0cb7c6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 20 Nov 2022 21:54:02 +0100 Subject: [PATCH 039/131] cleanup --- .../pipeline_spectrogram_diffusion.py | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 9d4f540152b8..876b35aaa4b4 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -151,13 +151,13 @@ def forward( else: query_length = None - input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) - extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) - - cross_attention_outputs = self.layer[1]( + input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) + extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + + cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, + attention_mask=extended_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, @@ -421,31 +421,14 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) # Translate encoding masks to encoder-decoder masks. 
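# A rough sketch of what the mask translation below amounts to (toy shapes; it assumes
# encoder_decoder_mask() takes the outer product of the decoder and encoder padding
# masks, which is not shown in this hunk): kept positions end up with an additive bias
# of 0 and padded encoder positions with -1e10, so softmax drives their weight to ~0.
import torch

decoder_mask = torch.ones(1, 4)                 # decoder side: no padding
encoder_mask = torch.tensor([[1.0, 1.0, 0.0]])  # last encoder position is padding
cross_mask = decoder_mask.unsqueeze(-1) * encoder_mask.unsqueeze(-2)  # (1, 4, 3)
additive_bias = torch.where(cross_mask > 0, 0.0, -1e10)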
encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - inputs = self.continuous_inputs_projection(decoder_input_tokens) - inputs += position_encodings - y = self.dropout(inputs) - import pdb - - pdb.set_trace() - # cross attend style: concat encodings encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) - # import pdb - - # pdb.set_trace() - - # # inverted the attention mask - # input_shape = encoded.size() - # extended_attention_mask = self.get_extended_attention_mask(encoder_decoder_mask, input_shape) - - encoder_decoder_mask = torch.where(encoder_decoder_mask > 0, 0, -1e10) - for lyr in self.decoders: y = lyr( y, From f439e5b2662b780796557685fa7b909168a3167d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 20 Nov 2022 22:23:27 +0100 Subject: [PATCH 040/131] manually invert attention --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 876b35aaa4b4..78140ddcb022 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -151,13 +151,12 @@ def forward( else: query_length = None - input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) - extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + encoder_extended_attention_mask = torch.where(encoder_attention_mask > 0, 0, -1e10) cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=extended_attention_mask, + attention_mask=encoder_extended_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, From dd5dc10e2f89fa177de1844ae05e4f85d595556f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 21 Nov 2022 12:17:32 +0100 Subject: [PATCH 041/131] nn.linear need bias=False --- .../convert_music_spectrogram_to_diffusers.py | 2 +- .../pipeline_spectrogram_diffusion.py | 22 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index d3fad9380b4f..005ae0ebeb9d 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -179,7 +179,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") + parser.add_argument("--output_path", default=None, type=str, required=True, help="Path to the converted model.") parser.add_argument( "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." 
) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 78140ddcb022..3e1254b80b3f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -25,7 +25,7 @@ class FiLMLayer(nn.Module): def __init__(self, in_features, out_features): super().__init__() - self.scale_bias = nn.Linear(in_features, out_features * 2) + self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False) def forward(self, x, conditioning_emb): scale_bias = self.scale_bias(conditioning_emb) @@ -360,7 +360,7 @@ def __init__( self.position_encoding = nn.Embedding(targets_length, d_model) self.position_encoding.weight.requires_grad = False - self.continuous_inputs_projection = nn.Linear(input_dims, d_model) + self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias=False) self.dropout = nn.Dropout(p=dropout_rate) @@ -393,16 +393,19 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) batch, _, _ = decoder_input_tokens.shape assert decoder_noise_time.shape == (batch,) + # TODO remove: + # decoder_input_tokens = torch.ones_like(decoder_input_tokens) + # decoder_noise_time is in [0, 1), so rescale to expected timing range. - conditioning_emb = get_timestep_embedding( + time_steps = get_timestep_embedding( decoder_noise_time * self.config.max_decoder_noise_time, embedding_dim=self.config.d_model, max_period=self.config.max_decoder_noise_time, ) - conditioning_emb = self.conditioning_emb(conditioning_emb) + conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1) - assert conditioning_emb.shape == (batch, self.config.d_model * 4) + assert conditioning_emb.shape == (batch, 1, self.config.d_model * 4) seq_length = decoder_input_tokens.shape[1] @@ -415,14 +418,15 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) position_encodings = self.position_encoding(decoder_positions) + inputs = self.continuous_inputs_projection(decoder_input_tokens) + inputs += position_encodings + y = self.dropout(inputs) + # decoder: No padding present. decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) # Translate encoding masks to encoder-decoder masks. encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - inputs = self.continuous_inputs_projection(decoder_input_tokens) - inputs += position_encodings - y = self.dropout(inputs) # cross attend style: concat encodings encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) @@ -614,7 +618,7 @@ def __call__( output = self.cont_context_trans.decode( encodings_and_masks=encodings_and_masks, input_tokens=x, - noise_time=t, + noise_time=t / num_inference_steps, # rescale to [0, 1) ) # 2. 
compute previous output: x_t -> x_t-1 From d987df010e85a5bcce553f1be6fc4ba5278c57f4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 23 Nov 2022 19:51:02 +0100 Subject: [PATCH 042/131] added T5LayerFFCond --- .../convert_music_spectrogram_to_diffusers.py | 13 ++--- .../pipeline_spectrogram_diffusion.py | 49 +++++++++++-------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 005ae0ebeb9d..4fa9f151db5d 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -105,20 +105,17 @@ def load_decoder(weights, model): lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter( torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) ) - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - lyr.layer[3].scale_bias.weight = nn.Parameter( + lyr.layer[2].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[2].film.scale_bias.weight = nn.Parameter( torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) ) - - lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 3e1254b80b3f..7e0399e2471b 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -9,8 +9,9 @@ T5Block, T5Config, T5LayerCrossAttention, - T5LayerFF, T5LayerNorm, + T5DenseGatedActDense, + T5DenseActDense, ) from transformers.modeling_utils import ModuleUtilsMixin @@ -33,6 +34,28 @@ def forward(self, x, conditioning_emb): return x * (scale + 1.0) + bias +class T5LayerFFCond(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + if config.is_gated_act: + self.DenseReluDense = T5DenseGatedActDense(config) + else: + self.DenseReluDense = T5DenseActDense(config) + + self.film = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states, conditioning_emb=None): + forwarded_states = self.layer_norm(hidden_states) + if 
conditioning_emb is not None: + forwarded_states = self.film(forwarded_states, conditioning_emb) + + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + class T5LayerSelfAttentionCond(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() @@ -85,14 +108,8 @@ def __init__(self, config, has_relative_attention_bias=False): # cross attention: layer 1 self.layer.append(T5LayerCrossAttention(config)) - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) + # Film Cond MLP + dropout: last layer + self.layer.append(T5LayerFFCond(config)) def forward( self, @@ -178,15 +195,8 @@ def forward( # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) + # Apply Film Conditional Feed Forward layer + hidden_states = self.layer[-1](hidden_states, conditioning_emb) # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): @@ -393,9 +403,6 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) batch, _, _ = decoder_input_tokens.shape assert decoder_noise_time.shape == (batch,) - # TODO remove: - # decoder_input_tokens = torch.ones_like(decoder_input_tokens) - # decoder_noise_time is in [0, 1), so rescale to expected timing range. time_steps = get_timestep_embedding( decoder_noise_time * self.config.max_decoder_noise_time, From 428fae945ce82371dd4f7495c415e57e1fa0bc46 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 13:59:40 +0100 Subject: [PATCH 043/131] remove to fix conflict --- src/diffusers/pipeline_utils.py | 14 -------------- .../utils/dummy_torch_and_accelerate_objects.py | 15 --------------- 2 files changed, 29 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index ca08c1b36ce3..36c2d5b888ef 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -109,20 +109,6 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray -@dataclass -class MelPipelineOutput(BaseOutput): - """ - Output class for Mel pipelines. - - Args: - mels (`np.ndarray`) - List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel - samples of the diffusion pipeline. - """ - - mels: np.ndarray - - class DiffusionPipeline(ConfigMixin): r""" Base class for all models. 
diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py index 21b7c2a4d7b9..335e3ca24d2a 100644 --- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py +++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py @@ -272,21 +272,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) -class SpectrogramDiffusionPipeline(metaclass=DummyObject): - _backends = ["torch", "accelerate"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "accelerate"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - class DDIMScheduler(metaclass=DummyObject): _backends = ["torch", "accelerate"] From 670331eea986ffd3dc4d848c429c8105b0478a86 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 14:02:38 +0100 Subject: [PATCH 044/131] make style and dummy --- scripts/convert_music_spectrogram_to_diffusers.py | 1 - src/diffusers/pipeline_utils.py | 14 ++++++++++++++ .../pipeline_spectrogram_diffusion.py | 7 +++---- src/diffusers/utils/dummy_pt_objects.py | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 4fa9f151db5d..718006229880 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -3,7 +3,6 @@ import os import numpy as np - import torch import torch.nn as nn diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 01bcc6a33803..57627c80df41 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -118,6 +118,20 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray +@dataclass +class MelPipelineOutput(BaseOutput): + """ + Output class for Mel pipelines. + + Args: + mels (`np.ndarray`) + List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel + samples of the diffusion pipeline. 
+ """ + + mels: np.ndarray + + def is_safetensors_compatible(info) -> bool: filenames = set(sibling.rfilename for sibling in info.siblings) pt_filenames = set(filename for filename in filenames if filename.endswith(".bin")) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 7e0399e2471b..c13c7d12e342 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -4,17 +4,16 @@ import torch import torch.nn as nn +from transformers.modeling_utils import ModuleUtilsMixin from transformers.models.t5.modeling_t5 import ( T5Attention, T5Block, T5Config, + T5DenseActDense, + T5DenseGatedActDense, T5LayerCrossAttention, T5LayerNorm, - T5DenseGatedActDense, - T5DenseActDense, ) -from transformers.modeling_utils import ModuleUtilsMixin - from ...configuration_utils import ConfigMixin, register_to_config from ...modeling_utils import ModelMixin diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 9846927cb1ce..5d25154c651b 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -287,6 +287,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class SpectrogramDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class DDIMScheduler(metaclass=DummyObject): _backends = ["torch"] From f98beebfa9faa7a5a7555ad09d2be30e6257a4b8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 14:08:06 +0100 Subject: [PATCH 045/131] remove unsed variables --- scripts/convert_music_spectrogram_to_diffusers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 718006229880..1335255a882c 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -2,7 +2,6 @@ import argparse import os -import numpy as np import torch import torch.nn as nn @@ -145,7 +144,6 @@ def main(args): gin_config = inference.parse_training_gin_file(gin_file, gin_overrides) synth_model = inference.InferenceModel(args.checkpoint_path, gin_config) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") model = ContinuousContextTransformer( From 37735c0bd30b5a614b3c6262c00c85e22ec5cc6a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 15:11:10 +0100 Subject: [PATCH 046/131] remove predict_epsilon --- scripts/convert_music_spectrogram_to_diffusers.py | 1 - .../pipeline_spectrogram_diffusion.py | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 1335255a882c..76c268e721fc 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -124,7 +124,6 @@ def load_decoder(weights, model): def load_checkpoint(t5_checkpoint, 
model): model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) - model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) return model diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c13c7d12e342..1cc0220980fa 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -253,7 +253,7 @@ def __init__( lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.layer_norm = T5LayerNorm(d_model) self.dropout_post = nn.Dropout(p=dropout_rate) def forward(self, encoder_input_tokens, encoder_inputs_mask): @@ -315,7 +315,7 @@ def __init__( lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.layer_norm = T5LayerNorm(d_model) self.dropout_post = nn.Dropout(p=dropout_rate) def forward(self, encoder_inputs, encoder_inputs_mask): @@ -601,8 +601,6 @@ def __call__( generator: Optional[torch.Generator] = None, num_inference_steps: int = 1000, return_dict: bool = True, - predict_epsilon: bool = True, - **kwargs, ): target_shape = encoder_continuous_inputs.shape encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) @@ -628,7 +626,7 @@ def __call__( ) # 2. compute previous output: x_t -> x_t-1 - x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample + x = self.scheduler.step(output, t, x, generator=generator).prev_sample mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) mel = mel.cpu().numpy() From f9217a7d3ea71c51443efe06c894a722c09762af Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 4 Nov 2022 14:58:52 +0100 Subject: [PATCH 047/131] Move accelerate to a soft-dependency (#1134) * finish * finish * Update src/diffusers/modeling_utils.py * Update src/diffusers/pipeline_utils.py Co-authored-by: Anton Lozhkov * more fixes * fix Co-authored-by: Anton Lozhkov --- .../convert_music_spectrogram_to_diffusers.py | 51 ++++-- .../spectrogram_diffusion/__init__.py | 7 +- .../pipeline_spectrogram_diffusion.py | 151 +++++------------- 3 files changed, 78 insertions(+), 131 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 76c268e721fc..2d011662dc74 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -6,7 +6,7 @@ import torch.nn as nn from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline -from diffusers.pipelines.spectrogram_diffusion import ContinuousContextTransformer +from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from music_spectrogram_diffusion import inference from t5x import checkpoints @@ -14,7 +14,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model): +def load_notes_encoder(weights, model): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -122,13 
+122,6 @@ def load_decoder(weights, model): return model -def load_checkpoint(t5_checkpoint, model): - model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) - model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) - model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) - return model - - def main(args): t5_checkpoint = checkpoints.load_t5x_checkpoint(args.checkpoint_path) @@ -145,26 +138,50 @@ def main(args): scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") - model = ContinuousContextTransformer( - vocab_size=synth_model.model.module.config.vocab_size, + notes_encoder = SpectrogramNotesEncoder( max_length=synth_model.sequence_length["inputs"], + vocab_size=synth_model.model.module.config.vocab_size, + d_model=synth_model.model.module.config.emb_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + num_layers=synth_model.model.module.config.num_encoder_layers, + num_heads=synth_model.model.module.config.num_heads, + d_kv=synth_model.model.module.config.head_dim, + d_ff=synth_model.model.module.config.mlp_dim, + feed_forward_proj="gated-gelu", + ) + + continuous_encoder = SpectrogramContEncoder( input_dims=synth_model.audio_codec.n_dims, targets_context_length=synth_model.sequence_length["targets_context"], - targets_length=synth_model.sequence_length["targets"], d_model=synth_model.model.module.config.emb_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + num_layers=synth_model.model.module.config.num_encoder_layers, num_heads=synth_model.model.module.config.num_heads, - num_encoder_layers=synth_model.model.module.config.num_encoder_layers, - num_decoder_layers=synth_model.model.module.config.num_decoder_layers, d_kv=synth_model.model.module.config.head_dim, d_ff=synth_model.model.module.config.mlp_dim, - dropout_rate=synth_model.model.module.config.dropout_rate, feed_forward_proj="gated-gelu", + ) + + decoder = T5FilmDecoder( + input_dims=synth_model.audio_codec.n_dims, + targets_length=synth_model.sequence_length["targets_context"], max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, + d_model=synth_model.model.module.config.emb_dim, + num_layers=synth_model.model.module.config.num_decoder_layers, + num_heads=synth_model.model.module.config.num_heads, + d_kv=synth_model.model.module.config.head_dim, + d_ff=synth_model.model.module.config.mlp_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + feed_forward_proj="gated-gelu", ) - model = load_checkpoint(t5_checkpoint["target"], model).eval() + notes_encoder = load_notes_encoder(t5_checkpoint["target"]["token_encoder"], notes_encoder) + continuous_encoder = load_continuous_encoder(t5_checkpoint["target"]["continuous_encoder"], continuous_encoder) + decoder = load_decoder(t5_checkpoint["target"]["decoder"], decoder) - pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) + pipe = SpectrogramDiffusionPipeline( + notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler + ) if args.save: pipe.save_pretrained(args.output_path) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 850f9f7fba6d..625185f58935 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,7 @@ # flake8: noqa -from 
.pipeline_spectrogram_diffusion import ContinuousContextTransformer, SpectrogramDiffusionPipeline +from .pipeline_spectrogram_diffusion import ( + SpectrogramNotesEncoder, + SpectrogramContEncoder, + T5FilmDecoder, + SpectrogramDiffusionPipeline, +) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 1cc0220980fa..fe6b87f6e80a 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -212,7 +212,7 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class TokenEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): +class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -276,7 +276,7 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class ContinuousEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): +class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -342,7 +342,7 @@ def forward(self, encoder_inputs, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class Decoder(ModelMixin, ConfigMixin): +class T5FilmDecoder(ModelMixin, ConfigMixin): @register_to_config def __init__( self, @@ -453,74 +453,51 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) return spec_out -class ContinuousContextTransformer(ModelMixin, ConfigMixin): - @register_to_config +class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__( self, - input_dims: int, - max_length: int, - targets_context_length: int, - targets_length: int, - max_decoder_noise_time: float, - vocab_size: int, - d_model: int, - dropout_rate: float, - num_encoder_layers: int, - num_decoder_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str = "gated-gelu", - ): + notes_encoder: SpectrogramNotesEncoder, + continuous_encoder: SpectrogramContEncoder, + decoder: T5FilmDecoder, + scheduler: DDPMScheduler, + ) -> None: super().__init__() - self.token_encoder = TokenEncoder( - max_length=max_length, - vocab_size=vocab_size, - d_model=d_model, - dropout_rate=dropout_rate, - num_layers=num_encoder_layers, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - feed_forward_proj=feed_forward_proj, - ) + # From MELGAN + self.min_value = math.log(1e-5) # Matches MelGAN training. 
+ self.max_value = 4.0 # Largest value for most examples - self.continuous_encoder = ContinuousEncoder( - input_dims=input_dims, - targets_context_length=targets_context_length, - d_model=d_model, - dropout_rate=dropout_rate, - num_layers=num_encoder_layers, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - feed_forward_proj=feed_forward_proj, + self.register_modules( + notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler ) - self.decoder = Decoder( - input_dims=input_dims, - targets_length=targets_length, - max_decoder_noise_time=max_decoder_noise_time, - d_model=d_model, - num_layers=num_decoder_layers, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - dropout_rate=dropout_rate, - feed_forward_proj=feed_forward_proj, - ) + def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): + """Linearly scale features to network outputs range.""" + min_out, max_out = output_range + if clip: + features = torch.clip(features, self.min_value, self.max_value) + # Scale to [0, 1]. + zero_one = (features - self.min_value) / (self.max_value - self.min_value) + # Scale to [min_out, max_out]. + return zero_one * (max_out - min_out) + min_out + + def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): + """Invert by linearly scaling network outputs to features range.""" + min_out, max_out = input_range + outputs = torch.clip(outputs, min_out, max_out) if clip else outputs + # Scale to [0, 1]. + zero_one = (outputs - min_out) / (max_out - min_out) + # Scale to [self.min_value, self.max_value]. + return zero_one * (self.max_value - self.min_value) + self.min_value def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 - tokens_encoded, tokens_mask = self.token_encoder( - encoder_input_tokens=input_tokens, - encoder_inputs_mask=tokens_mask, + tokens_encoded, tokens_mask = self.notes_encoder( + encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask ) continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs, - encoder_inputs_mask=continuous_mask, + encoder_inputs=continuous_inputs, encoder_inputs_mask=continuous_mask ) return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] @@ -536,62 +513,10 @@ def decode(self, encodings_and_masks, input_tokens, noise_time): timesteps = timesteps * torch.ones(input_tokens.shape[0], dtype=timesteps.dtype, device=timesteps.device) logits = self.decoder( - encodings_and_masks=encodings_and_masks, - decoder_input_tokens=input_tokens, - decoder_noise_time=timesteps, + encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, decoder_noise_time=timesteps ) return logits - def forward( - self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, - decoder_input_tokens, - decoder_noise_time, - ): - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, - ) - - return self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, - noise_time=decoder_noise_time, - ) - - -class SpectrogramDiffusionPipeline(DiffusionPipeline): - def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: - super().__init__() - - # From MELGAN - self.min_value = math.log(1e-5) # Matches MelGAN training. 
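# A standalone sketch of the linear feature scaling used above (the bounds are the
# MelGAN log-magnitude values quoted in this __init__); it is illustrative only and
# simply demonstrates that scale_features and scale_to_features invert each other
# for in-range inputs.
import math
import torch

min_value, max_value = math.log(1e-5), 4.0

def scale_features(features, output_range=(-1.0, 1.0), clip=False):
    min_out, max_out = output_range
    if clip:
        features = torch.clip(features, min_value, max_value)
    zero_one = (features - min_value) / (max_value - min_value)
    return zero_one * (max_out - min_out) + min_out

def scale_to_features(outputs, input_range=(-1.0, 1.0)):
    min_out, max_out = input_range
    zero_one = (outputs - min_out) / (max_out - min_out)
    return zero_one * (max_value - min_value) + min_value

mel = torch.rand(1, 256, 128) * (max_value - min_value) + min_value
assert torch.allclose(scale_to_features(scale_features(mel)), mel, atol=1e-4)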
- self.max_value = 4.0 # Largest value for most examples - - self.register_modules(cont_context_trans=cont_context_trans, scheduler=scheduler) - - def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): - """Linearly scale features to network outputs range.""" - min_out, max_out = output_range - if clip: - features = torch.clip(features, self.min_value, self.max_value) - # Scale to [0, 1]. - zero_one = (features - self.min_value) / (self.max_value - self.min_value) - # Scale to [min_out, max_out]. - return zero_one * (max_out - min_out) + min_out - - def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): - """Invert by linearly scaling network outputs to features range.""" - min_out, max_out = input_range - outputs = torch.clip(outputs, min_out, max_out) if clip else outputs - # Scale to [0, 1]. - zero_one = (outputs - min_out) / (max_out - min_out) - # Scale to [self.min_value, self.max_value]. - return zero_one * (self.max_value - self.min_value) + self.min_value - @torch.no_grad() def __call__( self, @@ -605,7 +530,7 @@ def __call__( target_shape = encoder_continuous_inputs.shape encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) - encodings_and_masks = self.cont_context_trans.encode( + encodings_and_masks = self.encode( input_tokens=encoder_input_tokens, continuous_inputs=encoder_continuous_inputs, continuous_mask=encoder_continuous_mask, @@ -619,7 +544,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) for t in self.progress_bar(self.scheduler.timesteps): - output = self.cont_context_trans.decode( + output = self.decode( encodings_and_masks=encodings_and_masks, input_tokens=x, noise_time=t / num_inference_steps, # rescale to [0, 1) From ff51d45f5bcb87756b98b0f5b98953b0f26edc83 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 1 Dec 2022 19:21:53 +0100 Subject: [PATCH 048/131] fix order --- src/diffusers/pipelines/spectrogram_diffusion/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 625185f58935..df245e763cce 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,7 +1,7 @@ # flake8: noqa from .pipeline_spectrogram_diffusion import ( - SpectrogramNotesEncoder, SpectrogramContEncoder, - T5FilmDecoder, SpectrogramDiffusionPipeline, + SpectrogramNotesEncoder, + T5FilmDecoder, ) From 4a215ddb8ba392940be87bd771f17f4105833023 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Dec 2022 11:37:21 +0100 Subject: [PATCH 049/131] added initial midi to note token data pipeline --- .../pipelines/spectrogram_diffusion/data.py | 439 ++++++++++++++++++ 1 file changed, 439 insertions(+) create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/data.py diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py new file mode 100644 index 000000000000..5a74a20e3c46 --- /dev/null +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -0,0 +1,439 @@ +from typing import Sequence, Tuple, Optional, MutableMapping, List, Callable, Mapping, Any +import dataclasses +import math +from immutabledict import immutabledict + +import numpy as np +import note_seq +import torch +import torch.nn.functional as F + + +SAMPLE_RATE = 16000 +HOP_SIZE = 320 +FRAME_RATE = int(SAMPLE_RATE 
// HOP_SIZE) + +DEFAULT_STEPS_PER_SECOND = 100 +DEFAULT_MAX_SHIFT_SECONDS = 10 +DEFAULT_NUM_VELOCITY_BINS = 1 + +SLAKH_CLASS_PROGRAMS = immutabledict( + { + "Acoustic Piano": 0, + "Electric Piano": 4, + "Chromatic Percussion": 8, + "Organ": 16, + "Acoustic Guitar": 24, + "Clean Electric Guitar": 26, + "Distorted Electric Guitar": 29, + "Acoustic Bass": 32, + "Electric Bass": 33, + "Violin": 40, + "Viola": 41, + "Cello": 42, + "Contrabass": 43, + "Orchestral Harp": 46, + "Timpani": 47, + "String Ensemble": 48, + "Synth Strings": 50, + "Choir and Voice": 52, + "Orchestral Hit": 55, + "Trumpet": 56, + "Trombone": 57, + "Tuba": 58, + "French Horn": 60, + "Brass Section": 61, + "Soprano/Alto Sax": 64, + "Tenor Sax": 66, + "Baritone Sax": 67, + "Oboe": 68, + "English Horn": 69, + "Bassoon": 70, + "Clarinet": 71, + "Pipe": 73, + "Synth Lead": 80, + "Synth Pad": 88, + } +) + + +@dataclasses.dataclass +class NoteRepresentationConfig: + """Configuration note representations.""" + + onsets_only: bool + include_ties: bool + + +@dataclasses.dataclass +class NoteEventData: + pitch: int + velocity: Optional[int] = None + program: Optional[int] = None + is_drum: Optional[bool] = None + instrument: Optional[int] = None + + +@dataclasses.dataclass +class NoteEncodingState: + """Encoding state for note transcription, keeping track of active pitches.""" + + # velocity bin for active pitches and programs + active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict) + + +@dataclasses.dataclass +class EventRange: + type: str + min_value: int + max_value: int + + +@dataclasses.dataclass +class Event: + type: str + value: int + + +class Codec: + """Encode and decode events. + + Useful for declaring what certain ranges of a vocabulary should be used for. + This is intended to be used from Python before encoding or after decoding with + GenericTokenVocabulary. This class is more lightweight and does not include + things like EOS or UNK token handling. + + To ensure that 'shift' events are always the first block of the vocab and + start at 0, that event type is required and specified separately. + """ + + def __init__(self, max_shift_steps: int, steps_per_second: float, event_ranges: List[EventRange]): + """Define Codec. + + Args: + max_shift_steps: Maximum number of shift steps that can be encoded. + steps_per_second: Shift steps will be interpreted as having a duration of + 1 / steps_per_second. + event_ranges: Other supported event types and their ranges. + """ + self.steps_per_second = steps_per_second + self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps) + self._event_ranges = [self._shift_range] + event_ranges + # Ensure all event types have unique names. + assert len(self._event_ranges) == len(set([er.type for er in self._event_ranges])) + + @property + def num_classes(self) -> int: + return sum(er.max_value - er.min_value + 1 for er in self._event_ranges) + + # The next couple methods are simplified special case methods just for shift + # events that are intended to be used from within autograph functions. 
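# A minimal sketch of the vocabulary layout the Codec docstring above describes:
# events are packed into contiguous blocks, with single-step "shift" events always
# forming the first block so that shift indices start at 0. The event ranges below
# are illustrative placeholders, not the real MIDI bounds used by this pipeline.
import dataclasses

@dataclasses.dataclass
class ToyEventRange:
    type: str
    min_value: int
    max_value: int

ranges = [
    ToyEventRange("shift", 0, 1000),
    ToyEventRange("pitch", 21, 108),
    ToyEventRange("velocity", 0, 1),
]

def encode_event(event_type, value):
    offset = 0
    for er in ranges:
        if er.type == event_type:
            return offset + value - er.min_value
        offset += er.max_value - er.min_value + 1
    raise ValueError(f"unknown event type: {event_type}")

num_classes = sum(er.max_value - er.min_value + 1 for er in ranges)
print(num_classes)                # 1091
print(encode_event("shift", 5))   # 5, inside the leading shift block
print(encode_event("pitch", 21))  # 1001, the first index after the shift block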
+ + def is_shift_event_index(self, index: int) -> bool: + return (self._shift_range.min_value <= index) and (index <= self._shift_range.max_value) + + @property + def max_shift_steps(self) -> int: + return self._shift_range.max_value + + def encode_event(self, event: Event) -> int: + """Encode an event to an index.""" + offset = 0 + for er in self._event_ranges: + if event.type == er.type: + if not er.min_value <= event.value <= er.max_value: + raise ValueError( + f"Event value {event.value} is not within valid range " + f"[{er.min_value}, {er.max_value}] for type {event.type}" + ) + return offset + event.value - er.min_value + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event type: {event.type}") + + def event_type_range(self, event_type: str) -> Tuple[int, int]: + """Return [min_id, max_id] for an event type.""" + offset = 0 + for er in self._event_ranges: + if event_type == er.type: + return offset, offset + (er.max_value - er.min_value) + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event type: {event_type}") + + def decode_event_index(self, index: int) -> Event: + """Decode an event index to an Event.""" + offset = 0 + for er in self._event_ranges: + if offset <= index <= offset + er.max_value - er.min_value: + return Event(type=er.type, value=er.min_value + index - offset) + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event index: {index}") + + +def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): + """ + equivalent of tf.signal.frame + """ + signal_length = signal.shape[axis] + if pad_end: + frames_overlap = frame_length - frame_step + rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) + pad_size = int(frame_length - rest_samples) + + if pad_size != 0: + pad_axis = [0] * signal.ndim + pad_axis[axis] = pad_size + signal = F.pad(signal, pad_axis, "constant", pad_value) + frames = signal.unfold(axis, frame_length, frame_step) + return frames + + +def program_to_slakh_program(program): + # this is done very hackily, probably should use a custom mapping + for slakh_program in sorted(SLAKH_CLASS_PROGRAMS.values(), reverse=True): + if program >= slakh_program: + return slakh_program + + +def audio_to_frames( + samples, + hop_size: int, + frame_rate: int, +) -> Tuple[Sequence[Sequence[int]], torch.Tensor]: + """Convert audio samples to non-overlapping frames and frame times.""" + frame_size = hop_size + samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant") + + # Split audio into frames. + frames = frame( + torch.Tensor(samples).unsqueeze(0), + frame_length=frame_size, + frame_step=frame_size, + pad_end=False, # TODO check why its off by 1 here when True + ) + + num_frames = len(samples) // frame_size + + times = np.arange(num_frames) / frame_rate + return frames, times + + +def note_sequence_to_onsets_and_offsets_and_programs( + ns: note_seq.NoteSequence, +) -> Tuple[Sequence[float], Sequence[NoteEventData]]: + """Extract onset & offset times and pitches & programs from a NoteSequence. + + The onset & offset times will not necessarily be in sorted order. + + Args: + ns: NoteSequence from which to extract onsets and offsets. + + Returns: + times: A list of note onset and offset times. + values: A list of NoteEventData objects where velocity is zero for note + offsets. + """ + # Sort by program and pitch and put offsets before onsets as a tiebreaker for + # subsequent stable sort. 
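# The frame()/audio_to_frames pair defined above splits audio into non-overlapping
# HOP_SIZE frames and assigns each frame a start time; a small self-contained sketch
# of that unfold-based framing on random samples (constants as at the top of this file).
import numpy as np
import torch

SAMPLE_RATE, HOP_SIZE = 16000, 320
FRAME_RATE = SAMPLE_RATE // HOP_SIZE               # 50 frames per second

samples = np.random.randn(3 * SAMPLE_RATE + 17)    # ~3 s of audio, not a multiple of HOP_SIZE
samples = np.pad(samples, [0, HOP_SIZE - len(samples) % HOP_SIZE], mode="constant")

frames = torch.Tensor(samples).unsqueeze(0).unfold(1, HOP_SIZE, HOP_SIZE)
times = np.arange(frames.shape[1]) / FRAME_RATE    # frame start times in seconds

print(frames.shape)   # torch.Size([1, 151, 320])
print(times[:3])      # [0.   0.02 0.04]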
+ notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) + times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] + values = [ + NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) + for note in notes + if not note.is_drum + ] + [ + NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) + for note in notes + ] + return times, values + + +def note_sequence_to_onsets_and_offsets_and_programs( + ns: note_seq.NoteSequence, +) -> Tuple[Sequence[float], Sequence[NoteEventData]]: + """Extract onset & offset times and pitches & programs from a NoteSequence. + + The onset & offset times will not necessarily be in sorted order. + + Args: + ns: NoteSequence from which to extract onsets and offsets. + + Returns: + times: A list of note onset and offset times. + values: A list of NoteEventData objects where velocity is zero for note + offsets. + """ + # Sort by program and pitch and put offsets before onsets as a tiebreaker for + # subsequent stable sort. + notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) + times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] + values = [ + NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) + for note in notes + if not note.is_drum + ] + [ + NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) + for note in notes + ] + return times, values + + +def num_velocity_bins_from_codec(codec: Codec): + """Get number of velocity bins from event codec.""" + lo, hi = codec.event_type_range("velocity") + return hi - lo + + +def velocity_to_bin(velocity, num_velocity_bins): + if velocity == 0: + return 0 + else: + return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY) + + +def note_event_data_to_events( + state: Optional[NoteEncodingState], + value: NoteEventData, + codec: Codec, +) -> Sequence[Event]: + """Convert note event data to a sequence of events.""" + if value.velocity is None: + # onsets only, no program or velocity + return [Event("pitch", value.pitch)] + else: + num_velocity_bins = num_velocity_bins_from_codec(codec) + velocity_bin = velocity_to_bin(value.velocity, num_velocity_bins) + if value.program is None: + # onsets + offsets + velocities only, no programs + if state is not None: + state.active_pitches[(value.pitch, 0)] = velocity_bin + return [Event("velocity", velocity_bin), Event("pitch", value.pitch)] + else: + if value.is_drum: + # drum events use a separate vocabulary + return [Event("velocity", velocity_bin), Event("drum", value.pitch)] + else: + # program + velocity + pitch + if state is not None: + state.active_pitches[(value.pitch, value.program)] = velocity_bin + return [ + Event("program", value.program), + Event("velocity", velocity_bin), + Event("pitch", value.pitch), + ] + + +def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: + """Output program and pitch events for active notes plus a final tie event.""" + events = [] + for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]): + if state.active_pitches[(pitch, program)]: + events += [Event("program", program), Event("pitch", pitch)] + events.append(Event("tie", 0)) + return events + + +def encode_and_index_events( + state, event_times, event_values, codec, frame_times, encode_event_fn, encoding_state_to_events_fn=None +): + 
"""Encode a sequence of timed events and index to audio frame times. + + Encodes time shifts as repeated single step shifts for later run length + encoding. + + Optionally, also encodes a sequence of "state events", keeping track of the + current encoding state at each audio frame. This can be used e.g. to prepend + events representing the current state to a targets segment. + + Args: + state: Initial event encoding state. + event_times: Sequence of event times. + event_values: Sequence of event values. + encode_event_fn: Function that transforms event value into a sequence of one + or more Event objects. + codec: An Codec object that maps Event objects to indices. + frame_times: Time for every audio frame. + encoding_state_to_events_fn: Function that transforms encoding state into a + sequence of one or more Event objects. + + Returns: + events: Encoded events and shifts. + event_start_indices: Corresponding start event index for every audio frame. + Note: one event can correspond to multiple audio indices due to sampling + rate differences. This makes splitting sequences tricky because the same + event can appear at the end of one sequence and the beginning of + another. + event_end_indices: Corresponding end event index for every audio frame. Used + to ensure when slicing that one chunk ends where the next begins. Should + always be true that event_end_indices[i] = event_start_indices[i + 1]. + state_events: Encoded "state" events representing the encoding state before + each event. + state_event_indices: Corresponding state event index for every audio frame. + """ + indices = np.argsort(event_times, kind="stable") + event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices] + event_values = [event_values[i] for i in indices] + + events = [] + state_events = [] + event_start_indices = [] + state_event_indices = [] + + cur_step = 0 + cur_event_idx = 0 + cur_state_event_idx = 0 + + def fill_event_start_indices_to_cur_step(): + while ( + len(event_start_indices) < len(frame_times) + and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second + ): + event_start_indices.append(cur_event_idx) + state_event_indices.append(cur_state_event_idx) + + for event_step, event_value in zip(event_steps, event_values): + while event_step > cur_step: + events.append(codec.encode_event(Event(type="shift", value=1))) + cur_step += 1 + fill_event_start_indices_to_cur_step() + cur_event_idx = len(events) + cur_state_event_idx = len(state_events) + if encoding_state_to_events_fn: + # Dump state to state events *before* processing the next event, because + # we want to capture the state prior to the occurrence of the event. + for e in encoding_state_to_events_fn(state): + state_events.append(codec.encode_event(e)) + + for e in encode_event_fn(state, event_value, codec): + events.append(codec.encode_event(e)) + + # After the last event, continue filling out the event_start_indices array. + # The inequality is not strict because if our current step lines up exactly + # with (the start of) an audio frame, we need to add an additional shift event + # to "cover" that frame. + while cur_step / codec.steps_per_second <= frame_times[-1]: + events.append(codec.encode_event(Event(type="shift", value=1))) + cur_step += 1 + fill_event_start_indices_to_cur_step() + cur_event_idx = len(events) + + # Now fill in event_end_indices. We need this extra array to make sure that + # when we slice events, each slice ends exactly where the subsequent slice + # begins. 
+ event_end_indices = event_start_indices[1:] + [len(events)] + + events = np.array(events) + state_events = np.array(state_events) + event_start_indices = np.array(event_start_indices) + event_end_indices = np.array(event_end_indices) + state_event_indices = np.array(state_event_indices) + + return { + "inputs": events.astype(np.int32), + "event_start_indices": event_start_indices.astype(np.int32), + "event_end_indices": event_end_indices.astype(np.int32), + "state_events": state_events.astype(np.int32), + "state_event_indices": state_event_indices.astype(np.int32), + } From d8544cb502d5298c4a531d9e0ed1fbec543da180 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Dec 2022 11:45:23 +0100 Subject: [PATCH 050/131] added int to int tokenizer --- .../pipelines/spectrogram_diffusion/data.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py index 5a74a20e3c46..8b20ed117f17 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -95,6 +95,24 @@ class Event: value: int +class Tokenizer: + def __init__(self, regular_ids: int): + # The special tokens: 0=PAD, 1=EOS, and 2=UNK + self._num_special_tokens = 3 + self._num_regular_tokens = regular_ids + + def encode(self, token_ids): + encoded = [] + for token_id in token_ids: + if not 0 <= token_id < self._num_regular_tokens: + raise ValueError( + f"token_id {token_id} does not fall within valid range of " f"[0, {self._num_regular_tokens})" + ) + encoded.append(token_id + self._num_special_tokens) + + return encoded + + class Codec: """Encode and decode events. From 5f628432107a9d31feb5566a367bcb665dee3f44 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Dec 2022 11:46:58 +0100 Subject: [PATCH 051/131] remove duplicate --- .../pipelines/spectrogram_diffusion/data.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py index 8b20ed117f17..47ea7980898d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -268,36 +268,6 @@ def note_sequence_to_onsets_and_offsets_and_programs( return times, values -def note_sequence_to_onsets_and_offsets_and_programs( - ns: note_seq.NoteSequence, -) -> Tuple[Sequence[float], Sequence[NoteEventData]]: - """Extract onset & offset times and pitches & programs from a NoteSequence. - - The onset & offset times will not necessarily be in sorted order. - - Args: - ns: NoteSequence from which to extract onsets and offsets. - - Returns: - times: A list of note onset and offset times. - values: A list of NoteEventData objects where velocity is zero for note - offsets. - """ - # Sort by program and pitch and put offsets before onsets as a tiebreaker for - # subsequent stable sort. 
- notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) - times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] - values = [ - NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) - for note in notes - if not note.is_drum - ] + [ - NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) - for note in notes - ] - return times, values - - def num_velocity_bins_from_codec(codec: Codec): """Get number of velocity bins from event codec.""" lo, hi = codec.event_type_range("velocity") From 505e78a310ef96057cf0e7d9cc2348e08f148e26 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 10:18:46 +0100 Subject: [PATCH 052/131] added logic for segments --- .../pipelines/spectrogram_diffusion/data.py | 46 +++++++++++++------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py index 47ea7980898d..8b5fd1afdfdc 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -9,6 +9,9 @@ import torch.nn.functional as F +INPUT_FEATURE_LENGTHS = 2048 +TARGET_FEATURE_LENGTHS = 256 + SAMPLE_RATE = 16000 HOP_SIZE = 320 FRAME_RATE = int(SAMPLE_RATE // HOP_SIZE) @@ -110,6 +113,12 @@ def encode(self, token_ids): ) encoded.append(token_id + self._num_special_tokens) + # Add EOS token + encoded.append(1) + + # Pad to till INPUT_FEATURE_LENGTHS + encoded = encoded + [0] * (INPUT_FEATURE_LENGTHS - len(encoded)) + return encoded @@ -274,6 +283,11 @@ def num_velocity_bins_from_codec(codec: Codec): return hi - lo +# segment an array into segments of length n +def segment(a, n): + return [a[i : i + n] for i in range(0, len(a), n)] + + def velocity_to_bin(velocity, num_velocity_bins): if velocity == 0: return 0 @@ -412,16 +426,22 @@ def fill_event_start_indices_to_cur_step(): # begins. 
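# A small sketch of the two preprocessing details introduced here: the tokenizer
# reserves ids 0/1/2 for PAD/EOS/UNK and shifts every regular id up by three,
# appending EOS and right-padding to a fixed input length, while segment() chops
# the per-frame index arrays into fixed-size chunks. Lengths are shrunk for
# readability (the file above uses 2048 and 256).
import numpy as np

PAD, EOS, NUM_SPECIAL_TOKENS = 0, 1, 3
INPUT_LENGTH = 16

def encode(token_ids):
    encoded = [token_id + NUM_SPECIAL_TOKENS for token_id in token_ids]
    encoded.append(EOS)
    return encoded + [PAD] * (INPUT_LENGTH - len(encoded))

print(encode([4, 7, 0]))   # [7, 10, 3, 1, 0, 0, ...]: shifted ids, EOS, then padding

def segment(a, n):
    return [a[i : i + n] for i in range(0, len(a), n)]

print([len(s) for s in segment(np.arange(10), 4)])   # [4, 4, 2]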
event_end_indices = event_start_indices[1:] + [len(events)] - events = np.array(events) - state_events = np.array(state_events) - event_start_indices = np.array(event_start_indices) - event_end_indices = np.array(event_end_indices) - state_event_indices = np.array(state_event_indices) - - return { - "inputs": events.astype(np.int32), - "event_start_indices": event_start_indices.astype(np.int32), - "event_end_indices": event_end_indices.astype(np.int32), - "state_events": state_events.astype(np.int32), - "state_event_indices": state_event_indices.astype(np.int32), - } + events = np.array(events).astype(np.int32) + state_events = np.array(state_events).astype(np.int32) + event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + + outputs = [] + for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): + outputs.append( + { + "inputs": events, + "event_start_indices": start_indices, + "event_end_indices": end_indices, + "state_events": state_events, + "state_event_indices": event_indices, + } + ) + + return outputs From 52f7896a1ee5c9a5208b1aca24a8fd3b1e3e4e3b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 10:49:12 +0100 Subject: [PATCH 053/131] add melgan to pipeline --- scripts/convert_music_spectrogram_to_diffusers.py | 10 ++++++++-- .../pipeline_spectrogram_diffusion.py | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 2d011662dc74..d9ef3340b2e0 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline +from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline, OnnxRuntimeModel from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from music_spectrogram_diffusion import inference from t5x import checkpoints @@ -179,8 +179,14 @@ def main(args): continuous_encoder = load_continuous_encoder(t5_checkpoint["target"]["continuous_encoder"], continuous_encoder) decoder = load_decoder(t5_checkpoint["target"]["decoder"], decoder) + melgan = OnnxRuntimeModel.from_pretrained("kashif/soundstream_mel_decoder") + pipe = SpectrogramDiffusionPipeline( - notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler + notes_encoder=notes_encoder, + continuous_encoder=continuous_encoder, + decoder=decoder, + scheduler=scheduler, + melgan=melgan, ) if args.save: pipe.save_pretrained(args.output_path) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index fe6b87f6e80a..20ff8cc5851f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -18,6 +18,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding +from 
...onnx_utils import OnnxRuntimeModel from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler @@ -460,6 +461,7 @@ def __init__( continuous_encoder: SpectrogramContEncoder, decoder: T5FilmDecoder, scheduler: DDPMScheduler, + melgan: OnnxRuntimeModel, ) -> None: super().__init__() @@ -468,7 +470,11 @@ def __init__( self.max_value = 4.0 # Largest value for most examples self.register_modules( - notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler + notes_encoder=notes_encoder, + continuous_encoder=continuous_encoder, + decoder=decoder, + scheduler=scheduler, + melgan=melgan, ) def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): From 1e267765ae93e3d680d1956546c1299f812d9a03 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 12:45:16 +0100 Subject: [PATCH 054/131] move autoregressive gen into pipeline --- src/diffusers/pipeline_utils.py | 14 -- .../{data.py => midi_utils.py} | 14 +- .../pipeline_spectrogram_diffusion.py | 140 ++++++++++++++---- 3 files changed, 119 insertions(+), 49 deletions(-) rename src/diffusers/pipelines/spectrogram_diffusion/{data.py => midi_utils.py} (98%) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 57627c80df41..01bcc6a33803 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -118,20 +118,6 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray -@dataclass -class MelPipelineOutput(BaseOutput): - """ - Output class for Mel pipelines. - - Args: - mels (`np.ndarray`) - List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel - samples of the diffusion pipeline. - """ - - mels: np.ndarray - - def is_safetensors_compatible(info) -> bool: filenames = set(sibling.rfilename for sibling in info.siblings) pt_filenames = set(filename for filename in filenames if filename.endswith(".bin")) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py similarity index 98% rename from src/diffusers/pipelines/spectrogram_diffusion/data.py rename to src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index 8b5fd1afdfdc..d94b7e1e8777 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -9,8 +9,8 @@ import torch.nn.functional as F -INPUT_FEATURE_LENGTHS = 2048 -TARGET_FEATURE_LENGTHS = 256 +INPUT_FEATURE_LENGTH = 2048 +TARGET_FEATURE_LENGTH = 256 SAMPLE_RATE = 16000 HOP_SIZE = 320 @@ -116,8 +116,8 @@ def encode(self, token_ids): # Add EOS token encoded.append(1) - # Pad to till INPUT_FEATURE_LENGTHS - encoded = encoded + [0] * (INPUT_FEATURE_LENGTHS - len(encoded)) + # Pad to till INPUT_FEATURE_LENGTH + encoded = encoded + [0] * (INPUT_FEATURE_LENGTH - len(encoded)) return encoded @@ -428,9 +428,9 @@ def fill_event_start_indices_to_cur_step(): events = np.array(events).astype(np.int32) state_events = np.array(state_events).astype(np.int32) - event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) - event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) - state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + event_end_indices = 
segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) outputs = [] for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 20ff8cc5851f..88d8c2fc3f26 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,9 +1,11 @@ import math from typing import Optional +import numpy as np import torch import torch.nn as nn +import note_seq from transformers.modeling_utils import ModuleUtilsMixin from transformers.models.t5.modeling_t5 import ( T5Attention, @@ -19,9 +21,31 @@ from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding from ...onnx_utils import OnnxRuntimeModel -from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput +from ...pipeline_utils import DiffusionPipeline, AudioPipelineOutput from ...schedulers import DDPMScheduler +from .midi_utils import ( + program_to_slakh_program, + audio_to_frames, + SAMPLE_RATE, + HOP_SIZE, + FRAME_RATE, + DEFAULT_MAX_SHIFT_SECONDS, + DEFAULT_STEPS_PER_SECOND, + DEFAULT_NUM_VELOCITY_BINS, + TARGET_FEATURE_LENGTH, + note_sequence_to_onsets_and_offsets_and_programs, + Codec, + EventRange, + encode_and_index_events, + NoteEncodingState, + note_event_data_to_events, + note_encoding_state_to_events, + NoteRepresentationConfig, + note_representation_processor_chain, + Tokenizer, +) + class FiLMLayer(nn.Module): def __init__(self, in_features, out_features): @@ -468,6 +492,7 @@ def __init__( # From MELGAN self.min_value = math.log(1e-5) # Matches MelGAN training. 
self.max_value = 4.0 # Largest value for most examples + self.n_dims = 128 self.register_modules( notes_encoder=notes_encoder, @@ -526,43 +551,102 @@ def decode(self, encodings_and_masks, input_tokens, noise_time): @torch.no_grad() def __call__( self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, + midi_file, generator: Optional[torch.Generator] = None, num_inference_steps: int = 1000, return_dict: bool = True, ): - target_shape = encoder_continuous_inputs.shape - encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) - - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, + ns = note_seq.midi_file_to_note_sequence(midi_file) + ns_sus = note_seq.apply_sustain_control_changes(ns) + + for note in ns_sus.notes: + if not note.is_drum: + note.program = program_to_slakh_program(note.program) + + samples = np.zeros(int(ns_sus.total_time * SAMPLE_RATE)) + + _, frame_times = audio_to_frames(samples, HOP_SIZE, FRAME_RATE) + times, values = note_sequence_to_onsets_and_offsets_and_programs(ns_sus) + + codec = Codec( + max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND, + steps_per_second=DEFAULT_STEPS_PER_SECOND, + event_ranges=[ + EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS), + EventRange("tie", 0, 0), + EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM), + EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + ], + ) + tokenizer = Tokenizer(codec.num_classes) + + events = encode_and_index_events( + state=NoteEncodingState(), + event_times=times, + event_values=values, + frame_times=frame_times, + codec=codec, + encode_event_fn=note_event_data_to_events, + encoding_state_to_events_fn=note_encoding_state_to_events, ) - # Sample gaussian noise to begin loop - x = torch.randn(target_shape, generator=generator) - x = x.to(self.device) + note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True) + events = [note_representation_processor_chain(event, codec, note_representation_config) for event in events] + input_tokens = [tokenizer.encode(event["inputs"]) for event in events] - # set step values - self.scheduler.set_timesteps(num_inference_steps) + pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims]) + full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) - for t in self.progress_bar(self.scheduler.timesteps): - output = self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=x, - noise_time=t / num_inference_steps, # rescale to [0, 1) - ) + for i, encoder_input_tokens in enumerate(input_tokens): + encoder_continuous_inputs = pred_mel[:1] + if i == 0: + # The first chunk has no previous context. + encoder_continuous_mask = np.zeros((1, TARGET_FEATURE_LENGTH)) + else: + # The full song pipeline does not feed in a context feature, so the mask + # will be all 0s after the feature converter. Because we know we're + # feeding in a full context chunk from the previous prediction, set it + # to all 1s. 
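# The loop above generates the song segment by segment: each chunk of note tokens
# gets its own diffusion run, and the predicted mel chunk becomes the context for
# the next chunk (all-zeros mask for the very first chunk, all-ones afterwards).
# A schematic sketch of just that outer loop, with the diffusion step replaced by a
# placeholder denoise() stub and toy token segments.
import numpy as np

TARGET_FEATURE_LENGTH, N_DIMS = 256, 128

def denoise(tokens, context, context_mask):
    # Stand-in for the encode/decode + scheduler loop; returns a dummy mel chunk.
    return np.zeros((1, TARGET_FEATURE_LENGTH, N_DIMS), dtype=np.float32)

token_segments = [np.zeros((1, 2048), dtype=np.int32) for _ in range(3)]

pred_mel = np.zeros((1, TARGET_FEATURE_LENGTH, N_DIMS), dtype=np.float32)
full_pred_mel = np.zeros((1, 0, N_DIMS), dtype=np.float32)

for i, tokens in enumerate(token_segments):
    context_mask = np.full((1, TARGET_FEATURE_LENGTH), i > 0, dtype=bool)
    pred_mel = denoise(tokens, pred_mel[:1], context_mask)
    full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1)

print(full_pred_mel.shape)   # (1, 768, 128)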
+ encoder_continuous_mask = np.ones((1, TARGET_FEATURE_LENGTH)) + + target_shape = encoder_continuous_inputs.shape + encoder_continuous_inputs = self.scale_features( + encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True + ) + + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens.to(self.device), + continuous_inputs=encoder_continuous_inputs.to(self.device), + continuous_mask=encoder_continuous_mask.to(self.device), + ) + + # Sample gaussian noise to begin loop + x = torch.randn(target_shape, generator=generator) + x = x.to(self.device) + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + + # Denoising diffusion loop + for t in self.progress_bar(self.scheduler.timesteps): + output = self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=x, + noise_time=t / num_inference_steps, # rescale to [0, 1) + ) + + # Compute previous output: x_t -> x_t-1 + x = self.scheduler.step(output, t, x, generator=generator).prev_sample + + mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) + pred_mel = mel.cpu().numpy() - # 2. compute previous output: x_t -> x_t-1 - x = self.scheduler.step(output, t, x, generator=generator).prev_sample + full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) - mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) - mel = mel.cpu().numpy() + full_pred_audio = self.melgan(input_features=full_pred_mel.astype(np.float32)) if not return_dict: - return (mel,) + return (full_pred_audio,) - return MelPipelineOutput(mels=mel) + return AudioPipelineOutput(audios=full_pred_audio) From a643c8b2e6cafbc457626296781fcc474526d756 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 12:56:53 +0100 Subject: [PATCH 055/131] added note_representation_processor_chain --- .../convert_music_spectrogram_to_diffusers.py | 2 +- .../spectrogram_diffusion/midi_utils.py | 213 ++++++++++++++++-- .../pipeline_spectrogram_diffusion.py | 39 ++-- 3 files changed, 216 insertions(+), 38 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index d9ef3340b2e0..1090e5c31fc7 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline, OnnxRuntimeModel +from diffusers import DDPMScheduler, OnnxRuntimeModel, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from music_spectrogram_diffusion import inference from t5x import checkpoints diff --git a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index d94b7e1e8777..0a30bc807d2f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -1,13 +1,29 @@ -from typing import Sequence, Tuple, Optional, MutableMapping, List, Callable, Mapping, Any +# Copyright 2022 The Music Spectrogram Diffusion Authors. +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import dataclasses import math -from immutabledict import immutabledict +from typing import Any, Callable, List, Mapping, MutableMapping, Optional, Sequence, Tuple import numpy as np -import note_seq import torch import torch.nn.functional as F +import note_seq +from immutabledict import immutabledict + INPUT_FEATURE_LENGTH = 2048 TARGET_FEATURE_LENGTH = 256 @@ -109,7 +125,7 @@ def encode(self, token_ids): for token_id in token_ids: if not 0 <= token_id < self._num_regular_tokens: raise ValueError( - f"token_id {token_id} does not fall within valid range of " f"[0, {self._num_regular_tokens})" + f"token_id {token_id} does not fall within valid range of [0, {self._num_regular_tokens})" ) encoded.append(token_id + self._num_special_tokens) @@ -125,13 +141,12 @@ def encode(self, token_ids): class Codec: """Encode and decode events. - Useful for declaring what certain ranges of a vocabulary should be used for. - This is intended to be used from Python before encoding or after decoding with - GenericTokenVocabulary. This class is more lightweight and does not include - things like EOS or UNK token handling. + Useful for declaring what certain ranges of a vocabulary should be used for. This is intended to be used from + Python before encoding or after decoding with GenericTokenVocabulary. This class is more lightweight and does not + include things like EOS or UNK token handling. - To ensure that 'shift' events are always the first block of the vocab and - start at 0, that event type is required and specified separately. + To ensure that 'shift' events are always the first block of the vocab and start at 0, that event type is required + and specified separately. 
""" def __init__(self, max_shift_steps: int, steps_per_second: float, event_ranges: List[EventRange]): @@ -199,6 +214,39 @@ def decode_event_index(self, index: int) -> Event: raise ValueError(f"Unknown event index: {index}") +@dataclasses.dataclass +class ProgramGranularity: + # both tokens_map_fn and program_map_fn should be idempotent + tokens_map_fn: Callable[[Sequence[int], Codec], Sequence[int]] + program_map_fn: Callable[[int], int] + + +def drop_programs(tokens, codec: Codec): + """Drops program change events from a token sequence.""" + min_program_id, max_program_id = codec.event_type_range("program") + return tokens[(tokens < min_program_id) | (tokens > max_program_id)] + + +def programs_to_midi_classes(tokens, codec): + """Modifies program events to be the first program in the MIDI class.""" + min_program_id, max_program_id = codec.event_type_range("program") + is_program = (tokens >= min_program_id) & (tokens <= max_program_id) + return tf.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) + + +PROGRAM_GRANULARITIES = { + # "flat" granularity; drop program change tokens and set NoteSequence + # programs to zero + "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), + # map each program to the first program in its MIDI class + "midi_class": ProgramGranularity( + tokens_map_fn=programs_to_midi_classes, program_map_fn=lambda program: 8 * (program // 8) + ), + # leave programs as is + "full": ProgramGranularity(tokens_map_fn=lambda tokens, codec: tokens, program_map_fn=lambda program: program), +} + + def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): """ equivalent of tf.signal.frame @@ -258,8 +306,8 @@ def note_sequence_to_onsets_and_offsets_and_programs( ns: NoteSequence from which to extract onsets and offsets. Returns: - times: A list of note onset and offset times. - values: A list of NoteEventData objects where velocity is zero for note + times: A list of note onset and offset times. values: A list of NoteEventData objects where velocity is zero for + note offsets. """ # Sort by program and pitch and put offsets before onsets as a tiebreaker for @@ -342,12 +390,10 @@ def encode_and_index_events( ): """Encode a sequence of timed events and index to audio frame times. - Encodes time shifts as repeated single step shifts for later run length - encoding. + Encodes time shifts as repeated single step shifts for later run length encoding. - Optionally, also encodes a sequence of "state events", keeping track of the - current encoding state at each audio frame. This can be used e.g. to prepend - events representing the current state to a targets segment. + Optionally, also encodes a sequence of "state events", keeping track of the current encoding state at each audio + frame. This can be used e.g. to prepend events representing the current state to a targets segment. Args: state: Initial event encoding state. @@ -361,15 +407,13 @@ def encode_and_index_events( sequence of one or more Event objects. Returns: - events: Encoded events and shifts. - event_start_indices: Corresponding start event index for every audio frame. - Note: one event can correspond to multiple audio indices due to sampling - rate differences. This makes splitting sequences tricky because the same - event can appear at the end of one sequence and the beginning of + events: Encoded events and shifts. event_start_indices: Corresponding start event index for every audio frame. 
+ Note: one event can correspond to multiple audio indices due to sampling rate differences. This makes + splitting sequences tricky because the same event can appear at the end of one sequence and the beginning of another. event_end_indices: Corresponding end event index for every audio frame. Used - to ensure when slicing that one chunk ends where the next begins. Should - always be true that event_end_indices[i] = event_start_indices[i + 1]. + to ensure when slicing that one chunk ends where the next begins. Should always be true that + event_end_indices[i] = event_start_indices[i + 1]. state_events: Encoded "state" events representing the encoding state before each event. state_event_indices: Corresponding state event index for every audio frame. @@ -445,3 +489,124 @@ def fill_event_start_indices_to_cur_step(): ) return outputs + + +def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"): + """Extract target sequence corresponding to audio token segment.""" + features = features.copy() + start_idx = features["event_start_indices"][0] + end_idx = features["event_end_indices"][-1] + + features[feature_key] = features[feature_key][start_idx:end_idx] + + if state_events_end_token is not None: + # Extract the state events corresponding to the audio start token, and + # prepend them to the targets array. + state_event_start_idx = features["state_event_indices"][0] + state_event_end_idx = state_event_start_idx + 1 + while features["state_events"][state_event_end_idx - 1] != state_events_end_token: + state_event_end_idx += 1 + features[feature_key] = np.concatenate( + [ + features["state_events"][state_event_start_idx:state_event_end_idx], + features[feature_key], + ], + axis=0, + ) + + return features + + +def map_midi_programs( + feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs" +) -> Mapping[str, Any]: + """Apply MIDI program map to token sequences.""" + granularity = PROGRAM_GRANULARITIES[granularity_type] + + feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec) + return feature + + +def run_length_encode_shifts_fn( + features, + codec: Codec, + feature_key: str = "inputs", + state_change_event_types: Sequence[str] = (), +) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: + """Return a function that run-length encodes shifts for a given codec. + + Args: + codec: The Codec to use for shift events. + feature_key: The feature key for which to run-length encode shifts. + state_change_event_types: A list of event types that represent state + changes; tokens corresponding to these event types will be interpreted as state changes and redundant ones + will be removed. + + Returns: + A preprocessing function that run-length encodes single-step shifts. + """ + state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types] + + def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]: + """Combine leading/interior shifts, trim trailing shifts. + + Args: + features: Dict of features to process. + + Returns: + A dict of features. 
+ """ + events = features[feature_key] + + shift_steps = 0 + total_shift_steps = 0 + output = np.array([], dtype=np.int32) + + current_state = np.zeros(len(state_change_event_ranges), dtype=np.int32) + + for event in events: + if codec.is_shift_event_index(event): + shift_steps += 1 + total_shift_steps += 1 + + else: + # If this event is a state change and has the same value as the current + # state, we can skip it entirely. + is_redundant = False + for i, (min_index, max_index) in enumerate(state_change_event_ranges): + if (min_index <= event) and (event <= max_index): + if current_state[i] == event: + is_redundant = True + current_state[i] = event + if is_redundant: + continue + + # Once we've reached a non-shift event, RLE all previous shift events + # before outputting the non-shift event. + if shift_steps > 0: + shift_steps = total_shift_steps + while shift_steps > 0: + output_steps = np.minimum(codec.max_shift_steps, shift_steps) + output = np.concatenate([output, [output_steps]], axis=0) + shift_steps -= output_steps + output = np.concatenate([output, [event]], axis=0) + + features[feature_key] = output + return features + + return run_length_encode_shifts(features) + + +def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig): + tie_token = codec.encode_event(Event("tie", 0)) + state_events_end_token = tie_token if note_representation_config.include_ties else None + + features = extract_sequence_with_indices( + features, state_events_end_token=state_events_end_token, feature_key="inputs" + ) + + features = map_midi_programs(features, codec) + + features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"]) + + return features diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 88d8c2fc3f26..da419c76e8cd 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,3 +1,17 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import math from typing import Optional @@ -21,29 +35,28 @@ from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding from ...onnx_utils import OnnxRuntimeModel -from ...pipeline_utils import DiffusionPipeline, AudioPipelineOutput +from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline from ...schedulers import DDPMScheduler - from .midi_utils import ( - program_to_slakh_program, - audio_to_frames, - SAMPLE_RATE, - HOP_SIZE, - FRAME_RATE, DEFAULT_MAX_SHIFT_SECONDS, - DEFAULT_STEPS_PER_SECOND, DEFAULT_NUM_VELOCITY_BINS, + DEFAULT_STEPS_PER_SECOND, + FRAME_RATE, + HOP_SIZE, + SAMPLE_RATE, TARGET_FEATURE_LENGTH, - note_sequence_to_onsets_and_offsets_and_programs, Codec, EventRange, - encode_and_index_events, NoteEncodingState, - note_event_data_to_events, - note_encoding_state_to_events, NoteRepresentationConfig, - note_representation_processor_chain, Tokenizer, + audio_to_frames, + encode_and_index_events, + note_encoding_state_to_events, + note_event_data_to_events, + note_representation_processor_chain, + note_sequence_to_onsets_and_offsets_and_programs, + program_to_slakh_program, ) From 202b8105922e24e5e5ead1514df4bcda2649e21c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 13:21:54 +0100 Subject: [PATCH 056/131] fix dtypes --- .../pipeline_spectrogram_diffusion.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index da419c76e8cd..8343a262fef0 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -608,20 +608,20 @@ def __call__( events = [note_representation_processor_chain(event, codec, note_representation_config) for event in events] input_tokens = [tokenizer.encode(event["inputs"]) for event in events] - pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims]) + pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) for i, encoder_input_tokens in enumerate(input_tokens): - encoder_continuous_inputs = pred_mel[:1] + encoder_continuous_inputs = torch.from_numpy(pred_mel[:1].copy()).to(self.device) if i == 0: # The first chunk has no previous context. - encoder_continuous_mask = np.zeros((1, TARGET_FEATURE_LENGTH)) + encoder_continuous_mask = np.zeros((1, TARGET_FEATURE_LENGTH), dtype=np.bool) else: # The full song pipeline does not feed in a context feature, so the mask # will be all 0s after the feature converter. Because we know we're # feeding in a full context chunk from the previous prediction, set it # to all 1s. 
- encoder_continuous_mask = np.ones((1, TARGET_FEATURE_LENGTH)) + encoder_continuous_mask = np.ones((1, TARGET_FEATURE_LENGTH), dtype=np.bool) target_shape = encoder_continuous_inputs.shape encoder_continuous_inputs = self.scale_features( @@ -629,9 +629,9 @@ def __call__( ) encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens.to(self.device), - continuous_inputs=encoder_continuous_inputs.to(self.device), - continuous_mask=encoder_continuous_mask.to(self.device), + input_tokens=torch.IntTensor([encoder_input_tokens]).to(self.device), + continuous_inputs=encoder_continuous_inputs, + continuous_mask=torch.from_numpy(encoder_continuous_mask.copy()).to(self.device), ) # Sample gaussian noise to begin loop @@ -656,6 +656,7 @@ def __call__( pred_mel = mel.cpu().numpy() full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) + print("Generated segment", i) full_pred_audio = self.melgan(input_features=full_pred_mel.astype(np.float32)) From 085d766a8f72c7ae3d5abe5d1cb5ace0ec2e982f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 13:46:57 +0100 Subject: [PATCH 057/131] remove immutabledict req --- .../spectrogram_diffusion/midi_utils.py | 75 +++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index 0a30bc807d2f..215ead82c90d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -22,7 +22,6 @@ import torch.nn.functional as F import note_seq -from immutabledict import immutabledict INPUT_FEATURE_LENGTH = 2048 @@ -36,44 +35,42 @@ DEFAULT_MAX_SHIFT_SECONDS = 10 DEFAULT_NUM_VELOCITY_BINS = 1 -SLAKH_CLASS_PROGRAMS = immutabledict( - { - "Acoustic Piano": 0, - "Electric Piano": 4, - "Chromatic Percussion": 8, - "Organ": 16, - "Acoustic Guitar": 24, - "Clean Electric Guitar": 26, - "Distorted Electric Guitar": 29, - "Acoustic Bass": 32, - "Electric Bass": 33, - "Violin": 40, - "Viola": 41, - "Cello": 42, - "Contrabass": 43, - "Orchestral Harp": 46, - "Timpani": 47, - "String Ensemble": 48, - "Synth Strings": 50, - "Choir and Voice": 52, - "Orchestral Hit": 55, - "Trumpet": 56, - "Trombone": 57, - "Tuba": 58, - "French Horn": 60, - "Brass Section": 61, - "Soprano/Alto Sax": 64, - "Tenor Sax": 66, - "Baritone Sax": 67, - "Oboe": 68, - "English Horn": 69, - "Bassoon": 70, - "Clarinet": 71, - "Pipe": 73, - "Synth Lead": 80, - "Synth Pad": 88, - } -) +SLAKH_CLASS_PROGRAMS = { + "Acoustic Piano": 0, + "Electric Piano": 4, + "Chromatic Percussion": 8, + "Organ": 16, + "Acoustic Guitar": 24, + "Clean Electric Guitar": 26, + "Distorted Electric Guitar": 29, + "Acoustic Bass": 32, + "Electric Bass": 33, + "Violin": 40, + "Viola": 41, + "Cello": 42, + "Contrabass": 43, + "Orchestral Harp": 46, + "Timpani": 47, + "String Ensemble": 48, + "Synth Strings": 50, + "Choir and Voice": 52, + "Orchestral Hit": 55, + "Trumpet": 56, + "Trombone": 57, + "Tuba": 58, + "French Horn": 60, + "Brass Section": 61, + "Soprano/Alto Sax": 64, + "Tenor Sax": 66, + "Baritone Sax": 67, + "Oboe": 68, + "English Horn": 69, + "Bassoon": 70, + "Clarinet": 71, + "Pipe": 73, + "Synth Lead": 80, + "Synth Pad": 88, +} @dataclasses.dataclass From 3edc9e19de85e4c3d9a6c5720ed8f3ba80473906 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 14:11:59 +0100 Subject: [PATCH 058/131] initial doc --- .../api/pipelines/spectrogram_diffusion.mdx | 32 
+++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 docs/source/api/pipelines/spectrogram_diffusion.mdx diff --git a/docs/source/api/pipelines/spectrogram_diffusion.mdx b/docs/source/api/pipelines/spectrogram_diffusion.mdx new file mode 100644 index 000000000000..e38b43043e51 --- /dev/null +++ b/docs/source/api/pipelines/spectrogram_diffusion.mdx @@ -0,0 +1,32 @@ + + + # Multi-instrument Music Synthesis with Spectrogram Diffusion + + ## Overview + +[Spectrogram Diffusion](https://arxiv.org/abs/2206.05408) by Hawthorne et al. + +An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fréchet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes. + +The original codebase of this implementation can be found [here](https://github.com/magenta/music-spectrogram-diffusion). 
+ +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_spectrogram_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion) | *Unconditional Audio Generation* | - | + + +## SpectrogramDiffusionPipeline +[[autodoc]] SpectrogramDiffusionPipeline + - __call__ From 5472ef576664e36cefe06311bcb632faa753f590 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 14 Dec 2022 16:50:13 +0100 Subject: [PATCH 059/131] use np.where --- src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index 215ead82c90d..5d0114a1549f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -228,7 +228,7 @@ def programs_to_midi_classes(tokens, codec): """Modifies program events to be the first program in the MIDI class.""" min_program_id, max_program_id = codec.event_type_range("program") is_program = (tokens >= min_program_id) & (tokens <= max_program_id) - return tf.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) + return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) PROGRAM_GRANULARITIES = { From 87b5914d987ef33966b2cac57a23b8bf6b421f42 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 15:56:19 +0100 Subject: [PATCH 060/131] require note_seq --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f1aafd85658f..14fdd469cb39 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,7 @@ "k-diffusion", "librosa", "modelcards>=0.1.4", + "note_seq", "numpy", "parameterized", "pytest", From cf24a45976d09baabdf458592e196f85f1053da9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 15:57:09 +0100 Subject: [PATCH 061/131] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 14fdd469cb39..b251d853333a 100644 --- a/setup.py +++ b/setup.py @@ -94,7 +94,7 @@ "k-diffusion", "librosa", "modelcards>=0.1.4", - "note_seq", + "note-seq", "numpy", "parameterized", "pytest", From 00465c4d975a0e2698ca0e2b54aacaec0c4860c8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 16:48:58 +0100 Subject: [PATCH 062/131] update dependency --- src/diffusers/dependency_versions_table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 1ef1edc14629..fc46ec3730d5 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -18,6 +18,7 @@ "k-diffusion": "k-diffusion", "librosa": "librosa", "modelcards": "modelcards>=0.1.4", + "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", "pytest": "pytest", From cd097b488894f92993573c60dce89e926c2a4b09 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 16:55:15 +0100 Subject: [PATCH 063/131] added note-seq to test --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index b251d853333a..51e2b91101b6 100644 --- a/setup.py +++ b/setup.py @@ -186,6 +186,7 @@ def run(self): "datasets", "k-diffusion", "librosa", + "note-seq", "parameterized", "pytest", "pytest-timeout", From 
04ac770efaeb0b128a5d0840cdbb68f98f8432b6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 20 Dec 2022 17:50:40 +0100 Subject: [PATCH 064/131] added is_note_seq_available --- src/diffusers/__init__.py | 9 +++++++++ src/diffusers/pipelines/__init__.py | 10 +++++++++- src/diffusers/utils/__init__.py | 1 + .../utils/dummy_torch_and_note_seq_objects.py | 19 +++++++++++++++++++ src/diffusers/utils/import_utils.py | 18 ++++++++++++++++++ 5 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/utils/dummy_torch_and_note_seq_objects.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 2c8ac5e9a466..685d248c649f 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -8,6 +8,7 @@ is_inflect_available, is_k_diffusion_available, is_librosa_available, + is_note_seq_available, is_onnx_available, is_scipy_available, is_torch_available, @@ -144,6 +145,14 @@ else: from .pipelines import AudioDiffusionPipeline, Mel +try: + if not (is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_note_seq_objects import * # noqa F403 +else: + from .pipelines import SpectrogramDiffusionPipeline + try: if not is_flax_available(): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b2936c9fa08a..50bb4abb02ff 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -3,6 +3,7 @@ is_flax_available, is_k_diffusion_available, is_librosa_available, + is_note_seq_available, is_onnx_available, is_torch_available, is_transformers_available, @@ -23,7 +24,6 @@ from .pndm import PNDMPipeline from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline - from .spectrogram_diffusion import SpectrogramDiffusionPipeline from .stochastic_karras_ve import KarrasVePipeline try: @@ -34,6 +34,14 @@ else: from .audio_diffusion import AudioDiffusionPipeline, Mel +try: + if not (is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_note_seq_objects import * # noqa F403 +else: + from .spectrogram_diffusion import SpectrogramDiffusionPipeline + try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index e5a4d323e3eb..e6336f037e9e 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -33,6 +33,7 @@ is_k_diffusion_available, is_librosa_available, is_modelcards_available, + is_note_seq_available, is_onnx_available, is_safetensors_available, is_scipy_available, diff --git a/src/diffusers/utils/dummy_torch_and_note_seq_objects.py b/src/diffusers/utils/dummy_torch_and_note_seq_objects.py new file mode 100644 index 000000000000..288bec68ef2a --- /dev/null +++ b/src/diffusers/utils/dummy_torch_and_note_seq_objects.py @@ -0,0 +1,19 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
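+# (When either torch or note_seq is unavailable, this dummy class is imported in
+# place of the real SpectrogramDiffusionPipeline, so `from diffusers import
+# SpectrogramDiffusionPipeline` still succeeds; a clear installation error is only
+# raised via `requires_backends` when the class is actually used.)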
+# flake8: noqa + +from ..utils import DummyObject, requires_backends + + +class SpectrogramDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "note_seq"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "note_seq"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "note_seq"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "note_seq"]) diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index ad3ab69f66b8..7dad57443eda 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -217,6 +217,13 @@ except importlib_metadata.PackageNotFoundError: _k_diffusion_available = False +_note_seq_available = importlib.util.find_spec("note_seq") is not None +try: + _note_seq_version = importlib_metadata.version("note_seq") + logger.debug(f"Successfully imported note-seq version {_note_seq_version}") +except importlib_metadata.PackageNotFoundError: + _note_seq_available = False + def is_torch_available(): return _torch_available @@ -274,6 +281,10 @@ def is_k_diffusion_available(): return _k_diffusion_available +def is_note_seq_available(): + return _note_seq_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -328,6 +339,12 @@ def is_k_diffusion_available(): install k-diffusion` """ +# docstyle-ignore +NOTE_SEQ_IMPORT_ERROR = """ +{0} requires the note-seq library but it was not found in your environment. You can install it with pip: `pip +install note-seq` +""" + BACKENDS_MAPPING = OrderedDict( [ @@ -340,6 +357,7 @@ def is_k_diffusion_available(): ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), + ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), ] ) From 2afaf2768e9013c378b851ceb2662a90bf9c2f33 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 20 Dec 2022 19:04:52 +0100 Subject: [PATCH 065/131] fix import --- src/diffusers/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 685d248c649f..37edc5378a6d 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -55,7 +55,6 @@ PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline, - SpectrogramDiffusionPipeline, ) from .schedulers import ( DDIMScheduler, From 3acb123a08bd26936b96f92d08e719173a680aae Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Dec 2022 15:48:18 +0100 Subject: [PATCH 066/131] added toc --- docs/source/_toctree.yml | 2 ++ src/diffusers/utils/dummy_pt_objects.py | 15 --------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 52ad170dc33a..9ade1b88d3e9 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -132,6 +132,8 @@ title: "RePaint" - local: api/pipelines/audio_diffusion title: "Audio Diffusion" + - local: api/pipelines/spectrogram_diffusion + title: "Spectrogram Diffusion" title: "Pipelines" - sections: - local: api/schedulers/overview diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 1e4e9d070448..63a7d258a902 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -332,21 +332,6 @@ 
def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) -class SpectrogramDiffusionPipeline(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - class DDIMScheduler(metaclass=DummyObject): _backends = ["torch"] From b9d0842a9d8d32be2fc3d8a2c3c70fd394a3b02d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Dec 2022 17:46:22 +0100 Subject: [PATCH 067/131] added example usage --- .../api/pipelines/spectrogram_diffusion.mdx | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/source/api/pipelines/spectrogram_diffusion.mdx b/docs/source/api/pipelines/spectrogram_diffusion.mdx index e38b43043e51..a5117d90b067 100644 --- a/docs/source/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/api/pipelines/spectrogram_diffusion.mdx @@ -18,7 +18,7 @@ specific language governing permissions and limitations under the License. An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fréchet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes. -The original codebase of this implementation can be found [here](https://github.com/magenta/music-spectrogram-diffusion). +The original codebase of this implementation can be found at [magenta/music-spectrogram-diffusion)](https://github.com/magenta/music-spectrogram-diffusion). ## Available Pipelines: @@ -27,6 +27,20 @@ The original codebase of this implementation can be found [here](https://github. 
| [pipeline_spectrogram_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion) | *Unconditional Audio Generation* | - | +## Example usage + +```python +from diffusers import SpectrogramDiffusionPipeline + +pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") +pipe = pipe.to("cuda") + +output = pipe("beethoven_hammerklavier_2.mid") + +audio = output.audios[0] +``` + ## SpectrogramDiffusionPipeline [[autodoc]] SpectrogramDiffusionPipeline - - __call__ + - all + - __call__ From f3b4ad4c62f13fa651205afe214298531535f57c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 18 Jan 2023 13:24:32 +0100 Subject: [PATCH 068/131] undo for now --- docs/source/_toctree.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 9ade1b88d3e9..52ad170dc33a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -132,8 +132,6 @@ title: "RePaint" - local: api/pipelines/audio_diffusion title: "Audio Diffusion" - - local: api/pipelines/spectrogram_diffusion - title: "Spectrogram Diffusion" title: "Pipelines" - sections: - local: api/schedulers/overview From 50908b82874c4ca031372b9455b34476c2fbfb0c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 18 Jan 2023 13:31:19 +0100 Subject: [PATCH 069/131] moved docs --- docs/source/en/_toctree.yml | 2 ++ docs/source/{ => en}/api/pipelines/spectrogram_diffusion.mdx | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) rename docs/source/{ => en}/api/pipelines/spectrogram_diffusion.mdx (97%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2c0d94fcc16b..3ee18bd7ecfb 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -120,6 +120,8 @@ title: Safe Stable Diffusion - local: api/pipelines/score_sde_ve title: Score SDE VE + - local: api/pipelines/spectrogram_diffusion + title: "Spectrogram Diffusion" - sections: - local: api/pipelines/stable_diffusion/overview title: Overview diff --git a/docs/source/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx similarity index 97% rename from docs/source/api/pipelines/spectrogram_diffusion.mdx rename to docs/source/en/api/pipelines/spectrogram_diffusion.mdx index a5117d90b067..816d729e4c27 100644 --- a/docs/source/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -1,4 +1,4 @@ - - # Multi-instrument Music Synthesis with Spectrogram Diffusion +# Multi-instrument Music Synthesis with Spectrogram Diffusion - ## Overview +## Overview [Spectrogram Diffusion](https://arxiv.org/abs/2206.05408) by Curtis Hawthorne, Ian Simon, Adam Roberts, Neil Zeghidour, Josh Gardner, Ethan Manilow, and Jesse Engel. 
@@ -41,6 +41,7 @@ from diffusers import SpectrogramDiffusionPipeline pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") pipe = pipe.to("cuda") +# Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid output = pipe("beethoven_hammerklavier_2.mid") audio = output.audios[0] From 2a38f7648e53f30a6db385fde0e1421ef975864b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 13:54:15 +0100 Subject: [PATCH 115/131] Update src/diffusers/models/t5_film_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/t5_film_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/t5_film_transformer.py b/src/diffusers/models/t5_film_transformer.py index dcefaec760ae..f9bcb06ebc0b 100644 --- a/src/diffusers/models/t5_film_transformer.py +++ b/src/diffusers/models/t5_film_transformer.py @@ -17,7 +17,7 @@ from torch import nn from ..configuration_utils import ConfigMixin, register_to_config -from ..models.attention_processor import Attention +from .attention_processor import Attention from .embeddings import get_timestep_embedding from .modeling_utils import ModelMixin From 96111b2306e87318cbb1058d587bd689275c1337 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 13:54:47 +0100 Subject: [PATCH 116/131] Update src/diffusers/models/t5_film_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/t5_film_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/t5_film_transformer.py b/src/diffusers/models/t5_film_transformer.py index f9bcb06ebc0b..1c41e656a9db 100644 --- a/src/diffusers/models/t5_film_transformer.py +++ b/src/diffusers/models/t5_film_transformer.py @@ -1,4 +1,4 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
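The documentation example touched in the surrounding patches returns raw audio from the pipeline. As a minimal follow-up sketch (not part of the docs page itself), the generated array can be written to a WAV file; this assumes `output` is the result of the `pipe(processor(...))` call shown in the example and reuses the 16 kHz sample rate that the integration tests use:

```python
import scipy.io.wavfile

# output.audios[0] carries a leading batch dimension; index it away before
# writing, as the integration tests do, and save at 16 kHz.
audio = output.audios[0]
scipy.io.wavfile.write("beethoven_hammerklavier_2.wav", 16_000, audio[0])
```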
From 17dbe1d2bc3c07dc5b0dabdb26d0117c7354cdb7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:11 +0100 Subject: [PATCH 117/131] Update docs/source/en/api/pipelines/spectrogram_diffusion.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index 0ab8286bfc80..2f33bf1aaea4 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -38,7 +38,7 @@ As depicted above the model takes as input a MIDI file and tokenizes it into a s ```python from diffusers import SpectrogramDiffusionPipeline -pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") +pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to("cuda") # Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid From dd9f8ca51c61e8dfc60dded09c7e654a25365985 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:28 +0100 Subject: [PATCH 118/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index b44d1735a949..c2e3990b22ec 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -174,7 +174,7 @@ def test_callback(self): # so that music can be played live device = torch_device - pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion", melgan=None) + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") melgan = OnnxRuntimeModel.from_pretrained("kashif/soundstream_mel_decoder") pipe = pipe.to(device) From 654c79669c1644539cf5e36c125c23ac6eabe569 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:43 +0100 Subject: [PATCH 119/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index c2e3990b22ec..5698bb6ef22f 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -175,7 +175,8 @@ def test_callback(self): device = torch_device pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - melgan = OnnxRuntimeModel.from_pretrained("kashif/soundstream_mel_decoder") + melgan = pipe.melgan + pipe.melgan = None pipe = pipe.to(device) pipe.set_progress_bar_config(disable=None) From 9a8a93dd7c3b2fe9a4638c9da4a85919160b03a7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:54 +0100 Subject: [PATCH 120/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- 
.../spectrogram_diffusion/test_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 5698bb6ef22f..cd11427dd7b0 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -217,7 +217,7 @@ def test_spectrogram_fast(self): def test_spectrogram(self): device = torch_device - pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to(device) pipe.set_progress_bar_config(disable=None) From ebb8e9a77a95777da6bf9eabe8161ea28b473448 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:06:08 +0100 Subject: [PATCH 121/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index cd11427dd7b0..587b166de01a 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -198,7 +198,7 @@ def callback(step, mel_output): def test_spectrogram_fast(self): device = torch_device - pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to(device) pipe.set_progress_bar_config(disable=None) processor = MidiProcessor() From 3a944769c11d17fc35f6877cc45308b1d47f6365 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:28:24 +0100 Subject: [PATCH 122/131] add MidiProcessor --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index 2f33bf1aaea4..b2ed410d2896 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -36,13 +36,15 @@ As depicted above the model takes as input a MIDI file and tokenizes it into a s ## Example usage ```python -from diffusers import SpectrogramDiffusionPipeline +from diffusers import SpectrogramDiffusionPipeline, MidiProcessor -pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") +pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") pipe = pipe.to("cuda") # Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid -output = pipe("beethoven_hammerklavier_2.mid") +processor = MidiProcessor() + +output = pipe(processor("beethoven_hammerklavier_2.mid")) audio = output.audios[0] ``` From 7c43be8936c0ebf23af2bfcfe08c189db00ff9c1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:31:05 +0100 Subject: [PATCH 123/131] format --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx 
b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index b2ed410d2896..e9fb1c282c23 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -40,10 +40,9 @@ from diffusers import SpectrogramDiffusionPipeline, MidiProcessor pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") pipe = pipe.to("cuda") - -# Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid processor = MidiProcessor() +# Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid output = pipe(processor("beethoven_hammerklavier_2.mid")) audio = output.audios[0] From 6dcd3f7a204c224a1ee4ad8c2631fd2c9d224f36 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:42:40 +0100 Subject: [PATCH 124/131] fix org --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index e9fb1c282c23..c98300fe791f 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -38,7 +38,7 @@ As depicted above the model takes as input a MIDI file and tokenizes it into a s ```python from diffusers import SpectrogramDiffusionPipeline, MidiProcessor -pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") +pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to("cuda") processor = MidiProcessor() From 17b7481962ecb2f97dcb3bd2aee1daee6ed51a66 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 21 Mar 2023 15:24:11 +0100 Subject: [PATCH 125/131] Apply suggestions from code review --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 587b166de01a..e7ee2784ccdb 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -234,6 +234,3 @@ def test_spectrogram(self): audio = output.audios[0] assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2 - audio = output.audios[0] - rate = 16_000 - scipy.io.wavfile.write("/home/patrick_huggingface_co/audios/beet.wav", rate, audio[0]) From 458e7b77421af4bb755ca2a5bc8a78d1029e7af0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 21 Mar 2023 15:24:32 +0100 Subject: [PATCH 126/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index e7ee2784ccdb..f8ddc66c05dc 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -233,4 +233,3 @@ def test_spectrogram(self): audio = output.audios[0] assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2 - From 4f27f66eb9c46a3c1593c252d16807890d9afb6f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:31:14 +0100 Subject: [PATCH 127/131] make style --- 
.../spectrogram_diffusion/test_spectrogram_diffusion.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index f8ddc66c05dc..ed9df3a56b1d 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -17,21 +17,17 @@ import unittest import numpy as np -import scipy import torch from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device -from diffusers.utils.testing_utils import is_onnx_available, require_note_seq, require_onnxruntime +from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime from ...pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS from ...test_pipelines_common import PipelineTesterMixin -if is_onnx_available(): - from diffusers import OnnxRuntimeModel - torch.backends.cuda.matmul.allow_tf32 = False From 76a28c1981f3c20a6152a78cf7da6e4a3b1a453b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:47:43 +0100 Subject: [PATCH 128/131] pin protobuf to <4 --- setup.py | 1 + src/diffusers/dependency_versions_table.py | 1 + 2 files changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 5cc48c6caa19..6a37ede4a5ab 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ "note-seq", "numpy", "parameterized", + "protobuf >=3.20.3,<4", "pytest", "pytest-timeout", "pytest-xdist", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 1ac669a36753..787ce508b08b 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,6 +22,7 @@ "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", + "protobuf ": "protobuf >=3.20.3,<4", "pytest": "pytest", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", From 7339d379e84f3fd2a994fa1a3703449c19949e62 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:52:26 +0100 Subject: [PATCH 129/131] fix formatting --- setup.py | 2 +- src/diffusers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 6a37ede4a5ab..0ad2ed6c3b3c 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ "note-seq", "numpy", "parameterized", - "protobuf >=3.20.3,<4", + "protobuf>=3.20.3,<4", "pytest", "pytest-timeout", "pytest-xdist", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 787ce508b08b..4db1afba2fcd 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,7 +22,7 @@ "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", - "protobuf ": "protobuf >=3.20.3,<4", + "protobuf ": "protobuf>=3.20.3,<4", "pytest": "pytest", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", From f71b15508e2934d62af645d831ff81bdfbd81cbd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:54:27 +0100 Subject: [PATCH 130/131] white space --- src/diffusers/dependency_versions_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 4db1afba2fcd..1269cf1578a6 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,7 +22,7 @@ "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", - "protobuf ": "protobuf>=3.20.3,<4", + "protobuf": "protobuf>=3.20.3,<4", "pytest": "pytest", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", From e5225a3920f203b43563c872655e8ce43acabaac Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 16:22:27 +0100 Subject: [PATCH 131/131] tensorboard needs protobuf --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0ad2ed6c3b3c..972f9a5b4a24 100644 --- a/setup.py +++ b/setup.py @@ -184,7 +184,7 @@ def run(self): extras = {} extras["quality"] = deps_list("black", "isort", "ruff", "hf-doc-builder") extras["docs"] = deps_list("hf-doc-builder") -extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "Jinja2") +extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2") extras["test"] = deps_list( "compel", "datasets",