From f85d908b0428c389b58a65aba883795168cbdb42 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 26 Oct 2022 18:22:05 +0200 Subject: [PATCH 001/131] initial TokenEncoder and ContinuousEncoder --- .../spectrogram_diffusion/__init__.py | 2 + .../spectrogram_diffusion/modules.py | 148 ++++++++++++++++++ .../pipeline_spectrogram_diffusion.py | 0 3 files changed, 150 insertions(+) create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/__init__.py create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/modules.py create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py new file mode 100644 index 000000000000..53377210e7b3 --- /dev/null +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from .modules import TokenEncoder, ContinuousEncoder diff --git a/src/diffusers/pipelines/spectrogram_diffusion/modules.py b/src/diffusers/pipelines/spectrogram_diffusion/modules.py new file mode 100644 index 000000000000..b4ea211e4741 --- /dev/null +++ b/src/diffusers/pipelines/spectrogram_diffusion/modules.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn + +from transformers.models.t5.modeling_t5 import T5LayerNorm, T5Block + + +class TokenEncoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.token_embedder = nn.Embedding( + config.vocab_size, + config.d_model, + _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), + ) + + self.position_encoding = nn.Embedding( + config.max_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList([]) + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = 
torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"])) + + self.position_encoding = nn.Embedding( + config.targets_context_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList([]) + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def get_sequence_length(self, sequence): + # Return the first index where a 0 occurs. + length = torch.argmax(sequence == 0) + + # If argmax returns 0, that means that either + # 1) No 0s were found, and the sequence length is the full length of the array + # 2) There's padding immediately at the beginning, indicating that the array + # is all padding and the sequence length is 0. 
+ return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + seq_lens = self.get_sequence_length(encoder_inputs_mask) + input_positions = torch.roll(input_positions, seq_lens, dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py new file mode 100644 index 000000000000..e69de29bb2d1 From e02541020431a1fe8e6ca57480907726b53b2700 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 28 Oct 2022 18:28:36 +0200 Subject: [PATCH 002/131] initial modules --- .../spectrogram_diffusion/__init__.py | 2 +- .../spectrogram_diffusion/modules.py | 148 ----- .../pipeline_spectrogram_diffusion.py | 618 ++++++++++++++++++ 3 files changed, 619 insertions(+), 149 deletions(-) delete mode 100644 src/diffusers/pipelines/spectrogram_diffusion/modules.py diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 53377210e7b3..fb094f2380ca 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from .modules import TokenEncoder, ContinuousEncoder +from .pipeline_spectrogram_diffusion import ContinuousEncoder, Decoder, TokenEncoder diff --git a/src/diffusers/pipelines/spectrogram_diffusion/modules.py b/src/diffusers/pipelines/spectrogram_diffusion/modules.py deleted file mode 100644 index b4ea211e4741..000000000000 --- a/src/diffusers/pipelines/spectrogram_diffusion/modules.py +++ /dev/null @@ -1,148 +0,0 @@ -import torch -import torch.nn as nn - -from transformers.models.t5.modeling_t5 import T5LayerNorm, T5Block - - -class TokenEncoder(nn.Module): - def __init__(self, config, weights): - super().__init__() - - self.token_embedder = nn.Embedding( - config.vocab_size, - config.d_model, - _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), - ) - - self.position_encoding = nn.Embedding( - config.max_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_encoder_decoder = False - self.encoders = nn.ModuleList([]) - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - 
lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def forward(self, encoder_input_tokens, encoder_inputs_mask): - x = self.token_embedder(encoder_input_tokens) - - seq_length = encoder_input_tokens.shape[1] - inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) - x += self.position_encoding(inputs_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class ContinuousEncoder(nn.Module): - def __init__(self, config, weights): - super().__init__() - - self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) - self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"])) - - self.position_encoding = nn.Embedding( - config.targets_context_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_encoder_decoder = False - self.encoders = nn.ModuleList([]) - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"])) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"])) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"])) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"])) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"]) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"])) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def get_sequence_length(self, sequence): - # Return the first index where a 0 occurs. 
- length = torch.argmax(sequence == 0) - - # If argmax returns 0, that means that either - # 1) No 0s were found, and the sequence length is the full length of the array - # 2) There's padding immediately at the beginning, indicating that the array - # is all padding and the sequence length is 0. - return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) - - def forward(self, encoder_inputs, encoder_inputs_mask): - x = self.input_proj(encoder_inputs) - - # terminal relative positional encodings - max_positions = encoder_inputs.shape[1] - input_positions = torch.arange(max_positions, device=encoder_inputs.device) - seq_lens = self.get_sequence_length(encoder_inputs_mask) - input_positions = torch.roll(input_positions, seq_lens, dims=0) - x += self.position_encoding(input_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index e69de29bb2d1..43240b206a52 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -0,0 +1,618 @@ +import torch +import torch.nn as nn + +from diffusers.models.embeddings import get_timestep_embedding +from transformers.models.t5.modeling_t5 import T5Block, T5Config, T5LayerCrossAttention, T5LayerFF, T5LayerNorm + + +class FiLMLayer(nn.Module): + def __init__(self, in_features, out_features): + super().__init__() + self.scale_bias = nn.Linear(in_features, out_features * 2) + + def forward(self, x, conditioning_emb): + scale_bias = self.scale_bias(conditioning_emb) + scale, bias = torch.chunk(scale_bias, 2, -1) + return x * (scale + 1.0) + bias + + +class T5LayerSelfAttentionCond(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer_norm = T5LayerNorm(config.d_model) + self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + # pre_self_attention_layer_norm + normed_hidden_states = self.layer_norm(hidden_states) + + if conditioning_emb is not None: + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) + + # Self-attention block + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross 
attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class TokenEncoder(nn.Module): + def __init__(self, config: T5Config, weights): + super().__init__() + + self.token_embedder = nn.Embedding( + config.vocab_size, + config.d_model, + _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), + ) + + self.position_encoding = nn.Embedding( + config.max_length, + config.d_model, + 
_weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) + + self.position_encoding = nn.Embedding( + config.targets_context_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + ly_weight = weights[f"layers_{lyr_num}"] + + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = 
nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def get_sequence_length(self, sequence): + # Return the first index where a 0 occurs. + length = torch.argmax(sequence == 0) + + # If argmax returns 0, that means that either + # 1) No 0s were found, and the sequence length is the full length of the array + # 2) There's padding immediately at the beginning, indicating that the array + # is all padding and the sequence length is 0. + return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + seq_lens = self.get_sequence_length(encoder_inputs_mask) + input_positions = torch.roll(input_positions, seq_lens, dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask) + + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class Decoder(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.conditioning_emb = nn.Sequential( + nn.Linear(config.d_model, config.d_model * 4, bias=False), + nn.SiLU(), + nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), + nn.SiLU(), + ) + self.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) + self.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) + + self.position_encoding = nn.Embedding( + config.targets_length, + config.d_model, + _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), + ) + self.position_encoding.weight.requires_grad = False + + self.continuous_inputs_projection = nn.Linear( + config.input_dims, + config.d_model, + ) + self.continuous_inputs_projection.weight = nn.Parameter( + torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) + ) + + self.dropout = nn.Dropout(p=config.dropout_rate) + + self.decoders = nn.ModuleList() + config.is_decoder = True + config.is_encoder_decoder = False + for lyr_num in range(config.num_decoder_layers): + # FiLM conditional T5 decoder + lyr = DecoderLayer(config) + ly_weight = weights[f"layers_{lyr_num}"] + + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + ) + + lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) + ) + + attention_weights = ly_weight["self_attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + 
lyr.layer[0].SelfAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + + attention_weights = ly_weight["MultiHeadDotProductAttention_0"] + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T) + ) + lyr.layer[1].EncDecAttention.k.weight = nn.Parameter( + torch.FloatTensor(attention_weights["key"]["kernel"].T) + ) + lyr.layer[1].EncDecAttention.v.weight = nn.Parameter( + torch.FloatTensor(attention_weights["value"]["kernel"].T) + ) + lyr.layer[1].EncDecAttention.o.weight = nn.Parameter( + torch.FloatTensor(attention_weights["out"]["kernel"].T) + ) + + lyr.layer[1].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + ) + + lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + lyr.layer[3].scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) + ) + + lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter( + torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) + ) + lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + + self.decoders.append(lyr) + + self.decoder_norm = T5LayerNorm(config.d_model) + self.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + + self.post_dropout = nn.Dropout(p=config.dropout_rate) + self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) + self.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + + self.max_decoder_noise_time = config.max_decoder_noise_time + self.emb_dim = condig.d_model + + def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): + mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) + + return mask.unsqueeze(-3) + + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + batch, _, _ = decoder_input_tokens.shape + assert decoder_noise_time.shape == (batch,) + + # decoder_noise_time is in [0, 1), so rescale to expected timing range. + conditioning_emb = get_timestep_embedding( + decoder_noise_time * self.max_decoder_noise_time, + embedding_dim=self.emb_dim, + max_period=self.max_decoder_noise_time, + ) + + conditioning_emb = self.conditioning_emb(conditioning_emb) + + assert conditioning_emb.shape == (batch, 1, self.emb_dim * 4) + + seq_length = decoder_input_tokens.shape[1] + + # If we want to use relative positions for audio context, we can just offset + # this sequence by the length of encodings_and_masks. + decoder_positions = torch.broadcast_to( + torch.arange(seq_length, device=decoder_input_tokens.device), + (batch, seq_length), + ) + + position_encodings = self.position_encoding(decoder_positions) + + # decoder: No padding present. + decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) + + # Translate encoding masks to encoder-decoder masks. 
+ encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] + + inputs = self.continuous_inputs_projection(decoder_input_tokens) + + inputs += position_encodings + + inputs = self.dropout(inputs) + y = inputs + + for lyr in self.decoders: + y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb) + + y = self.decoder_norm(y) + y = self.post_dropout(y) + + spec_out = self.spec_out(y) + return spec_out From e88dc6fe6c9f76a0df44e52774da36d94a4e35a6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 28 Oct 2022 18:29:41 +0200 Subject: [PATCH 003/131] added ContinuousContextTransformer --- .../spectrogram_diffusion/__init__.py | 2 +- .../pipeline_spectrogram_diffusion.py | 52 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index fb094f2380ca..a404e61c1217 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import ContinuousEncoder, Decoder, TokenEncoder +from .pipeline_spectrogram_diffusion import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 43240b206a52..7f3faa8c8b3b 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -616,3 +616,55 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) spec_out = self.spec_out(y) return spec_out + + +class ContinuousContextTransformer(nn.Module): + def __init__(self, config, weights): + super().__init__() + + self.token_encoder = TokenEncoder(config=config, weights=weights) + self.continuous_encoder = ContinuousEncoder(config=config, weights=weights) + self.decoder = Decoder(config=config, weights=weights) + + def encode(self, input_tokens, continuous_inputs, continuous_mask): + tokens_mask = input_tokens > 0 + tokens_encoded, tokens_mask = self.token_encoder( + encoder_input_tokens=input_tokens, + encoder_inputs_mask=tokens_mask, + ) + + continuous_encoded, continuous_mask = self.continuous_encoder( + encoder_inputs=continuous_inputs, + encoder_inputs_mask=continuous_mask, + ) + + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] + + def decode(self, encodings_and_masks, input_tokens, noise_time): + logits = self.decoder( + encodings_and_masks=encodings_and_masks, + decoder_input_tokens=input_tokens, + decoder_noise_time=noise_time, + ) + return logits + + def forward( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + decoder_noise_time, + ): + + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + return self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=decoder_noise_time, + ) From 59e2111fbd9c07357e07b57e87dec32d9bc7f992 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 2 Nov 2022 15:15:21 +0300 Subject: [PATCH 004/131] fix copy paste error --- 
.../pipeline_spectrogram_diffusion.py | 145 ++---------------- 1 file changed, 12 insertions(+), 133 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 7f3faa8c8b3b..82764042b81d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -2,7 +2,14 @@ import torch.nn as nn from diffusers.models.embeddings import get_timestep_embedding -from transformers.models.t5.modeling_t5 import T5Block, T5Config, T5LayerCrossAttention, T5LayerFF, T5LayerNorm +from transformers.models.t5.modeling_t5 import ( + T5Attention, + T5Block, + T5Config, + T5LayerCrossAttention, + T5LayerFF, + T5LayerNorm, +) class FiLMLayer(nn.Module): @@ -184,134 +191,6 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class DecoderLayer(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer = nn.ModuleList() - - # cond self attention: layer 0 - self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) - - # cross attention: layer 1 - self.layer.append(T5LayerCrossAttention(config)) - - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - - if past_key_value is not None: - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - conditioning_emb=conditioning_emb, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - if encoder_hidden_states is not None: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - class TokenEncoder(nn.Module): def __init__(self, config: T5Config, weights): super().__init__() @@ -376,7 +255,7 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): x = self.dropout_pre(x) for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) + x = lyr(x, encoder_inputs_mask)[0] x = self.layer_norm(x) @@ -457,7 +336,7 @@ def 
forward(self, encoder_inputs, encoder_inputs_mask): x = self.dropout_pre(x) for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask) + x = lyr(x, encoder_inputs_mask)[0] x = self.layer_norm(x) @@ -562,7 +441,7 @@ def __init__(self, config, weights): self.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) self.max_decoder_noise_time = config.max_decoder_noise_time - self.emb_dim = condig.d_model + self.emb_dim = config.d_model def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) @@ -609,7 +488,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) y = inputs for lyr in self.decoders: - y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb) + y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb)[0] y = self.decoder_norm(y) y = self.post_dropout(y) From ab829233087ba7e45fed12211521e48160c8a975 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 2 Nov 2022 21:57:32 +0300 Subject: [PATCH 005/131] use numpy for get_sequence_length --- .../pipeline_spectrogram_diffusion.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 82764042b81d..5179c0953b5f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,3 +1,5 @@ +import numpy as np + import torch import torch.nn as nn @@ -315,13 +317,13 @@ def __init__(self, config, weights): def get_sequence_length(self, sequence): # Return the first index where a 0 occurs. - length = torch.argmax(sequence == 0) + length = np.argmax(sequence == 0) # If argmax returns 0, that means that either # 1) No 0s were found, and the sequence length is the full length of the array # 2) There's padding immediately at the beginning, indicating that the array # is all padding and the sequence length is 0. 
- return torch.where(length == 0 and sequence[0] != 0, sequence.shape[0], length) + return np.where(length == 0 and sequence[0] != 0, sequence.shape[0], length).tolist() def forward(self, encoder_inputs, encoder_inputs_mask): x = self.input_proj(encoder_inputs) @@ -329,7 +331,7 @@ def forward(self, encoder_inputs, encoder_inputs_mask): # terminal relative positional encodings max_positions = encoder_inputs.shape[1] input_positions = torch.arange(max_positions, device=encoder_inputs.device) - seq_lens = self.get_sequence_length(encoder_inputs_mask) + seq_lens = self.get_sequence_length(encoder_inputs_mask.cpu().numpy()) input_positions = torch.roll(input_positions, seq_lens, dims=0) x += self.position_encoding(input_positions) From cdc6ec7eef389ea906d4c97e4e1b609bd815dc84 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 13:50:11 +0300 Subject: [PATCH 006/131] initial terminal relative positional encodings --- .../pipeline_spectrogram_diffusion.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 5179c0953b5f..042342a27261 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -315,24 +315,15 @@ def __init__(self, config, weights): self.dropout_post = nn.Dropout(p=config.dropout_rate) - def get_sequence_length(self, sequence): - # Return the first index where a 0 occurs. - length = np.argmax(sequence == 0) - - # If argmax returns 0, that means that either - # 1) No 0s were found, and the sequence length is the full length of the array - # 2) There's padding immediately at the beginning, indicating that the array - # is all padding and the sequence length is 0. 
- return np.where(length == 0 and sequence[0] != 0, sequence.shape[0], length).tolist() - def forward(self, encoder_inputs, encoder_inputs_mask): x = self.input_proj(encoder_inputs) # terminal relative positional encodings max_positions = encoder_inputs.shape[1] input_positions = torch.arange(max_positions, device=encoder_inputs.device) - seq_lens = self.get_sequence_length(encoder_inputs_mask.cpu().numpy()) - input_positions = torch.roll(input_positions, seq_lens, dims=0) + + seq_lens = encoder_inputs_mask.sum(-1) + input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) x += self.position_encoding(input_positions) x = self.dropout_pre(x) From c55fb5bb4a6945cd39f6f98738995689923a9605 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 14:22:44 +0300 Subject: [PATCH 007/131] fix weights keys --- .../pipeline_spectrogram_diffusion.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 042342a27261..52185f661a93 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -212,6 +212,7 @@ def __init__(self, config: T5Config, weights): self.dropout_pre = nn.Dropout(p=config.dropout_rate) + config.is_decoder = False config.is_encoder_decoder = False self.encoders = nn.ModuleList() for lyr_num in range(config.num_layers): @@ -280,6 +281,7 @@ def __init__(self, config, weights): self.dropout_pre = nn.Dropout(p=config.dropout_rate) + config.is_decoder = False config.is_encoder_decoder = False self.encoders = nn.ModuleList() for lyr_num in range(config.num_layers): @@ -366,9 +368,9 @@ def __init__(self, config, weights): self.dropout = nn.Dropout(p=config.dropout_rate) - self.decoders = nn.ModuleList() config.is_decoder = True config.is_encoder_decoder = False + self.decoders = nn.ModuleList() for lyr_num in range(config.num_decoder_layers): # FiLM conditional T5 decoder lyr = DecoderLayer(config) @@ -477,9 +479,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) inputs += position_encodings - inputs = self.dropout(inputs) - y = inputs - + y = self.dropout(inputs) for lyr in self.decoders: y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb)[0] @@ -494,9 +494,9 @@ class ContinuousContextTransformer(nn.Module): def __init__(self, config, weights): super().__init__() - self.token_encoder = TokenEncoder(config=config, weights=weights) - self.continuous_encoder = ContinuousEncoder(config=config, weights=weights) - self.decoder = Decoder(config=config, weights=weights) + self.token_encoder = TokenEncoder(config=config, weights=weights["token_encoder"]) + self.continuous_encoder = ContinuousEncoder(config=config, weights=weights["continuous_encoder"]) + self.decoder = Decoder(config=config, weights=weights["decoder"]) def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 From af673745d91ad31af510a91823fce588efac4ebe Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 16:16:38 +0300 Subject: [PATCH 008/131] fix assert --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py 
b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 52185f661a93..c760fbcfa6a1 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -456,7 +456,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) conditioning_emb = self.conditioning_emb(conditioning_emb) - assert conditioning_emb.shape == (batch, 1, self.emb_dim * 4) + assert conditioning_emb.shape == (batch, self.emb_dim * 4) seq_length = decoder_input_tokens.shape[1] From ef43fe0a0c893e78b92d61bad771c812e701288d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 16:54:23 +0300 Subject: [PATCH 009/131] cross attend style: concat encodings --- .../pipeline_spectrogram_diffusion.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c760fbcfa6a1..c589d2ce8194 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -481,7 +481,16 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) y = self.dropout(inputs) for lyr in self.decoders: - y = lyr(y, encodings_and_encdec_masks, conditioning_emb=conditioning_emb)[0] + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + + y = lyr( + y, + conditioning_emb=conditioning_emb, + encoder_hidden_states=encoded, + encoder_attention_mask=encoder_decoder_mask, + )[0] y = self.decoder_norm(y) y = self.post_dropout(y) From 6de0cfb163c2b8a89e7b7bd6513e2c9b0cb8a267 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 19:21:40 +0300 Subject: [PATCH 010/131] make style --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c589d2ce8194..baa10984636c 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,5 +1,4 @@ import numpy as np - import torch import torch.nn as nn From 5546c121dad5998956c51629c12866f693ed1384 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 3 Nov 2022 21:51:18 +0300 Subject: [PATCH 011/131] concat once --- .../pipeline_spectrogram_diffusion.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index baa10984636c..3e3c4feb16d8 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -479,11 +479,11 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) inputs += position_encodings y = self.dropout(inputs) - for lyr in self.decoders: - # cross attend style: concat encodings - encoded = 
torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) - encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + for lyr in self.decoders: y = lyr( y, conditioning_emb=conditioning_emb, From 8b32df3462ba1e5072dd914e297f7870d7159175 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 10:11:39 +0300 Subject: [PATCH 012/131] fix formatting --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 3e3c4feb16d8..48116424cde3 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -439,7 +439,6 @@ def __init__(self, config, weights): def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) - return mask.unsqueeze(-3) def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): From c69a3b902de252a9391217b0be5cf98b657228a1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 11:03:32 +0300 Subject: [PATCH 013/131] Initial SpectrogramPipeline --- .../pipeline_spectrogram_diffusion.py | 73 ++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 48116424cde3..83c24fefbe54 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,8 +1,9 @@ -import numpy as np +from typing import Optional +import math + import torch import torch.nn as nn -from diffusers.models.embeddings import get_timestep_embedding from transformers.models.t5.modeling_t5 import ( T5Attention, T5Block, @@ -12,6 +13,10 @@ T5LayerNorm, ) +from ...models.embeddings import get_timestep_embedding +from ...pipeline_utils import DiffusionPipeline +from ...schedulers import DDPMScheduler + class FiLMLayer(nn.Module): def __init__(self, in_features, out_features): @@ -547,3 +552,67 @@ def forward( input_tokens=decoder_input_tokens, noise_time=decoder_noise_time, ) + + +class SpectrogramPipeline(DiffusionPipeline): + def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: + super().__init__() + + # From MELGAN + self.min_value = math.log(1e-5) # Matches MelGAN training. + self.max_value = 4.0 # Largest value for most examples. + + self.register_modules(cont_context_trans=cont_context_trans, scheduler=scheduler) + + def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): + """Linearly scale features to network outputs range.""" + min_out, max_out = output_range + if clip: + features = torch.clip(features, self.min_value, self.max_value) + # Scale to [0, 1]. + zero_one = (features - self.min_value) / (self.max_value - self.min_value) + # Scale to [min_out, max_out]. 
+ return zero_one * (max_out - min_out) + min_out + + @torch.no_grad() + def __call__( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + generator: Optional[torch.Generator] = None, + num_inference_steps: int = 1000, + return_dict: bool = True, + predict_epsilon: bool = True, + **kwargs, + ): + target_shape = encoder_continuous_inputs.shape + encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) + + encodings_and_masks = self.cont_context_trans.encode( + encoder_input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + # Sample gaussian noise to begin loop + x = torch.randn(target_shape, generator=generator) + x = x.to(self.device) + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + + for t in self.progress_bar(self.scheduler.timesteps): + output = self.cont_context_trans.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=t, + ) + + # 2. compute previous output: x_t -> x_t-1 + x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample + + decode = self.scale_to_features(x, input_range=[-1.0, 1.0]) + + return decode From f7254db3f17031abf9c8b77217d049d5a0aa773f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 11:07:41 +0300 Subject: [PATCH 014/131] fix input_tokens --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 83c24fefbe54..5bdf0c341ad0 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -580,7 +580,6 @@ def __call__( encoder_input_tokens, encoder_continuous_inputs, encoder_continuous_mask, - decoder_input_tokens, generator: Optional[torch.Generator] = None, num_inference_steps: int = 1000, return_dict: bool = True, @@ -606,7 +605,7 @@ def __call__( for t in self.progress_bar(self.scheduler.timesteps): output = self.cont_context_trans.decode( encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, + input_tokens=x, noise_time=t, ) From 133d155b343caff53fe449a903301dbf3012eeb2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 4 Nov 2022 11:36:08 +0300 Subject: [PATCH 015/131] make style --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 5bdf0c341ad0..e12a1b0b8ceb 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,5 +1,5 @@ -from typing import Optional import math +from typing import Optional import torch import torch.nn as nn @@ -105,7 +105,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 @@ -540,7 +539,6 @@ def forward( decoder_input_tokens, decoder_noise_time, ): - 
encodings_and_masks = self.encode( input_tokens=encoder_input_tokens, continuous_inputs=encoder_continuous_inputs, From aa2323f06e9348ab3f46517392ec0cb2e05c8478 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 16:41:00 +0100 Subject: [PATCH 016/131] added mel output --- src/diffusers/pipeline_utils.py | 14 ++++++++++++++ .../pipeline_spectrogram_diffusion.py | 16 +++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 36c2d5b888ef..40a7924dad90 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -109,6 +109,20 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray +@dataclass +class MelPipelineOutput(BaseOutput): + """ + Output class for Mel pipelines. + + Args: + mels (`np.ndarray`) + List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the + denoised mel samples of the diffusion pipeline. + """ + + mels: np.ndarray + + class DiffusionPipeline(ConfigMixin): r""" Base class for all models. diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index e12a1b0b8ceb..c888819daae5 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -13,8 +13,10 @@ T5LayerNorm, ) +from ...configuration_utils import ConfigMixin, register_to_config +from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding -from ...pipeline_utils import DiffusionPipeline +from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler @@ -196,7 +198,8 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class TokenEncoder(nn.Module): +class TokenEncoder(ModelMixin, ConfigMixin): + @register_to_config def __init__(self, config: T5Config, weights): super().__init__() @@ -558,7 +561,7 @@ def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: # From MELGAN self.min_value = math.log(1e-5) # Matches MelGAN training. - self.max_value = 4.0 # Largest value for most examples. + self.max_value = 4.0 # Largest value for most examples self.register_modules(cont_context_trans=cont_context_trans, scheduler=scheduler) @@ -610,6 +613,9 @@ def __call__( # 2. 
compute previous output: x_t -> x_t-1 x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample - decode = self.scale_to_features(x, input_range=[-1.0, 1.0]) + mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) - return decode + if not return_dict: + return (mel,) + + return MelPipelineOutput(mels=mel) From c154878fd2888f755b52cc9110aab6dbfc26e8ef Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:04:45 +0100 Subject: [PATCH 017/131] ignore weights for config --- .../pipeline_spectrogram_diffusion.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c888819daae5..34d58aa706ad 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -199,6 +199,8 @@ def forward( class TokenEncoder(ModelMixin, ConfigMixin): + ignore_for_config = ["weights"] + @register_to_config def __init__(self, config: T5Config, weights): super().__init__() @@ -271,7 +273,8 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class ContinuousEncoder(nn.Module): +class ContinuousEncoder(ModelMixin, ConfigMixin): + @register_to_config def __init__(self, config, weights): super().__init__() @@ -344,8 +347,11 @@ def forward(self, encoder_inputs, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class Decoder(nn.Module): - def __init__(self, config, weights): +class Decoder(ModelMixin, ConfigMixin): + ignore_for_config = ["weights"] + + @register_to_config + def __init__(self, config: T5Config, weights): super().__init__() self.conditioning_emb = nn.Sequential( @@ -504,8 +510,11 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) return spec_out -class ContinuousContextTransformer(nn.Module): - def __init__(self, config, weights): +class ContinuousContextTransformer(ModelMixin, ConfigMixin): + ignore_for_config = ["weights"] + + @register_to_config + def __init__(self, config: T5Config, weights): super().__init__() self.token_encoder = TokenEncoder(config=config, weights=weights["token_encoder"]) From 63f69b6cb99cf21e51ef5db41cc1dd9aa682cc30 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:09:13 +0100 Subject: [PATCH 018/131] move mel to numpy --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 34d58aa706ad..0dd239e4a69f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -623,6 +623,7 @@ def __call__( x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) + mel = mel.cpu().numpy() if not return_dict: return (mel,) From 9808d06642b94ea4e6aba780437abef07b6133c8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:16:19 +0100 Subject: [PATCH 019/131] import pipeline --- src/diffusers/pipelines/__init__.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index bb3440b2bfbc..7f474a8b9774 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -10,6 +10,7 @@ from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline from .stochastic_karras_ve import KarrasVePipeline + from .spectrogram_diffusion import SpectrogramDiffusionPipeline else: from ..utils.dummy_pt_objects import * # noqa F403 From 49d95c0ce3cba464cdefd7262d358d85d04611de Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 7 Nov 2022 17:23:55 +0100 Subject: [PATCH 020/131] fix class names and import --- src/diffusers/pipelines/spectrogram_diffusion/__init__.py | 8 +++++++- .../pipeline_spectrogram_diffusion.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index a404e61c1217..85230f5e95d0 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,8 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder +from .pipeline_spectrogram_diffusion import ( + SpectrogramDiffusionPipeline, + ContinuousContextTransformer, + ContinuousEncoder, + Decoder, + TokenEncoder, +) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 0dd239e4a69f..ad3c7d0a545a 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -564,7 +564,7 @@ def forward( ) -class SpectrogramPipeline(DiffusionPipeline): +class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: super().__init__() From ce4a6582ccc9130cb4c0486f7ed4d8545e914869 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 09:20:39 +0100 Subject: [PATCH 021/131] moved models to models folder --- src/diffusers/models/__init__.py | 2 + src/diffusers/models/film.py | 27 + src/diffusers/models/t5_attention.py | 424 +++++++++++++ src/diffusers/pipeline_utils.py | 4 +- src/diffusers/pipelines/__init__.py | 2 +- .../spectrogram_diffusion/__init__.py | 8 +- .../pipeline_spectrogram_diffusion.py | 558 +----------------- 7 files changed, 458 insertions(+), 567 deletions(-) create mode 100644 src/diffusers/models/film.py create mode 100644 src/diffusers/models/t5_attention.py diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 5b101d169148..c3d524eddebc 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -17,6 +17,8 @@ if is_torch_available(): from .attention import Transformer2DModel + from .film import FiLMLayer + from .t5_attention import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder from .unet_1d import UNet1DModel from .unet_2d import UNet2DModel from .unet_2d_condition import UNet2DConditionModel diff --git a/src/diffusers/models/film.py b/src/diffusers/models/film.py new file mode 100644 index 000000000000..8936fd32276d --- /dev/null +++ b/src/diffusers/models/film.py @@ -0,0 +1,27 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn + + +class FiLMLayer(nn.Module): + def __init__(self, in_features, out_features): + super().__init__() + self.scale_bias = nn.Linear(in_features, out_features * 2) + + def forward(self, x, conditioning_emb): + scale_bias = self.scale_bias(conditioning_emb) + scale, bias = torch.chunk(scale_bias, 2, -1) + return x * (scale + 1.0) + bias diff --git a/src/diffusers/models/t5_attention.py b/src/diffusers/models/t5_attention.py new file mode 100644 index 000000000000..7d588a3113c4 --- /dev/null +++ b/src/diffusers/models/t5_attention.py @@ -0,0 +1,424 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn + +from transformers.models.t5.modeling_t5 import ( + T5Attention, + T5Block, + T5Config, + T5LayerCrossAttention, + T5LayerFF, + T5LayerNorm, +) + +from ..configuration_utils import ConfigMixin, register_to_config +from ..modeling_utils import ModelMixin +from .embeddings import get_timestep_embedding +from .film import FiLMLayer + + +class T5LayerSelfAttentionCond(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer_norm = T5LayerNorm(config.d_model) + self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + # pre_self_attention_layer_norm + normed_hidden_states = self.layer_norm(hidden_states) + + if conditioning_emb is not None: + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) + + # Self-attention block + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] + return outputs + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond 
self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class TokenEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config: T5Config): + super().__init__() + + self.token_embedder = nn.Embedding(config.vocab_size, config.d_model) + + self.position_encoding = nn.Embedding(config.max_length, config.d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_decoder = False + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config): + super().__init__() + + self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + + self.position_encoding = nn.Embedding(config.targets_context_length, config.d_model) + self.position_encoding.weight.requires_grad = False + + 
self.dropout_pre = nn.Dropout(p=config.dropout_rate) + + config.is_decoder = False + config.is_encoder_decoder = False + self.encoders = nn.ModuleList() + for lyr_num in range(config.num_layers): + lyr = T5Block(config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=config.d_model) + self.dropout_post = nn.Dropout(p=config.dropout_rate) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + + seq_lens = encoder_inputs_mask.sum(-1) + input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class Decoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config: T5Config): + super().__init__() + + self.conditioning_emb = nn.Sequential( + nn.Linear(config.d_model, config.d_model * 4, bias=False), + nn.SiLU(), + nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), + nn.SiLU(), + ) + + self.position_encoding = nn.Embedding(config.targets_length, config.d_model) + self.position_encoding.weight.requires_grad = False + + self.continuous_inputs_projection = nn.Linear(config.input_dims, config.d_model) + + self.dropout = nn.Dropout(p=config.dropout_rate) + + config.is_decoder = True + config.is_encoder_decoder = False + self.decoders = nn.ModuleList() + for lyr_num in range(config.num_decoder_layers): + # FiLM conditional T5 decoder + lyr = DecoderLayer(config) + self.decoders.append(lyr) + + self.decoder_norm = T5LayerNorm(config.d_model) + + self.post_dropout = nn.Dropout(p=config.dropout_rate) + self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) + + self.max_decoder_noise_time = config.max_decoder_noise_time + self.emb_dim = config.d_model + + def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): + mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) + return mask.unsqueeze(-3) + + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + batch, _, _ = decoder_input_tokens.shape + assert decoder_noise_time.shape == (batch,) + + # decoder_noise_time is in [0, 1), so rescale to expected timing range. + conditioning_emb = get_timestep_embedding( + decoder_noise_time * self.max_decoder_noise_time, + embedding_dim=self.emb_dim, + max_period=self.max_decoder_noise_time, + ) + + conditioning_emb = self.conditioning_emb(conditioning_emb) + + assert conditioning_emb.shape == (batch, self.emb_dim * 4) + + seq_length = decoder_input_tokens.shape[1] + + # If we want to use relative positions for audio context, we can just offset + # this sequence by the length of encodings_and_masks. + decoder_positions = torch.broadcast_to( + torch.arange(seq_length, device=decoder_input_tokens.device), + (batch, seq_length), + ) + + position_encodings = self.position_encoding(decoder_positions) + + # decoder: No padding present. + decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) + + # Translate encoding masks to encoder-decoder masks. 
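# Not part of the patch: a small, runnable shape check for the mask translation above and the
# "concat encodings" cross-attention that follows. Batch/length/width values are arbitrary
# placeholders, not the model's real configuration.
import torch

batch, d_model = 2, 16
target_len, tokens_len, context_len = 5, 7, 3

decoder_mask = torch.ones(batch, target_len)
tokens = (torch.randn(batch, tokens_len, d_model), torch.ones(batch, tokens_len))
continuous = (torch.randn(batch, context_len, d_model), torch.ones(batch, context_len))

def encoder_decoder_mask(query_input, key_input):
    # outer product of two (batch, length) masks, plus a broadcastable head axis
    return torch.mul(query_input.unsqueeze(-1), key_input.unsqueeze(-2)).unsqueeze(-3)

pairs = [(x, encoder_decoder_mask(decoder_mask, y)) for x, y in (tokens, continuous)]
encoded = torch.cat([x for x, _ in pairs], dim=1)       # torch.Size([2, 10, 16])
encdec_mask = torch.cat([m for _, m in pairs], dim=-1)  # torch.Size([2, 1, 5, 10])
print(encoded.shape, encdec_mask.shape)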
+ encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] + + inputs = self.continuous_inputs_projection(decoder_input_tokens) + + inputs += position_encodings + + y = self.dropout(inputs) + + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + for lyr in self.decoders: + y = lyr( + y, + conditioning_emb=conditioning_emb, + encoder_hidden_states=encoded, + encoder_attention_mask=encoder_decoder_mask, + )[0] + + y = self.decoder_norm(y) + y = self.post_dropout(y) + + spec_out = self.spec_out(y) + return spec_out + + +class ContinuousContextTransformer(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, config: T5Config): + super().__init__() + + self.token_encoder = TokenEncoder(config=config) + self.continuous_encoder = ContinuousEncoder(config=config) + self.decoder = Decoder(config=config) + + def encode(self, input_tokens, continuous_inputs, continuous_mask): + tokens_mask = input_tokens > 0 + tokens_encoded, tokens_mask = self.token_encoder( + encoder_input_tokens=input_tokens, + encoder_inputs_mask=tokens_mask, + ) + + continuous_encoded, continuous_mask = self.continuous_encoder( + encoder_inputs=continuous_inputs, + encoder_inputs_mask=continuous_mask, + ) + + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] + + def decode(self, encodings_and_masks, input_tokens, noise_time): + logits = self.decoder( + encodings_and_masks=encodings_and_masks, + decoder_input_tokens=input_tokens, + decoder_noise_time=noise_time, + ) + return logits + + def forward( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + decoder_noise_time, + ): + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + return self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=decoder_noise_time, + ) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 40a7924dad90..ca08c1b36ce3 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -116,8 +116,8 @@ class MelPipelineOutput(BaseOutput): Args: mels (`np.ndarray`) - List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the - denoised mel samples of the diffusion pipeline. + List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel + samples of the diffusion pipeline. 
""" mels: np.ndarray diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 7f474a8b9774..4b63ab8af277 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -9,8 +9,8 @@ from .pndm import PNDMPipeline from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline - from .stochastic_karras_ve import KarrasVePipeline from .spectrogram_diffusion import SpectrogramDiffusionPipeline + from .stochastic_karras_ve import KarrasVePipeline else: from ..utils.dummy_pt_objects import * # noqa F403 diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 85230f5e95d0..de37e892a7db 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,8 +1,2 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import ( - SpectrogramDiffusionPipeline, - ContinuousContextTransformer, - ContinuousEncoder, - Decoder, - TokenEncoder, -) +from .pipeline_spectrogram_diffusion import SpectrogramDiffusionPipeline diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index ad3c7d0a545a..69f1f092febf 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -2,568 +2,12 @@ from typing import Optional import torch -import torch.nn as nn -from transformers.models.t5.modeling_t5 import ( - T5Attention, - T5Block, - T5Config, - T5LayerCrossAttention, - T5LayerFF, - T5LayerNorm, -) - -from ...configuration_utils import ConfigMixin, register_to_config -from ...modeling_utils import ModelMixin -from ...models.embeddings import get_timestep_embedding +from ...models.t5_attention import ContinuousContextTransformer from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler -class FiLMLayer(nn.Module): - def __init__(self, in_features, out_features): - super().__init__() - self.scale_bias = nn.Linear(in_features, out_features * 2) - - def forward(self, x, conditioning_emb): - scale_bias = self.scale_bias(conditioning_emb) - scale, bias = torch.chunk(scale_bias, 2, -1) - return x * (scale + 1.0) + bias - - -class T5LayerSelfAttentionCond(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer_norm = T5LayerNorm(config.d_model) - self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) - self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - ): - # pre_self_attention_layer_norm - normed_hidden_states = self.layer_norm(hidden_states) - - if conditioning_emb is not None: - normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) - - # Self-attention block - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - 
output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them - return outputs - - -class DecoderLayer(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer = nn.ModuleList() - - # cond self attention: layer 0 - self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) - - # cross attention: layer 1 - self.layer.append(T5LayerCrossAttention(config)) - - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - if past_key_value is not None: - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - conditioning_emb=conditioning_emb, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - if encoder_hidden_states is not None: - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -class TokenEncoder(ModelMixin, ConfigMixin): - ignore_for_config = ["weights"] - - @register_to_config - def __init__(self, config: T5Config, weights): - super().__init__() - - self.token_embedder = nn.Embedding( - config.vocab_size, - config.d_model, - _weight=torch.FloatTensor(weights["token_embedder"]["embedding"]), - ) - - self.position_encoding = nn.Embedding( - config.max_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_decoder = False - config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - 
torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def forward(self, encoder_input_tokens, encoder_inputs_mask): - x = self.token_embedder(encoder_input_tokens) - - seq_length = encoder_input_tokens.shape[1] - inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) - x += self.position_encoding(inputs_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class ContinuousEncoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, config, weights): - super().__init__() - - self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) - self.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) - - self.position_encoding = nn.Embedding( - config.targets_context_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=config.dropout_rate) - - config.is_decoder = False - config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) - ly_weight = weights[f"layers_{lyr_num}"] - - attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) - - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) - - self.dropout_post = nn.Dropout(p=config.dropout_rate) - - def forward(self, encoder_inputs, encoder_inputs_mask): - x = self.input_proj(encoder_inputs) - - # terminal relative positional encodings - max_positions = encoder_inputs.shape[1] - input_positions = torch.arange(max_positions, device=encoder_inputs.device) - - seq_lens = encoder_inputs_mask.sum(-1) - input_positions = 
torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) - x += self.position_encoding(input_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class Decoder(ModelMixin, ConfigMixin): - ignore_for_config = ["weights"] - - @register_to_config - def __init__(self, config: T5Config, weights): - super().__init__() - - self.conditioning_emb = nn.Sequential( - nn.Linear(config.d_model, config.d_model * 4, bias=False), - nn.SiLU(), - nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), - nn.SiLU(), - ) - self.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) - self.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) - - self.position_encoding = nn.Embedding( - config.targets_length, - config.d_model, - _weight=torch.FloatTensor(weights["Embed_0"]["embedding"]), - ) - self.position_encoding.weight.requires_grad = False - - self.continuous_inputs_projection = nn.Linear( - config.input_dims, - config.d_model, - ) - self.continuous_inputs_projection.weight = nn.Parameter( - torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) - ) - - self.dropout = nn.Dropout(p=config.dropout_rate) - - config.is_decoder = True - config.is_encoder_decoder = False - self.decoders = nn.ModuleList() - for lyr_num in range(config.num_decoder_layers): - # FiLM conditional T5 decoder - lyr = DecoderLayer(config) - ly_weight = weights[f"layers_{lyr_num}"] - - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) - ) - - lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) - ) - - attention_weights = ly_weight["self_attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - - attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T) - ) - lyr.layer[1].EncDecAttention.k.weight = nn.Parameter( - torch.FloatTensor(attention_weights["key"]["kernel"].T) - ) - lyr.layer[1].EncDecAttention.v.weight = nn.Parameter( - torch.FloatTensor(attention_weights["value"]["kernel"].T) - ) - lyr.layer[1].EncDecAttention.o.weight = nn.Parameter( - torch.FloatTensor(attention_weights["out"]["kernel"].T) - ) - - lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) - ) - - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - lyr.layer[3].scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) - ) - - lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter( - torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T) - ) - 
lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - - self.decoders.append(lyr) - - self.decoder_norm = T5LayerNorm(config.d_model) - self.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) - - self.post_dropout = nn.Dropout(p=config.dropout_rate) - self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) - self.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) - - self.max_decoder_noise_time = config.max_decoder_noise_time - self.emb_dim = config.d_model - - def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): - mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) - return mask.unsqueeze(-3) - - def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): - batch, _, _ = decoder_input_tokens.shape - assert decoder_noise_time.shape == (batch,) - - # decoder_noise_time is in [0, 1), so rescale to expected timing range. - conditioning_emb = get_timestep_embedding( - decoder_noise_time * self.max_decoder_noise_time, - embedding_dim=self.emb_dim, - max_period=self.max_decoder_noise_time, - ) - - conditioning_emb = self.conditioning_emb(conditioning_emb) - - assert conditioning_emb.shape == (batch, self.emb_dim * 4) - - seq_length = decoder_input_tokens.shape[1] - - # If we want to use relative positions for audio context, we can just offset - # this sequence by the length of encodings_and_masks. - decoder_positions = torch.broadcast_to( - torch.arange(seq_length, device=decoder_input_tokens.device), - (batch, seq_length), - ) - - position_encodings = self.position_encoding(decoder_positions) - - # decoder: No padding present. - decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) - - # Translate encoding masks to encoder-decoder masks. 
- encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - - inputs = self.continuous_inputs_projection(decoder_input_tokens) - - inputs += position_encodings - - y = self.dropout(inputs) - - # cross attend style: concat encodings - encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) - encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) - for lyr in self.decoders: - y = lyr( - y, - conditioning_emb=conditioning_emb, - encoder_hidden_states=encoded, - encoder_attention_mask=encoder_decoder_mask, - )[0] - - y = self.decoder_norm(y) - y = self.post_dropout(y) - - spec_out = self.spec_out(y) - return spec_out - - -class ContinuousContextTransformer(ModelMixin, ConfigMixin): - ignore_for_config = ["weights"] - - @register_to_config - def __init__(self, config: T5Config, weights): - super().__init__() - - self.token_encoder = TokenEncoder(config=config, weights=weights["token_encoder"]) - self.continuous_encoder = ContinuousEncoder(config=config, weights=weights["continuous_encoder"]) - self.decoder = Decoder(config=config, weights=weights["decoder"]) - - def encode(self, input_tokens, continuous_inputs, continuous_mask): - tokens_mask = input_tokens > 0 - tokens_encoded, tokens_mask = self.token_encoder( - encoder_input_tokens=input_tokens, - encoder_inputs_mask=tokens_mask, - ) - - continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs, - encoder_inputs_mask=continuous_mask, - ) - - return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] - - def decode(self, encodings_and_masks, input_tokens, noise_time): - logits = self.decoder( - encodings_and_masks=encodings_and_masks, - decoder_input_tokens=input_tokens, - decoder_noise_time=noise_time, - ) - return logits - - def forward( - self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, - decoder_input_tokens, - decoder_noise_time, - ): - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, - ) - - return self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, - noise_time=decoder_noise_time, - ) - - class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: super().__init__() From b3caf357cbfd9eb199eb05232e2f8b0a73555084 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 09:54:08 +0100 Subject: [PATCH 022/131] import ContinuousContextTransformer and SpectrogramDiffusionPipeline --- src/diffusers/__init__.py | 11 ++++++- .../dummy_torch_and_accelerate_objects.py | 30 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 22b6589973a0..02521e8b6fd9 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -26,7 +26,15 @@ if is_torch_available(): from .modeling_utils import ModelMixin - from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel + from .models import ( + AutoencoderKL, + ContinuousContextTransformer, + Transformer2DModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + VQModel, + ) from .optimization import ( get_constant_schedule, get_constant_schedule_with_warmup, @@ -46,6 +54,7 @@ PNDMPipeline, RePaintPipeline, 
ScoreSdeVePipeline, + SpectrogramDiffusionPipeline, ) from .schedulers import ( DDIMScheduler, diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py index 335e3ca24d2a..e2a2046969fe 100644 --- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py +++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py @@ -34,6 +34,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) +class ContinuousContextTransformer(metaclass=DummyObject): + _backends = ["torch", "accelerate"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "accelerate"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + class Transformer2DModel(metaclass=DummyObject): _backends = ["torch", "accelerate"] @@ -272,6 +287,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) +class SpectrogramDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "accelerate"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "accelerate"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "accelerate"]) + + class DDIMScheduler(metaclass=DummyObject): _backends = ["torch", "accelerate"] From 593e2aa070621a54ef27b4b258d2670df539cdd1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 09:57:57 +0100 Subject: [PATCH 023/131] initial spec diffusion converstion script --- .../convert_music_spectrogram_to_diffusers.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 scripts/convert_music_spectrogram_to_diffusers.py diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py new file mode 100644 index 000000000000..1b2bdd2fb3bd --- /dev/null +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import argparse +import os + +import jax +import tensorflow as tf + +from t5x import checkpoints +from music_spectrogram_diffusion import inference + +from transformers import T5Config + +from diffusers import DDPMScheduler, ContinuousContextTransformer, SpectrogramDiffusionPipeline + +MODEL = "base_with_context" + + +def main(args): + t5_checkpoint = checkpoints.load_t5x_checkpoint(args.checkpoint_path) + + gin_overrides = [ + "from __gin__ import dynamic_registration", + "from music_spectrogram_diffusion.models.diffusion import diffusion_utils", + "diffusion_utils.ClassifierFreeGuidanceConfig.eval_condition_weight = 2.0", + "diffusion_utils.DiffusionConfig.classifier_free_guidance = @diffusion_utils.ClassifierFreeGuidanceConfig()", + ] + + gin_file = os.path.join(args.checkpoint_path, "..", "config.gin") + gin_config = inference.parse_training_gin_file(gin_file, gin_overrides) + synth_model = inference.InferenceModel(args.checkpoint_path, gin_config) + + t5config = T5Config( + vocab_size=synth_model.model.module.config.vocab_size, + max_length=synth_model.sequence_length["inputs"], + input_dims=synth_model.audio_codec.n_dims, + targets_context_length=synth_model.sequence_length["targets_context"], + targets_length=synth_model.sequence_length["targets"], + 
d_model=synth_model.model.module.config.emb_dim, + num_heads=synth_model.model.module.config.num_heads, + num_layers=synth_model.model.module.config.num_encoder_layers, + num_decoder_layers=synth_model.model.module.config.num_decoder_layers, + d_kv=synth_model.model.module.config.head_dim, + d_ff=synth_model.model.module.config.mlp_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + feed_forward_proj=synth_model.model.module.config.mlp_activations[0], + is_gated_act=True, + max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, + ) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") + model = ContinuousContextTransformer(config=t5config) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") + parser.add_argument( + "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." + ) + parser.add_argument( + "--checkpoint_path", + default=f"{MODEL}/checkpoint_500000", + type=str, + required=True, + help="Path to the original jax model checkpoint.", + ) + args = parser.parse_args() + + main(args) From c7077995a4bbf171d03ab13ebde5569fdd2fd383 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 8 Nov 2022 11:02:51 +0100 Subject: [PATCH 024/131] renamed config to t5config --- .../convert_music_spectrogram_to_diffusers.py | 14 ++-- src/diffusers/models/t5_attention.py | 83 +++++++++---------- 2 files changed, 47 insertions(+), 50 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 1b2bdd2fb3bd..6810a9bfad6a 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -4,10 +4,10 @@ import jax import tensorflow as tf +import torch from t5x import checkpoints from music_spectrogram_diffusion import inference - from transformers import T5Config from diffusers import DDPMScheduler, ContinuousContextTransformer, SpectrogramDiffusionPipeline @@ -49,21 +49,21 @@ def main(args): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") - model = ContinuousContextTransformer(config=t5config) + model = ContinuousContextTransformer(t5config=t5config) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") - parser.add_argument( - "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." - ) + # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") + # parser.add_argument( + # "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." 
+ # ) parser.add_argument( "--checkpoint_path", default=f"{MODEL}/checkpoint_500000", type=str, - required=True, + required=False, help="Path to the original jax model checkpoint.", ) args = parser.parse_args() diff --git a/src/diffusers/models/t5_attention.py b/src/diffusers/models/t5_attention.py index 7d588a3113c4..cb4617bf188c 100644 --- a/src/diffusers/models/t5_attention.py +++ b/src/diffusers/models/t5_attention.py @@ -199,25 +199,25 @@ def forward( class TokenEncoder(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config: T5Config): + def __init__(self, t5config: T5Config): super().__init__() - self.token_embedder = nn.Embedding(config.vocab_size, config.d_model) + self.token_embedder = nn.Embedding(t5config.vocab_size, t5config.d_model) - self.position_encoding = nn.Embedding(config.max_length, config.d_model) + self.position_encoding = nn.Embedding(t5config.max_length, t5config.d_model) self.position_encoding.weight.requires_grad = False - self.dropout_pre = nn.Dropout(p=config.dropout_rate) + self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - config.is_decoder = False - config.is_encoder_decoder = False + t5config.is_decoder = False + t5config.is_encoder_decoder = False self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) + for lyr_num in range(t5config.num_layers): + lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.dropout_post = nn.Dropout(p=config.dropout_rate) + self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) + self.dropout_post = nn.Dropout(p=t5config.dropout_rate) def forward(self, encoder_input_tokens, encoder_inputs_mask): x = self.token_embedder(encoder_input_tokens) @@ -237,25 +237,25 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): class ContinuousEncoder(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config): + def __init__(self, t5config): super().__init__() - self.input_proj = nn.Linear(config.input_dims, config.d_model, bias=False) + self.input_proj = nn.Linear(t5config.input_dims, t5config.d_model, bias=False) - self.position_encoding = nn.Embedding(config.targets_context_length, config.d_model) + self.position_encoding = nn.Embedding(t5config.targets_context_length, t5config.d_model) self.position_encoding.weight.requires_grad = False - self.dropout_pre = nn.Dropout(p=config.dropout_rate) + self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - config.is_decoder = False - config.is_encoder_decoder = False + t5config.is_decoder = False + t5config.is_encoder_decoder = False self.encoders = nn.ModuleList() - for lyr_num in range(config.num_layers): - lyr = T5Block(config) + for lyr_num in range(t5config.num_layers): + lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=config.d_model) - self.dropout_post = nn.Dropout(p=config.dropout_rate) + self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) + self.dropout_post = nn.Dropout(p=t5config.dropout_rate) def forward(self, encoder_inputs, encoder_inputs_mask): x = self.input_proj(encoder_inputs) @@ -279,38 +279,35 @@ def forward(self, encoder_inputs, encoder_inputs_mask): class Decoder(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config: T5Config): + def __init__(self, t5config: T5Config): super().__init__() self.conditioning_emb = nn.Sequential( - nn.Linear(config.d_model, config.d_model * 4, bias=False), + nn.Linear(t5config.d_model, t5config.d_model * 4, 
bias=False), nn.SiLU(), - nn.Linear(config.d_model * 4, config.d_model * 4, bias=False), + nn.Linear(t5config.d_model * 4, t5config.d_model * 4, bias=False), nn.SiLU(), ) - self.position_encoding = nn.Embedding(config.targets_length, config.d_model) + self.position_encoding = nn.Embedding(t5config.targets_length, t5config.d_model) self.position_encoding.weight.requires_grad = False - self.continuous_inputs_projection = nn.Linear(config.input_dims, config.d_model) + self.continuous_inputs_projection = nn.Linear(t5config.input_dims, t5config.d_model) - self.dropout = nn.Dropout(p=config.dropout_rate) + self.dropout = nn.Dropout(p=t5config.dropout_rate) - config.is_decoder = True - config.is_encoder_decoder = False + t5config.is_decoder = True + t5config.is_encoder_decoder = False self.decoders = nn.ModuleList() - for lyr_num in range(config.num_decoder_layers): + for lyr_num in range(t5config.num_decoder_layers): # FiLM conditional T5 decoder - lyr = DecoderLayer(config) + lyr = DecoderLayer(t5config) self.decoders.append(lyr) - self.decoder_norm = T5LayerNorm(config.d_model) + self.decoder_norm = T5LayerNorm(t5config.d_model) - self.post_dropout = nn.Dropout(p=config.dropout_rate) - self.spec_out = nn.Linear(config.d_model, config.input_dims, bias=False) - - self.max_decoder_noise_time = config.max_decoder_noise_time - self.emb_dim = config.d_model + self.post_dropout = nn.Dropout(p=t5config.dropout_rate) + self.spec_out = nn.Linear(t5config.d_model, t5config.input_dims, bias=False) def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) @@ -322,14 +319,14 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) # decoder_noise_time is in [0, 1), so rescale to expected timing range. 
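# For intuition: the rescaled noise time below is turned into a sinusoidal embedding
# before being fed to the conditioning MLP. A self-contained sketch of that technique
# (not the diffusers get_timestep_embedding implementation; the 2000.0 rescale factor
# and the 768 width are assumed values for illustration only):
import math
import torch

def sinusoidal_embedding(timesteps: torch.Tensor, embedding_dim: int, max_period: float = 10000.0) -> torch.Tensor:
    half_dim = embedding_dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half_dim, dtype=torch.float32) / half_dim)
    angles = timesteps.float()[:, None] * freqs[None, :]
    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

noise_time = torch.rand(4)                                          # batch of times in [0, 1)
emb = sinusoidal_embedding(noise_time * 2000.0, embedding_dim=768)  # rescaled by an assumed max_decoder_noise_time
print(emb.shape)                                                    # torch.Size([4, 768])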
conditioning_emb = get_timestep_embedding( - decoder_noise_time * self.max_decoder_noise_time, - embedding_dim=self.emb_dim, + decoder_noise_time * self.config.t5config.max_decoder_noise_time, + embedding_dim=self.config.t5config.d_model, max_period=self.max_decoder_noise_time, ) conditioning_emb = self.conditioning_emb(conditioning_emb) - assert conditioning_emb.shape == (batch, self.emb_dim * 4) + assert conditioning_emb.shape == (batch, self.config.t5config.d_model * 4) seq_length = decoder_input_tokens.shape[1] @@ -374,12 +371,12 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) class ContinuousContextTransformer(ModelMixin, ConfigMixin): @register_to_config - def __init__(self, config: T5Config): + def __init__(self, t5config: T5Config): super().__init__() - self.token_encoder = TokenEncoder(config=config) - self.continuous_encoder = ContinuousEncoder(config=config) - self.decoder = Decoder(config=config) + self.token_encoder = TokenEncoder(t5config=t5config) + self.continuous_encoder = ContinuousEncoder(t5config=t5config) + self.decoder = Decoder(t5config=t5config) def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 From 55bb6ddcd8c08cbf7cbcdf322a9bdeb7a6d7a915 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 9 Nov 2022 19:12:52 +0100 Subject: [PATCH 025/131] added weight loading --- .../convert_music_spectrogram_to_diffusers.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 6810a9bfad6a..7386a0fdcfbd 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -5,6 +5,7 @@ import jax import tensorflow as tf import torch +import torch.nn as nn from t5x import checkpoints from music_spectrogram_diffusion import inference @@ -15,6 +16,117 @@ MODEL = "base_with_context" +def load_token_encoder(weights, model): + model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) + model.position_encoding.weight = nn.Parameter( + torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False + ) + for lyr_num, lyr in enumerate(model.encoders): + ly_weight = weights[f"layers_{lyr_num}"] + attention_weights = ly_weight["attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + +def load_continuous_encoder(weights, model): + 
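# Why the repeated `.T` in these loader functions: a T5X/Flax Dense kernel is stored as
# (in_features, out_features), whereas torch.nn.Linear.weight is (out_features,
# in_features), so each kernel is transposed when copied over. Self-contained check
# with made-up shapes (illustrative only):
import numpy as np
import torch
import torch.nn as nn

kernel = np.random.randn(128, 512).astype(np.float32)    # Flax-style (in, out)
proj = nn.Linear(128, 512, bias=False)
proj.weight = nn.Parameter(torch.FloatTensor(kernel.T))  # torch wants (out, in)

x = torch.randn(2, 128)
reference = x @ torch.from_numpy(kernel)                 # the same projection, done by hand
assert torch.allclose(proj(x), reference, atol=1e-4)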
model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) + + model.position_encoding.weight = nn.Parameter( + torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False + ) + + for lyr_num, lyr in enumerate(model.encoders): + ly_weight = weights[f"layers_{lyr_num}"] + attention_weights = ly_weight["attention"] + + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + + +def load_decoder(weights, model): + model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) + model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) + + model.position_encoding.weight = nn.Parameter( + torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False + ) + + model.continuous_inputs_projection.weight = nn.Parameter( + torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) + ) + + for lyr_num, lyr in enumerate(model.decoders): + ly_weight = weights[f"layers_{lyr_num}"] + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + ) + + lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) + ) + + attention_weights = ly_weight["self_attention"] + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + + attention_weights = ly_weight["MultiHeadDotProductAttention_0"] + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) + lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) + lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + + lyr.layer[1].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + ) + + lyr.layer[2].weight = 
nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + + lyr.layer[3].scale_bias.weight = nn.Parameter( + torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) + ) + + lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + + model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + + model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + + +def load_checkpoint(t5_checkpoint, model): + load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) + load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) + load_decoder(t5_checkpoint["decoder"], model.decoder) + return model + + def main(args): t5_checkpoint = checkpoints.load_t5x_checkpoint(args.checkpoint_path) @@ -49,10 +161,17 @@ def main(args): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") + model = ContinuousContextTransformer(t5config=t5config) + model = load_checkpoint(t5_checkpoint["target"], model).to(device) + + pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) + pipe.save_pretrained(args.output_path) if __name__ == "__main__": + jax.config.update("jax_platform_name", "cpu") + parser = argparse.ArgumentParser() # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") From 7cb32d7da4ca91630536f162be3a77a307919bfc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 10:27:37 +0100 Subject: [PATCH 026/131] use arguments instead of t5config --- .../convert_music_spectrogram_to_diffusers.py | 32 +- src/diffusers/__init__.py | 10 +- src/diffusers/models/__init__.py | 2 - src/diffusers/models/film.py | 27 - src/diffusers/models/t5_attention.py | 421 -------------- .../spectrogram_diffusion/__init__.py | 2 +- .../pipeline_spectrogram_diffusion.py | 525 +++++++++++++++++- .../dummy_torch_and_accelerate_objects.py | 15 - 8 files changed, 542 insertions(+), 492 deletions(-) delete mode 100644 src/diffusers/models/film.py delete mode 100644 src/diffusers/models/t5_attention.py diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 7386a0fdcfbd..52d7785fff62 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -2,16 +2,16 @@ import argparse import os -import jax -import tensorflow as tf import torch import torch.nn as nn -from t5x import checkpoints +import jax +import tensorflow as tf +from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline +from diffusers.pipelines.spectrogram_diffusion import ContinuousContextTransformer from music_spectrogram_diffusion import inference -from transformers import T5Config +from t5x import checkpoints -from diffusers import DDPMScheduler, ContinuousContextTransformer, SpectrogramDiffusionPipeline MODEL = "base_with_context" @@ -141,7 +141,10 @@ def main(args): gin_config = inference.parse_training_gin_file(gin_file, gin_overrides) synth_model = inference.InferenceModel(args.checkpoint_path, gin_config) - t5config = T5Config( + device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") + scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") + + model = ContinuousContextTransformer( vocab_size=synth_model.model.module.config.vocab_size, max_length=synth_model.sequence_length["inputs"], input_dims=synth_model.audio_codec.n_dims, @@ -149,23 +152,20 @@ def main(args): targets_length=synth_model.sequence_length["targets"], d_model=synth_model.model.module.config.emb_dim, num_heads=synth_model.model.module.config.num_heads, - num_layers=synth_model.model.module.config.num_encoder_layers, + num_encoder_layers=synth_model.model.module.config.num_encoder_layers, num_decoder_layers=synth_model.model.module.config.num_decoder_layers, d_kv=synth_model.model.module.config.head_dim, d_ff=synth_model.model.module.config.mlp_dim, dropout_rate=synth_model.model.module.config.dropout_rate, - feed_forward_proj=synth_model.model.module.config.mlp_activations[0], - is_gated_act=True, + feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") - - model = ContinuousContextTransformer(t5config=t5config) - model = load_checkpoint(t5_checkpoint["target"], model).to(device) + model = load_checkpoint(t5_checkpoint["target"], model) pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) + import pdb + + pdb.set_trace() pipe.save_pretrained(args.output_path) @@ -174,7 +174,7 @@ def main(args): parser = argparse.ArgumentParser() - # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") + # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") # parser.add_argument( # "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." # ) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 02521e8b6fd9..6c69aa31bc82 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -26,15 +26,7 @@ if is_torch_available(): from .modeling_utils import ModelMixin - from .models import ( - AutoencoderKL, - ContinuousContextTransformer, - Transformer2DModel, - UNet1DModel, - UNet2DConditionModel, - UNet2DModel, - VQModel, - ) + from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel from .optimization import ( get_constant_schedule, get_constant_schedule_with_warmup, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index c3d524eddebc..5b101d169148 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -17,8 +17,6 @@ if is_torch_available(): from .attention import Transformer2DModel - from .film import FiLMLayer - from .t5_attention import ContinuousContextTransformer, ContinuousEncoder, Decoder, TokenEncoder from .unet_1d import UNet1DModel from .unet_2d import UNet2DModel from .unet_2d_condition import UNet2DConditionModel diff --git a/src/diffusers/models/film.py b/src/diffusers/models/film.py deleted file mode 100644 index 8936fd32276d..000000000000 --- a/src/diffusers/models/film.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. 
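# The FiLM layer deleted from models/film.py below (and re-created inside
# pipeline_spectrogram_diffusion.py in this same patch) performs feature-wise linear
# modulation: the conditioning embedding predicts a per-channel scale and bias for the
# hidden states. Tiny usage sketch with made-up sizes (d_model=8, conditioning width=32):
import torch
import torch.nn as nn

class FiLMSketch(nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.scale_bias = nn.Linear(in_features, out_features * 2)

    def forward(self, x, conditioning_emb):
        scale, bias = torch.chunk(self.scale_bias(conditioning_emb), 2, dim=-1)
        return x * (scale + 1.0) + bias

film = FiLMSketch(in_features=32, out_features=8)
hidden = torch.randn(2, 16, 8)          # (batch, sequence, d_model)
cond = torch.randn(2, 1, 32)            # broadcast across the sequence dimension
print(film(hidden, cond).shape)         # torch.Size([2, 16, 8])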
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn - - -class FiLMLayer(nn.Module): - def __init__(self, in_features, out_features): - super().__init__() - self.scale_bias = nn.Linear(in_features, out_features * 2) - - def forward(self, x, conditioning_emb): - scale_bias = self.scale_bias(conditioning_emb) - scale, bias = torch.chunk(scale_bias, 2, -1) - return x * (scale + 1.0) + bias diff --git a/src/diffusers/models/t5_attention.py b/src/diffusers/models/t5_attention.py deleted file mode 100644 index cb4617bf188c..000000000000 --- a/src/diffusers/models/t5_attention.py +++ /dev/null @@ -1,421 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn - -from transformers.models.t5.modeling_t5 import ( - T5Attention, - T5Block, - T5Config, - T5LayerCrossAttention, - T5LayerFF, - T5LayerNorm, -) - -from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin -from .embeddings import get_timestep_embedding -from .film import FiLMLayer - - -class T5LayerSelfAttentionCond(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer_norm = T5LayerNorm(config.d_model) - self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) - self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - ): - # pre_self_attention_layer_norm - normed_hidden_states = self.layer_norm(hidden_states) - - if conditioning_emb is not None: - normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) - - # Self-attention block - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] - return outputs - - -class DecoderLayer(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.layer = nn.ModuleList() - - # 
cond self attention: layer 0 - self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) - - # cross attention: layer 1 - self.layer.append(T5LayerCrossAttention(config)) - - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - if past_key_value is not None: - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - conditioning_emb=conditioning_emb, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - if encoder_hidden_states is not None: - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -class TokenEncoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config: T5Config): - super().__init__() - - self.token_embedder = nn.Embedding(t5config.vocab_size, t5config.d_model) - - self.position_encoding = nn.Embedding(t5config.max_length, t5config.d_model) - self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - - t5config.is_decoder = False - t5config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(t5config.num_layers): - lyr = T5Block(t5config) - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) - self.dropout_post = nn.Dropout(p=t5config.dropout_rate) - - def forward(self, encoder_input_tokens, encoder_inputs_mask): - x = self.token_embedder(encoder_input_tokens) - - seq_length = encoder_input_tokens.shape[1] - inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) - x += self.position_encoding(inputs_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class ContinuousEncoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config): - super().__init__() - - self.input_proj = nn.Linear(t5config.input_dims, t5config.d_model, bias=False) - - self.position_encoding = nn.Embedding(t5config.targets_context_length, t5config.d_model) - 
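# Both the t5_attention modules being removed here and the pipeline modules that replace
# them register their constructor arguments with @register_to_config and later read them
# back through `self.config`. A toy version of that pattern (not the diffusers
# implementation; keyword arguments only, for illustration):
import types

def register_to_config_sketch(init):
    def wrapper(self, *args, **kwargs):
        self.config = types.SimpleNamespace(**kwargs)  # stash kwargs for later lookup
        return init(self, *args, **kwargs)
    return wrapper

class TinyModule:
    @register_to_config_sketch
    def __init__(self, d_model: int = 768, dropout_rate: float = 0.1):
        self.proj_width = d_model * 4

m = TinyModule(d_model=512, dropout_rate=0.0)
print(m.config.d_model)  # 512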
self.position_encoding.weight.requires_grad = False - - self.dropout_pre = nn.Dropout(p=t5config.dropout_rate) - - t5config.is_decoder = False - t5config.is_encoder_decoder = False - self.encoders = nn.ModuleList() - for lyr_num in range(t5config.num_layers): - lyr = T5Block(t5config) - self.encoders.append(lyr) - - self.layer_norm = T5LayerNorm(hidden_size=t5config.d_model) - self.dropout_post = nn.Dropout(p=t5config.dropout_rate) - - def forward(self, encoder_inputs, encoder_inputs_mask): - x = self.input_proj(encoder_inputs) - - # terminal relative positional encodings - max_positions = encoder_inputs.shape[1] - input_positions = torch.arange(max_positions, device=encoder_inputs.device) - - seq_lens = encoder_inputs_mask.sum(-1) - input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) - x += self.position_encoding(input_positions) - - x = self.dropout_pre(x) - - for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] - x = self.layer_norm(x) - - return self.dropout_post(x), encoder_inputs_mask - - -class Decoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config: T5Config): - super().__init__() - - self.conditioning_emb = nn.Sequential( - nn.Linear(t5config.d_model, t5config.d_model * 4, bias=False), - nn.SiLU(), - nn.Linear(t5config.d_model * 4, t5config.d_model * 4, bias=False), - nn.SiLU(), - ) - - self.position_encoding = nn.Embedding(t5config.targets_length, t5config.d_model) - self.position_encoding.weight.requires_grad = False - - self.continuous_inputs_projection = nn.Linear(t5config.input_dims, t5config.d_model) - - self.dropout = nn.Dropout(p=t5config.dropout_rate) - - t5config.is_decoder = True - t5config.is_encoder_decoder = False - self.decoders = nn.ModuleList() - for lyr_num in range(t5config.num_decoder_layers): - # FiLM conditional T5 decoder - lyr = DecoderLayer(t5config) - self.decoders.append(lyr) - - self.decoder_norm = T5LayerNorm(t5config.d_model) - - self.post_dropout = nn.Dropout(p=t5config.dropout_rate) - self.spec_out = nn.Linear(t5config.d_model, t5config.input_dims, bias=False) - - def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): - mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) - return mask.unsqueeze(-3) - - def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): - batch, _, _ = decoder_input_tokens.shape - assert decoder_noise_time.shape == (batch,) - - # decoder_noise_time is in [0, 1), so rescale to expected timing range. - conditioning_emb = get_timestep_embedding( - decoder_noise_time * self.config.t5config.max_decoder_noise_time, - embedding_dim=self.config.t5config.d_model, - max_period=self.max_decoder_noise_time, - ) - - conditioning_emb = self.conditioning_emb(conditioning_emb) - - assert conditioning_emb.shape == (batch, self.config.t5config.d_model * 4) - - seq_length = decoder_input_tokens.shape[1] - - # If we want to use relative positions for audio context, we can just offset - # this sequence by the length of encodings_and_masks. - decoder_positions = torch.broadcast_to( - torch.arange(seq_length, device=decoder_input_tokens.device), - (batch, seq_length), - ) - - position_encodings = self.position_encoding(decoder_positions) - - # decoder: No padding present. - decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) - - # Translate encoding masks to encoder-decoder masks. 
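# The encoder-decoder mask built below is just an outer product of the decoder query
# mask and each encoder key mask, with an extra axis for the attention heads. Small
# self-contained check (batch=1, 3 decoder steps, 4 encoder tokens, last one padding):
import torch

query_mask = torch.ones(1, 3)                       # decoder side: no padding
key_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])     # encoder side: last token is padding
mask = (query_mask.unsqueeze(-1) * key_mask.unsqueeze(-2)).unsqueeze(-3)
print(mask.shape)   # torch.Size([1, 1, 3, 4])  ->  (batch, heads, queries, keys)
print(mask[0, 0])   # each decoder step may attend to the first three encoder tokens only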
- encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - - inputs = self.continuous_inputs_projection(decoder_input_tokens) - - inputs += position_encodings - - y = self.dropout(inputs) - - # cross attend style: concat encodings - encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) - encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) - for lyr in self.decoders: - y = lyr( - y, - conditioning_emb=conditioning_emb, - encoder_hidden_states=encoded, - encoder_attention_mask=encoder_decoder_mask, - )[0] - - y = self.decoder_norm(y) - y = self.post_dropout(y) - - spec_out = self.spec_out(y) - return spec_out - - -class ContinuousContextTransformer(ModelMixin, ConfigMixin): - @register_to_config - def __init__(self, t5config: T5Config): - super().__init__() - - self.token_encoder = TokenEncoder(t5config=t5config) - self.continuous_encoder = ContinuousEncoder(t5config=t5config) - self.decoder = Decoder(t5config=t5config) - - def encode(self, input_tokens, continuous_inputs, continuous_mask): - tokens_mask = input_tokens > 0 - tokens_encoded, tokens_mask = self.token_encoder( - encoder_input_tokens=input_tokens, - encoder_inputs_mask=tokens_mask, - ) - - continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs, - encoder_inputs_mask=continuous_mask, - ) - - return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] - - def decode(self, encodings_and_masks, input_tokens, noise_time): - logits = self.decoder( - encodings_and_masks=encodings_and_masks, - decoder_input_tokens=input_tokens, - decoder_noise_time=noise_time, - ) - return logits - - def forward( - self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, - decoder_input_tokens, - decoder_noise_time, - ): - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, - ) - - return self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, - noise_time=decoder_noise_time, - ) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index de37e892a7db..850f9f7fba6d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from .pipeline_spectrogram_diffusion import SpectrogramDiffusionPipeline +from .pipeline_spectrogram_diffusion import ContinuousContextTransformer, SpectrogramDiffusionPipeline diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 69f1f092febf..35ada88c9e9e 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -2,12 +2,535 @@ from typing import Optional import torch +import torch.nn as nn -from ...models.t5_attention import ContinuousContextTransformer +from transformers.models.t5.modeling_t5 import ( + T5Attention, + T5Block, + T5Config, + T5LayerCrossAttention, + T5LayerFF, + T5LayerNorm, +) + +from ...configuration_utils import ConfigMixin, register_to_config +from ...modeling_utils import ModelMixin +from ...models.embeddings import get_timestep_embedding from 
...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler +class FiLMLayer(nn.Module): + def __init__(self, in_features, out_features): + super().__init__() + self.scale_bias = nn.Linear(in_features, out_features * 2) + + def forward(self, x, conditioning_emb): + scale_bias = self.scale_bias(conditioning_emb) + scale, bias = torch.chunk(scale_bias, 2, -1) + return x * (scale + 1.0) + bias + + +class T5LayerSelfAttentionCond(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer_norm = T5LayerNorm(config.d_model) + self.FiLMLayer = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + # pre_self_attention_layer_norm + normed_hidden_states = self.layer_norm(hidden_states) + + if conditioning_emb is not None: + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) + + # Self-attention block + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] + return outputs + + +class DecoderLayer(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.layer = nn.ModuleList() + + # cond self attention: layer 0 + self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) + + # cross attention: layer 1 + self.layer.append(T5LayerCrossAttention(config)) + + # pre_mlp_layer_norm: layer 2 + self.layer.append(T5LayerNorm(hidden_size=config.d_model)) + + # FiLM layer: 3 + self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) + + # MLP + dropout: last layer + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key_value is not None: + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + if encoder_hidden_states is not None: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply LayerNorm + hidden_states = self.layer[2](hidden_states) + + # FiLM + if conditioning_emb is not None: + hidden_states = self.layer[3](hidden_states, conditioning_emb) + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class TokenEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + max_length: int, + vocab_size: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + ): + super().__init__() + + self.token_embedder = nn.Embedding(vocab_size, d_model) + + 
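# The per-module T5Config objects below are built with feed_forward_proj="gated-gelu",
# which is why the checkpoint carries separate wi_0 / wi_1 kernels: one projection is
# passed through the activation and gates the other before the output projection.
# Minimal sketch of that feed-forward shape (not the transformers T5LayerFF code):
import torch
import torch.nn as nn

class GatedGeluFF(nn.Module):
    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.wi_0 = nn.Linear(d_model, d_ff, bias=False)   # activated branch
        self.wi_1 = nn.Linear(d_model, d_ff, bias=False)   # gating branch
        self.wo = nn.Linear(d_ff, d_model, bias=False)
        self.act = nn.GELU()

    def forward(self, x):
        return self.wo(self.act(self.wi_0(x)) * self.wi_1(x))

print(GatedGeluFF(d_model=8, d_ff=32)(torch.randn(2, 5, 8)).shape)  # torch.Size([2, 5, 8])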
self.position_encoding = nn.Embedding(max_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + vocab_size=vocab_size, + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + dropout_rate=dropout_rate, + feed_forward_proj=feed_forward_proj, + is_decoder=False, + is_encoder_decoder=False, + ) + + self.encoders = nn.ModuleList() + for lyr_num in range(num_layers): + lyr = T5Block(t5config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.dropout_post = nn.Dropout(p=dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class ContinuousEncoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + input_dims: int, + targets_context_length: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + ): + super().__init__() + + self.input_proj = nn.Linear(input_dims, d_model, bias=False) + + self.position_encoding = nn.Embedding(targets_context_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + feed_forward_proj=feed_forward_proj, + dropout_rate=dropout_rate, + is_decoder=False, + is_encoder_decoder=False, + ) + self.encoders = nn.ModuleList() + for lyr_num in range(num_layers): + lyr = T5Block(t5config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.dropout_post = nn.Dropout(p=dropout_rate) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + + seq_lens = encoder_inputs_mask.sum(-1) + input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + for lyr in self.encoders: + x = lyr(x, encoder_inputs_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask + + +class Decoder(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + input_dims: int, + targets_length: int, + max_decoder_noise_time: float, + d_model: int, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + dropout_rate: float, + feed_forward_proj: str, + ): + super().__init__() + + self.conditioning_emb = nn.Sequential( + nn.Linear(d_model, d_model * 4, bias=False), + nn.SiLU(), + nn.Linear(d_model * 4, d_model * 4, bias=False), + nn.SiLU(), + ) + + self.position_encoding = nn.Embedding(targets_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.continuous_inputs_projection = nn.Linear(input_dims, d_model) + + self.dropout = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + 
feed_forward_proj=feed_forward_proj, + dropout_rate=dropout_rate, + is_decoder=True, + is_encoder_decoder=False, + ) + self.decoders = nn.ModuleList() + for lyr_num in range(num_layers): + # FiLM conditional T5 decoder + lyr = DecoderLayer(t5config) + self.decoders.append(lyr) + + self.decoder_norm = T5LayerNorm(d_model) + + self.post_dropout = nn.Dropout(p=dropout_rate) + self.spec_out = nn.Linear(d_model, input_dims, bias=False) + + def encoder_decoder_mask(self, query_input, key_input, pairwise_fn=torch.mul): + mask = pairwise_fn(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) + return mask.unsqueeze(-3) + + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + batch, _, _ = decoder_input_tokens.shape + assert decoder_noise_time.shape == (batch,) + + # decoder_noise_time is in [0, 1), so rescale to expected timing range. + conditioning_emb = get_timestep_embedding( + decoder_noise_time * self.config.max_decoder_noise_time, + embedding_dim=self.config.d_model, + max_period=self.config.max_decoder_noise_time, + ) + + conditioning_emb = self.conditioning_emb(conditioning_emb) + + assert conditioning_emb.shape == (batch, self.config.d_model * 4) + + seq_length = decoder_input_tokens.shape[1] + + # If we want to use relative positions for audio context, we can just offset + # this sequence by the length of encodings_and_masks. + decoder_positions = torch.broadcast_to( + torch.arange(seq_length, device=decoder_input_tokens.device), + (batch, seq_length), + ) + + position_encodings = self.position_encoding(decoder_positions) + + # decoder: No padding present. + decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) + + # Translate encoding masks to encoder-decoder masks. + encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] + + inputs = self.continuous_inputs_projection(decoder_input_tokens) + + inputs += position_encodings + + y = self.dropout(inputs) + + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + for lyr in self.decoders: + y = lyr( + y, + conditioning_emb=conditioning_emb, + encoder_hidden_states=encoded, + encoder_attention_mask=encoder_decoder_mask, + )[0] + + y = self.decoder_norm(y) + y = self.post_dropout(y) + + spec_out = self.spec_out(y) + return spec_out + + +class ContinuousContextTransformer(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + input_dims: int, + max_length: int, + targets_context_length: int, + targets_length: int, + max_decoder_noise_time: float, + vocab_size: int, + d_model: int, + dropout_rate: float, + num_encoder_layers: int, + num_decoder_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str = "gated-gelu", + ): + super().__init__() + + self.token_encoder = TokenEncoder( + max_length=max_length, + vocab_size=vocab_size, + d_model=d_model, + dropout_rate=dropout_rate, + num_layers=num_encoder_layers, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + feed_forward_proj=feed_forward_proj, + ) + + self.continuous_encoder = ContinuousEncoder( + input_dims=input_dims, + targets_context_length=targets_context_length, + d_model=d_model, + dropout_rate=dropout_rate, + num_layers=num_encoder_layers, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + feed_forward_proj=feed_forward_proj, + ) + + self.decoder = Decoder( + 
input_dims=input_dims, + targets_length=targets_length, + max_decoder_noise_time=max_decoder_noise_time, + d_model=d_model, + num_layers=num_decoder_layers, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + dropout_rate=dropout_rate, + feed_forward_proj=feed_forward_proj, + ) + + def encode(self, input_tokens, continuous_inputs, continuous_mask): + tokens_mask = input_tokens > 0 + tokens_encoded, tokens_mask = self.token_encoder( + encoder_input_tokens=input_tokens, + encoder_inputs_mask=tokens_mask, + ) + + continuous_encoded, continuous_mask = self.continuous_encoder( + encoder_inputs=continuous_inputs, + encoder_inputs_mask=continuous_mask, + ) + + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] + + def decode(self, encodings_and_masks, input_tokens, noise_time): + logits = self.decoder( + encodings_and_masks=encodings_and_masks, + decoder_input_tokens=input_tokens, + decoder_noise_time=noise_time, + ) + return logits + + def forward( + self, + encoder_input_tokens, + encoder_continuous_inputs, + encoder_continuous_mask, + decoder_input_tokens, + decoder_noise_time, + ): + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens, + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + return self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=decoder_input_tokens, + noise_time=decoder_noise_time, + ) + + class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: super().__init__() diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py index e2a2046969fe..21b7c2a4d7b9 100644 --- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py +++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py @@ -34,21 +34,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) -class ContinuousContextTransformer(metaclass=DummyObject): - _backends = ["torch", "accelerate"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "accelerate"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - class Transformer2DModel(metaclass=DummyObject): _backends = ["torch", "accelerate"] From 0251747a3660c7c488bbffeea3bfaad1fb78ded4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 11:09:27 +0100 Subject: [PATCH 027/131] broadcast noise time to batch dim --- .../pipeline_spectrogram_diffusion.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 35ada88c9e9e..64f185af7640 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -503,10 +503,19 @@ def encode(self, input_tokens, continuous_inputs, continuous_mask): return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] def decode(self, encodings_and_masks, input_tokens, noise_time): + timesteps = noise_time + if not torch.is_tensor(timesteps): + timesteps = torch.tensor([timesteps], dtype=torch.long, 
device=input_tokens.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(input_tokens.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps * torch.ones(input_tokens.shape[0], dtype=timesteps.dtype, device=timesteps.device) + logits = self.decoder( encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, - decoder_noise_time=noise_time, + decoder_noise_time=timesteps, ) return logits From 8a54f88a8e1747a2c87b7694c858fcbc3f2527c7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 14:41:28 +0100 Subject: [PATCH 028/131] fix call --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 64f185af7640..9f8bb1124fd5 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -576,7 +576,7 @@ def __call__( encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) encodings_and_masks = self.cont_context_trans.encode( - encoder_input_tokens=encoder_input_tokens, + input_tokens=encoder_input_tokens, continuous_inputs=encoder_continuous_inputs, continuous_mask=encoder_continuous_mask, ) From b6373b896eb228c4b18c76d7ac2dd1d29add800a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 14:53:15 +0100 Subject: [PATCH 029/131] added scale_to_features --- .../pipeline_spectrogram_diffusion.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 9f8bb1124fd5..e61f093b811b 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -560,6 +560,15 @@ def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): # Scale to [min_out, max_out]. return zero_one * (max_out - min_out) + min_out + def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): + """Invert by linearly scaling network outputs to features range.""" + min_out, max_out = input_range + outputs = torch.clip(outputs, min_out, max_out) if clip else outputs + # Scale to [0, 1]. + zero_one = (outputs - min_out) / (max_out - min_out) + # Scale to [self.min_value, self.max_value]. 
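# scale_features and scale_to_features are inverse affine maps between the spectrogram
# feature range [min_value, max_value] and the network's working range (here [-1, 1]).
# Round-trip sketch with assumed bounds (the real self.min_value / self.max_value are
# defined elsewhere on the pipeline and are not shown in this file):
import torch

min_value, max_value = -100.0, 10.0                  # illustrative bounds only
features = torch.tensor([-100.0, -45.0, 10.0])

zero_one = (features - min_value) / (max_value - min_value)
scaled = zero_one * 2.0 - 1.0                        # [min_value, max_value] -> [-1, 1]
restored = (scaled + 1.0) / 2.0 * (max_value - min_value) + min_value

print(scaled)                                        # tensor([-1.,  0.,  1.])
print(torch.allclose(restored, features))            # True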
+ return zero_one * (self.max_value - self.min_value) + self.min_value + @torch.no_grad() def __call__( self, From 5fb437d91854cb771aab0263a5e2b2a5d2d04561 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 16:21:57 +0100 Subject: [PATCH 030/131] fix weights --- .../convert_music_spectrogram_to_diffusers.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 52d7785fff62..479986237ab0 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -2,11 +2,11 @@ import argparse import os +import numpy as np + import torch import torch.nn as nn -import jax -import tensorflow as tf from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import ContinuousContextTransformer from music_spectrogram_diffusion import inference @@ -33,12 +33,14 @@ def load_token_encoder(weights, model): ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + return model + def load_continuous_encoder(weights, model): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) @@ -60,12 +62,14 @@ def load_continuous_encoder(weights, model): ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + return model + def load_decoder(weights, model): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) @@ -112,18 +116,21 @@ def load_decoder(weights, model): ) lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + return model + def load_checkpoint(t5_checkpoint, model): - load_token_encoder(t5_checkpoint["token_encoder"], 
model.token_encoder) - load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) - load_decoder(t5_checkpoint["decoder"], model.decoder) + model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) + + model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) + model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) return model @@ -163,15 +170,10 @@ def main(args): model = load_checkpoint(t5_checkpoint["target"], model) pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) - import pdb - - pdb.set_trace() - pipe.save_pretrained(args.output_path) + pipe.save_pretrained("kashif") if __name__ == "__main__": - jax.config.update("jax_platform_name", "cpu") - parser = argparse.ArgumentParser() # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") From 5591f21ddc49e3b54a55819a5a1f3c2845c36bfa Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 10 Nov 2022 17:04:47 +0100 Subject: [PATCH 031/131] transpose laynorm weight --- .../convert_music_spectrogram_to_diffusers.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 479986237ab0..582665c1de54 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -29,15 +29,15 @@ def load_token_encoder(weights, model): lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) return model @@ -58,15 +58,15 @@ def load_continuous_encoder(weights, model): lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - 
lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) return model @@ -86,7 +86,7 @@ def load_decoder(weights, model): for lyr_num, lyr in enumerate(model.decoders): ly_weight = weights[f"layers_{lyr_num}"] lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"].T) ) lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( @@ -106,10 +106,10 @@ def load_decoder(weights, model): lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"].T) ) - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) lyr.layer[3].scale_bias.weight = nn.Parameter( torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) @@ -119,7 +119,7 @@ def load_decoder(weights, model): lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"].T)) model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) From 21b7ea226e6b5a6d8deea71926475e26f3e7cde5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 14 Nov 2022 16:27:59 +0100 Subject: [PATCH 032/131] scale is a vector --- .../convert_music_spectrogram_to_diffusers.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 582665c1de54..dfef4484c5b9 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -23,22 +23,23 @@ def load_token_encoder(weights, model): ) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] + lyr.layer[0].layer_norm.weight = nn.Parameter( + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) + ) + attention_weights = ly_weight["attention"] lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) - ) + + lyr.layer[1].layer_norm.weight = 
nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) - - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) return model @@ -58,15 +59,15 @@ def load_continuous_encoder(weights, model): lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"].T) + torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) ) lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"].T)) + model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) return model @@ -86,7 +87,7 @@ def load_decoder(weights, model): for lyr_num, lyr in enumerate(model.decoders): ly_weight = weights[f"layers_{lyr_num}"] lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"].T) + torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) ) lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( @@ -106,10 +107,10 @@ def load_decoder(weights, model): lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"].T) + torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) ) - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"].T)) + lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) lyr.layer[3].scale_bias.weight = nn.Parameter( torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) @@ -119,7 +120,7 @@ def load_decoder(weights, model): lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"].T)) + model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) model.spec_out.weight = 
nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) @@ -167,7 +168,7 @@ def main(args): feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - model = load_checkpoint(t5_checkpoint["target"], model) + model = load_checkpoint(t5_checkpoint["target"], model).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) pipe.save_pretrained("kashif") From 87ee8a34cf992c7a38a70512eecb02efadbad8d0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 10:41:09 +0100 Subject: [PATCH 033/131] scale the query outputs --- .../convert_music_spectrogram_to_diffusers.py | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index dfef4484c5b9..c9f056d7d17b 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -16,7 +16,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model): +def load_token_encoder(weights, model, depth_scaling): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -28,7 +28,9 @@ def load_token_encoder(weights, model): ) attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -43,7 +45,7 @@ def load_token_encoder(weights, model): return model -def load_continuous_encoder(weights, model): +def load_continuous_encoder(weights, model, depth_scaling): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) model.position_encoding.weight = nn.Parameter( @@ -54,7 +56,9 @@ def load_continuous_encoder(weights, model): ly_weight = weights[f"layers_{lyr_num}"] attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -72,7 +76,7 @@ def load_continuous_encoder(weights, model): return model -def load_decoder(weights, model): +def load_decoder(weights, model, depth_scaling): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) @@ -95,13 +99,17 @@ def load_decoder(weights, model): ) attention_weights = 
ly_weight["self_attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].EncDecAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( + torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) + ) lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -127,11 +135,13 @@ def load_decoder(weights, model): return model -def load_checkpoint(t5_checkpoint, model): - model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) +def load_checkpoint(t5_checkpoint, model, depth_scaling): + model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder, depth_scaling) - model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) - model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) + model.continuous_encoder = load_continuous_encoder( + t5_checkpoint["continuous_encoder"], model.continuous_encoder, depth_scaling + ) + model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder, depth_scaling) return model @@ -168,7 +178,9 @@ def main(args): feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - model = load_checkpoint(t5_checkpoint["target"], model).eval() + model = load_checkpoint( + t5_checkpoint["target"], model, depth_scaling=synth_model.model.module.config.head_dim**-0.5 + ).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) pipe.save_pretrained("kashif") From 6deafab654b5dc4691592155a3546b2b2dceb354 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 10:53:00 +0100 Subject: [PATCH 034/131] added comment --- scripts/convert_music_spectrogram_to_diffusers.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index c9f056d7d17b..4f4e2b802d0f 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -16,7 +16,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model, depth_scaling): +def load_token_encoder(weights, model, depth_scaling=1.0): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -45,7 +45,7 @@ def load_token_encoder(weights, model, depth_scaling): return model -def load_continuous_encoder(weights, 
model, depth_scaling): +def load_continuous_encoder(weights, model, depth_scaling=1.0): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) model.position_encoding.weight = nn.Parameter( @@ -76,7 +76,7 @@ def load_continuous_encoder(weights, model, depth_scaling): return model -def load_decoder(weights, model, depth_scaling): +def load_decoder(weights, model, depth_scaling=1.0): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) @@ -135,7 +135,7 @@ def load_decoder(weights, model, depth_scaling): return model -def load_checkpoint(t5_checkpoint, model, depth_scaling): +def load_checkpoint(t5_checkpoint, model, depth_scaling=1.0): model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder, depth_scaling) model.continuous_encoder = load_continuous_encoder( @@ -178,6 +178,10 @@ def main(args): feed_forward_proj="gated-gelu", max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) + + # NOTE: T5 does not explicitly rescale the attention logits by + # 1/sqrt(depth_kq)! This is folded into the initializers of the + # linear transformations, which is equivalent under Adafactor. model = load_checkpoint( t5_checkpoint["target"], model, depth_scaling=synth_model.model.module.config.head_dim**-0.5 ).eval() From 8830c2bb72397f4558f5d7124003c228358c29f6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 14:51:46 +0100 Subject: [PATCH 035/131] undo scaling --- scripts/convert_music_spectrogram_to_diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 4f4e2b802d0f..9b68da20664d 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -182,9 +182,7 @@ def main(args): # NOTE: T5 does not explicitly rescale the attention logits by # 1/sqrt(depth_kq)! This is folded into the initializers of the # linear transformations, which is equivalent under Adafactor. 
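# A minimal sketch of the equivalence the NOTE above describes (toy shapes, not part
# of the conversion script): multiplying the query kernel by head_dim**-0.5 at load
# time yields the same attention logits as applying the 1/sqrt(d_kv) scale inside the
# attention op, so the scale should live in exactly one of the two places.
import torch

d_model, d_kv = 8, 4
x = torch.randn(2, 3, d_model)   # (batch, seq, d_model)
wq = torch.randn(d_model, d_kv)  # toy query/key kernels
wk = torch.randn(d_model, d_kv)

logits_scaled_in_attention = (x @ wq) @ (x @ wk).transpose(-1, -2) * d_kv**-0.5
logits_scaled_in_weights = (x @ (wq * d_kv**-0.5)) @ (x @ wk).transpose(-1, -2)
assert torch.allclose(logits_scaled_in_attention, logits_scaled_in_weights, atol=1e-5)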
- model = load_checkpoint( - t5_checkpoint["target"], model, depth_scaling=synth_model.model.module.config.head_dim**-0.5 - ).eval() + model = load_checkpoint(t5_checkpoint["target"], model).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) pipe.save_pretrained("kashif") From 3b9e822eda59ba29dec97c31aaf6790bd471bc42 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 15:11:59 +0100 Subject: [PATCH 036/131] undo depth_scaling --- .../convert_music_spectrogram_to_diffusers.py | 46 +++++++------------ 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 9b68da20664d..d3fad9380b4f 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -16,7 +16,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model, depth_scaling=1.0): +def load_token_encoder(weights, model): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -28,9 +28,7 @@ def load_token_encoder(weights, model, depth_scaling=1.0): ) attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -45,7 +43,7 @@ def load_token_encoder(weights, model, depth_scaling=1.0): return model -def load_continuous_encoder(weights, model, depth_scaling=1.0): +def load_continuous_encoder(weights, model): model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) model.position_encoding.weight = nn.Parameter( @@ -56,9 +54,7 @@ def load_continuous_encoder(weights, model, depth_scaling=1.0): ly_weight = weights[f"layers_{lyr_num}"] attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -76,7 +72,7 @@ def load_continuous_encoder(weights, model, depth_scaling=1.0): return model -def load_decoder(weights, model, depth_scaling=1.0): +def load_decoder(weights, model): model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) @@ -99,17 +95,13 @@ def load_decoder(weights, model, depth_scaling=1.0): ) attention_weights = ly_weight["self_attention"] - 
lyr.layer[0].SelfAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].EncDecAttention.q.weight = nn.Parameter( - torch.FloatTensor(attention_weights["query"]["kernel"].T * depth_scaling) - ) + lyr.layer[1].EncDecAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) @@ -135,13 +127,11 @@ def load_decoder(weights, model, depth_scaling=1.0): return model -def load_checkpoint(t5_checkpoint, model, depth_scaling=1.0): - model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder, depth_scaling) +def load_checkpoint(t5_checkpoint, model): + model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) - model.continuous_encoder = load_continuous_encoder( - t5_checkpoint["continuous_encoder"], model.continuous_encoder, depth_scaling - ) - model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder, depth_scaling) + model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) + model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) return model @@ -179,22 +169,20 @@ def main(args): max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, ) - # NOTE: T5 does not explicitly rescale the attention logits by - # 1/sqrt(depth_kq)! This is folded into the initializers of the - # linear transformations, which is equivalent under Adafactor. model = load_checkpoint(t5_checkpoint["target"], model).eval() pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) - pipe.save_pretrained("kashif") + if args.save: + pipe.save_pretrained(args.output_path) if __name__ == "__main__": parser = argparse.ArgumentParser() - # parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") - # parser.add_argument( - # "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." - # ) + parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") + parser.add_argument( + "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." 
+ ) parser.add_argument( "--checkpoint_path", default=f"{MODEL}/checkpoint_500000", From 9328701ccb7797b1d88307d0779e2e0d07ce1908 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 17 Nov 2022 21:00:10 +0100 Subject: [PATCH 037/131] inital get_extended_attention_mask --- .../pipeline_spectrogram_diffusion.py | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index e61f093b811b..bd2db8e9bce7 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -12,6 +12,8 @@ T5LayerFF, T5LayerNorm, ) +from transformers.modeling_utils import ModuleUtilsMixin + from ...configuration_utils import ConfigMixin, register_to_config from ...modeling_utils import ModelMixin @@ -71,10 +73,11 @@ def forward( return outputs -class DecoderLayer(nn.Module): +class DecoderLayer(nn.Module, ModuleUtilsMixin): def __init__(self, config, has_relative_attention_bias=False): super().__init__() self.layer = nn.ModuleList() + self.config = config # cond self attention: layer 0 self.layer.append(T5LayerSelfAttentionCond(config, has_relative_attention_bias=has_relative_attention_bias)) @@ -122,10 +125,16 @@ def forward( else: self_attn_past_key_value, cross_attn_past_key_value = None, None + input_shape = (hidden_states.shape[0], hidden_states.shape[1]) + if attention_mask is None: + attention_mask = torch.ones(input_shape[0], input_shape[1], device=hidden_states.device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + self_attention_outputs = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, - attention_mask=attention_mask, + attention_mask=extended_attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=self_attn_past_key_value, @@ -148,10 +157,13 @@ def forward( else: query_length = None + input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) + extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, + attention_mask=extended_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, @@ -198,7 +210,7 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class TokenEncoder(ModelMixin, ConfigMixin): +class TokenEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -211,6 +223,7 @@ def __init__( d_kv: int, d_ff: int, feed_forward_proj: str, + is_decoder: bool = False, ): super().__init__() @@ -229,7 +242,7 @@ def __init__( d_ff=d_ff, dropout_rate=dropout_rate, feed_forward_proj=feed_forward_proj, - is_decoder=False, + is_decoder=is_decoder, is_encoder_decoder=False, ) @@ -250,14 +263,18 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): x = self.dropout_pre(x) + # inverted the attention mask + input_shape = encoder_input_tokens.size() + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) + for lyr 
in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] + x = lyr(x, extended_attention_mask)[0] x = self.layer_norm(x) return self.dropout_post(x), encoder_inputs_mask -class ContinuousEncoder(ModelMixin, ConfigMixin): +class ContinuousEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -270,6 +287,7 @@ def __init__( d_kv: int, d_ff: int, feed_forward_proj: str, + is_decoder: bool = False, ): super().__init__() @@ -287,7 +305,7 @@ def __init__( d_ff=d_ff, feed_forward_proj=feed_forward_proj, dropout_rate=dropout_rate, - is_decoder=False, + is_decoder=is_decoder, is_encoder_decoder=False, ) self.encoders = nn.ModuleList() @@ -311,8 +329,12 @@ def forward(self, encoder_inputs, encoder_inputs_mask): x = self.dropout_pre(x) + # inverted the attention mask + input_shape = encoder_inputs.size() + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) + for lyr in self.encoders: - x = lyr(x, encoder_inputs_mask)[0] + x = lyr(x, extended_attention_mask)[0] x = self.layer_norm(x) return self.dropout_post(x), encoder_inputs_mask From f86a785f4aee6b3d19ef92152977bb007a6ec9cf Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 20 Nov 2022 21:49:35 +0100 Subject: [PATCH 038/131] attention_mask is none in self-attention --- .../pipeline_spectrogram_diffusion.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index bd2db8e9bce7..9d4f540152b8 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -125,16 +125,10 @@ def forward( else: self_attn_past_key_value, cross_attn_past_key_value = None, None - input_shape = (hidden_states.shape[0], hidden_states.shape[1]) - if attention_mask is None: - attention_mask = torch.ones(input_shape[0], input_shape[1], device=hidden_states.device) - - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - self_attention_outputs = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, - attention_mask=extended_attention_mask, + attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=self_attn_past_key_value, @@ -157,13 +151,13 @@ def forward( else: query_length = None - input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) - extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) - - cross_attention_outputs = self.layer[1]( + input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) + extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + + cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=extended_attention_mask, + attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, @@ -434,9 +428,24 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) y = self.dropout(inputs) + import pdb + + pdb.set_trace() + # cross attend style: concat encodings encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) encoder_decoder_mask = 
torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + + # import pdb + + # pdb.set_trace() + + # # inverted the attention mask + # input_shape = encoded.size() + # extended_attention_mask = self.get_extended_attention_mask(encoder_decoder_mask, input_shape) + + encoder_decoder_mask = torch.where(encoder_decoder_mask > 0, 0, -1e10) + for lyr in self.decoders: y = lyr( y, From 9905492af78ae1cb4ec74aa2c4a6c3421f0cb7c6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 20 Nov 2022 21:54:02 +0100 Subject: [PATCH 039/131] cleanup --- .../pipeline_spectrogram_diffusion.py | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 9d4f540152b8..876b35aaa4b4 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -151,13 +151,13 @@ def forward( else: query_length = None - input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) - extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) - - cross_attention_outputs = self.layer[1]( + input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) + extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + + cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, + attention_mask=extended_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, @@ -421,31 +421,14 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) # Translate encoding masks to encoder-decoder masks. 
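# A rough sketch of what the mask translation below amounts to (toy shapes; it assumes
# encoder_decoder_mask() takes the outer product of the decoder and encoder padding
# masks, which is not shown in this hunk): kept positions end up with an additive bias
# of 0 and padded encoder positions with -1e10, so softmax drives their weight to ~0.
import torch

decoder_mask = torch.ones(1, 4)                 # decoder side: no padding
encoder_mask = torch.tensor([[1.0, 1.0, 0.0]])  # last encoder position is padding
cross_mask = decoder_mask.unsqueeze(-1) * encoder_mask.unsqueeze(-2)  # (1, 4, 3)
additive_bias = torch.where(cross_mask > 0, 0.0, -1e10)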
encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - inputs = self.continuous_inputs_projection(decoder_input_tokens) - inputs += position_encodings - y = self.dropout(inputs) - import pdb - - pdb.set_trace() - # cross attend style: concat encodings encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) - # import pdb - - # pdb.set_trace() - - # # inverted the attention mask - # input_shape = encoded.size() - # extended_attention_mask = self.get_extended_attention_mask(encoder_decoder_mask, input_shape) - - encoder_decoder_mask = torch.where(encoder_decoder_mask > 0, 0, -1e10) - for lyr in self.decoders: y = lyr( y, From f439e5b2662b780796557685fa7b909168a3167d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 20 Nov 2022 22:23:27 +0100 Subject: [PATCH 040/131] manually invert attention --- .../spectrogram_diffusion/pipeline_spectrogram_diffusion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 876b35aaa4b4..78140ddcb022 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -151,13 +151,12 @@ def forward( else: query_length = None - input_shape = (encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]) - extended_attention_mask = self.get_extended_attention_mask(encoder_attention_mask, input_shape) + encoder_extended_attention_mask = torch.where(encoder_attention_mask > 0, 0, -1e10) cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=extended_attention_mask, + attention_mask=encoder_extended_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, From dd5dc10e2f89fa177de1844ae05e4f85d595556f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 21 Nov 2022 12:17:32 +0100 Subject: [PATCH 041/131] nn.linear need bias=False --- .../convert_music_spectrogram_to_diffusers.py | 2 +- .../pipeline_spectrogram_diffusion.py | 22 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index d3fad9380b4f..005ae0ebeb9d 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -179,7 +179,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the converted model.") + parser.add_argument("--output_path", default=None, type=str, required=True, help="Path to the converted model.") parser.add_argument( "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not." 
) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 78140ddcb022..3e1254b80b3f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -25,7 +25,7 @@ class FiLMLayer(nn.Module): def __init__(self, in_features, out_features): super().__init__() - self.scale_bias = nn.Linear(in_features, out_features * 2) + self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False) def forward(self, x, conditioning_emb): scale_bias = self.scale_bias(conditioning_emb) @@ -360,7 +360,7 @@ def __init__( self.position_encoding = nn.Embedding(targets_length, d_model) self.position_encoding.weight.requires_grad = False - self.continuous_inputs_projection = nn.Linear(input_dims, d_model) + self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias=False) self.dropout = nn.Dropout(p=dropout_rate) @@ -393,16 +393,19 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) batch, _, _ = decoder_input_tokens.shape assert decoder_noise_time.shape == (batch,) + # TODO remove: + # decoder_input_tokens = torch.ones_like(decoder_input_tokens) + # decoder_noise_time is in [0, 1), so rescale to expected timing range. - conditioning_emb = get_timestep_embedding( + time_steps = get_timestep_embedding( decoder_noise_time * self.config.max_decoder_noise_time, embedding_dim=self.config.d_model, max_period=self.config.max_decoder_noise_time, ) - conditioning_emb = self.conditioning_emb(conditioning_emb) + conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1) - assert conditioning_emb.shape == (batch, self.config.d_model * 4) + assert conditioning_emb.shape == (batch, 1, self.config.d_model * 4) seq_length = decoder_input_tokens.shape[1] @@ -415,14 +418,15 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) position_encodings = self.position_encoding(decoder_positions) + inputs = self.continuous_inputs_projection(decoder_input_tokens) + inputs += position_encodings + y = self.dropout(inputs) + # decoder: No padding present. decoder_mask = torch.ones(decoder_input_tokens.shape[:2], device=decoder_input_tokens.device) # Translate encoding masks to encoder-decoder masks. encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - inputs = self.continuous_inputs_projection(decoder_input_tokens) - inputs += position_encodings - y = self.dropout(inputs) # cross attend style: concat encodings encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) @@ -614,7 +618,7 @@ def __call__( output = self.cont_context_trans.decode( encodings_and_masks=encodings_and_masks, input_tokens=x, - noise_time=t, + noise_time=t / num_inference_steps, # rescale to [0, 1) ) # 2. 
compute previous output: x_t -> x_t-1 From d987df010e85a5bcce553f1be6fc4ba5278c57f4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 23 Nov 2022 19:51:02 +0100 Subject: [PATCH 042/131] added T5LayerFFCond --- .../convert_music_spectrogram_to_diffusers.py | 13 ++--- .../pipeline_spectrogram_diffusion.py | 49 +++++++++++-------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 005ae0ebeb9d..4fa9f151db5d 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -105,20 +105,17 @@ def load_decoder(weights, model): lyr.layer[1].EncDecAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) lyr.layer[1].EncDecAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) lyr.layer[1].EncDecAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter( torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) ) - lyr.layer[2].weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - - lyr.layer[3].scale_bias.weight = nn.Parameter( + lyr.layer[2].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[2].film.scale_bias.weight = nn.Parameter( torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) ) - - lyr.layer[4].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[4].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[4].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 3e1254b80b3f..7e0399e2471b 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -9,8 +9,9 @@ T5Block, T5Config, T5LayerCrossAttention, - T5LayerFF, T5LayerNorm, + T5DenseGatedActDense, + T5DenseActDense, ) from transformers.modeling_utils import ModuleUtilsMixin @@ -33,6 +34,28 @@ def forward(self, x, conditioning_emb): return x * (scale + 1.0) + bias +class T5LayerFFCond(nn.Module): + def __init__(self, config: T5Config): + super().__init__() + if config.is_gated_act: + self.DenseReluDense = T5DenseGatedActDense(config) + else: + self.DenseReluDense = T5DenseActDense(config) + + self.film = FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states, conditioning_emb=None): + forwarded_states = self.layer_norm(hidden_states) + if 
conditioning_emb is not None: + forwarded_states = self.film(forwarded_states, conditioning_emb) + + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + class T5LayerSelfAttentionCond(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super().__init__() @@ -85,14 +108,8 @@ def __init__(self, config, has_relative_attention_bias=False): # cross attention: layer 1 self.layer.append(T5LayerCrossAttention(config)) - # pre_mlp_layer_norm: layer 2 - self.layer.append(T5LayerNorm(hidden_size=config.d_model)) - - # FiLM layer: 3 - self.layer.append(FiLMLayer(in_features=config.d_model * 4, out_features=config.d_model)) - - # MLP + dropout: last layer - self.layer.append(T5LayerFF(config)) + # Film Cond MLP + dropout: last layer + self.layer.append(T5LayerFFCond(config)) def forward( self, @@ -178,15 +195,8 @@ def forward( # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] - # Apply LayerNorm - hidden_states = self.layer[2](hidden_states) - - # FiLM - if conditioning_emb is not None: - hidden_states = self.layer[3](hidden_states, conditioning_emb) - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) + # Apply Film Conditional Feed Forward layer + hidden_states = self.layer[-1](hidden_states, conditioning_emb) # clamp inf values to enable fp16 training if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any(): @@ -393,9 +403,6 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) batch, _, _ = decoder_input_tokens.shape assert decoder_noise_time.shape == (batch,) - # TODO remove: - # decoder_input_tokens = torch.ones_like(decoder_input_tokens) - # decoder_noise_time is in [0, 1), so rescale to expected timing range. time_steps = get_timestep_embedding( decoder_noise_time * self.config.max_decoder_noise_time, From 428fae945ce82371dd4f7495c415e57e1fa0bc46 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 13:59:40 +0100 Subject: [PATCH 043/131] remove to fix conflict --- src/diffusers/pipeline_utils.py | 14 -------------- .../utils/dummy_torch_and_accelerate_objects.py | 15 --------------- 2 files changed, 29 deletions(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index ca08c1b36ce3..36c2d5b888ef 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -109,20 +109,6 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray -@dataclass -class MelPipelineOutput(BaseOutput): - """ - Output class for Mel pipelines. - - Args: - mels (`np.ndarray`) - List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel - samples of the diffusion pipeline. - """ - - mels: np.ndarray - - class DiffusionPipeline(ConfigMixin): r""" Base class for all models. 
diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py index 21b7c2a4d7b9..335e3ca24d2a 100644 --- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py +++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py @@ -272,21 +272,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "accelerate"]) -class SpectrogramDiffusionPipeline(metaclass=DummyObject): - _backends = ["torch", "accelerate"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "accelerate"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "accelerate"]) - - class DDIMScheduler(metaclass=DummyObject): _backends = ["torch", "accelerate"] From 670331eea986ffd3dc4d848c429c8105b0478a86 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 14:02:38 +0100 Subject: [PATCH 044/131] make style and dummy --- scripts/convert_music_spectrogram_to_diffusers.py | 1 - src/diffusers/pipeline_utils.py | 14 ++++++++++++++ .../pipeline_spectrogram_diffusion.py | 7 +++---- src/diffusers/utils/dummy_pt_objects.py | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 4fa9f151db5d..718006229880 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -3,7 +3,6 @@ import os import numpy as np - import torch import torch.nn as nn diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 01bcc6a33803..57627c80df41 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -118,6 +118,20 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray +@dataclass +class MelPipelineOutput(BaseOutput): + """ + Output class for Mel pipelines. + + Args: + mels (`np.ndarray`) + List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel + samples of the diffusion pipeline. 
+ """ + + mels: np.ndarray + + def is_safetensors_compatible(info) -> bool: filenames = set(sibling.rfilename for sibling in info.siblings) pt_filenames = set(filename for filename in filenames if filename.endswith(".bin")) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 7e0399e2471b..c13c7d12e342 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -4,17 +4,16 @@ import torch import torch.nn as nn +from transformers.modeling_utils import ModuleUtilsMixin from transformers.models.t5.modeling_t5 import ( T5Attention, T5Block, T5Config, + T5DenseActDense, + T5DenseGatedActDense, T5LayerCrossAttention, T5LayerNorm, - T5DenseGatedActDense, - T5DenseActDense, ) -from transformers.modeling_utils import ModuleUtilsMixin - from ...configuration_utils import ConfigMixin, register_to_config from ...modeling_utils import ModelMixin diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 9846927cb1ce..5d25154c651b 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -287,6 +287,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class SpectrogramDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class DDIMScheduler(metaclass=DummyObject): _backends = ["torch"] From f98beebfa9faa7a5a7555ad09d2be30e6257a4b8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 14:08:06 +0100 Subject: [PATCH 045/131] remove unsed variables --- scripts/convert_music_spectrogram_to_diffusers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 718006229880..1335255a882c 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -2,7 +2,6 @@ import argparse import os -import numpy as np import torch import torch.nn as nn @@ -145,7 +144,6 @@ def main(args): gin_config = inference.parse_training_gin_file(gin_file, gin_overrides) synth_model = inference.InferenceModel(args.checkpoint_path, gin_config) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") model = ContinuousContextTransformer( From 37735c0bd30b5a614b3c6262c00c85e22ec5cc6a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 29 Nov 2022 15:11:10 +0100 Subject: [PATCH 046/131] remove predict_epsilon --- scripts/convert_music_spectrogram_to_diffusers.py | 1 - .../pipeline_spectrogram_diffusion.py | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 1335255a882c..76c268e721fc 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -124,7 +124,6 @@ def load_decoder(weights, model): def load_checkpoint(t5_checkpoint, 
model): model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) - model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) return model diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index c13c7d12e342..1cc0220980fa 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -253,7 +253,7 @@ def __init__( lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.layer_norm = T5LayerNorm(d_model) self.dropout_post = nn.Dropout(p=dropout_rate) def forward(self, encoder_input_tokens, encoder_inputs_mask): @@ -315,7 +315,7 @@ def __init__( lyr = T5Block(t5config) self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(hidden_size=d_model) + self.layer_norm = T5LayerNorm(d_model) self.dropout_post = nn.Dropout(p=dropout_rate) def forward(self, encoder_inputs, encoder_inputs_mask): @@ -601,8 +601,6 @@ def __call__( generator: Optional[torch.Generator] = None, num_inference_steps: int = 1000, return_dict: bool = True, - predict_epsilon: bool = True, - **kwargs, ): target_shape = encoder_continuous_inputs.shape encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) @@ -628,7 +626,7 @@ def __call__( ) # 2. compute previous output: x_t -> x_t-1 - x = self.scheduler.step(output, t, x, generator=generator, predict_epsilon=predict_epsilon).prev_sample + x = self.scheduler.step(output, t, x, generator=generator).prev_sample mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) mel = mel.cpu().numpy() From f9217a7d3ea71c51443efe06c894a722c09762af Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 4 Nov 2022 14:58:52 +0100 Subject: [PATCH 047/131] Move accelerate to a soft-dependency (#1134) * finish * finish * Update src/diffusers/modeling_utils.py * Update src/diffusers/pipeline_utils.py Co-authored-by: Anton Lozhkov * more fixes * fix Co-authored-by: Anton Lozhkov --- .../convert_music_spectrogram_to_diffusers.py | 51 ++++-- .../spectrogram_diffusion/__init__.py | 7 +- .../pipeline_spectrogram_diffusion.py | 151 +++++------------- 3 files changed, 78 insertions(+), 131 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 76c268e721fc..2d011662dc74 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -6,7 +6,7 @@ import torch.nn as nn from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline -from diffusers.pipelines.spectrogram_diffusion import ContinuousContextTransformer +from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from music_spectrogram_diffusion import inference from t5x import checkpoints @@ -14,7 +14,7 @@ MODEL = "base_with_context" -def load_token_encoder(weights, model): +def load_notes_encoder(weights, model): model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) model.position_encoding.weight = nn.Parameter( torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False @@ -122,13 
+122,6 @@ def load_decoder(weights, model): return model -def load_checkpoint(t5_checkpoint, model): - model.token_encoder = load_token_encoder(t5_checkpoint["token_encoder"], model.token_encoder) - model.continuous_encoder = load_continuous_encoder(t5_checkpoint["continuous_encoder"], model.continuous_encoder) - model.decoder = load_decoder(t5_checkpoint["decoder"], model.decoder) - return model - - def main(args): t5_checkpoint = checkpoints.load_t5x_checkpoint(args.checkpoint_path) @@ -145,26 +138,50 @@ def main(args): scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2", variance_type="fixed_large") - model = ContinuousContextTransformer( - vocab_size=synth_model.model.module.config.vocab_size, + notes_encoder = SpectrogramNotesEncoder( max_length=synth_model.sequence_length["inputs"], + vocab_size=synth_model.model.module.config.vocab_size, + d_model=synth_model.model.module.config.emb_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + num_layers=synth_model.model.module.config.num_encoder_layers, + num_heads=synth_model.model.module.config.num_heads, + d_kv=synth_model.model.module.config.head_dim, + d_ff=synth_model.model.module.config.mlp_dim, + feed_forward_proj="gated-gelu", + ) + + continuous_encoder = SpectrogramContEncoder( input_dims=synth_model.audio_codec.n_dims, targets_context_length=synth_model.sequence_length["targets_context"], - targets_length=synth_model.sequence_length["targets"], d_model=synth_model.model.module.config.emb_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + num_layers=synth_model.model.module.config.num_encoder_layers, num_heads=synth_model.model.module.config.num_heads, - num_encoder_layers=synth_model.model.module.config.num_encoder_layers, - num_decoder_layers=synth_model.model.module.config.num_decoder_layers, d_kv=synth_model.model.module.config.head_dim, d_ff=synth_model.model.module.config.mlp_dim, - dropout_rate=synth_model.model.module.config.dropout_rate, feed_forward_proj="gated-gelu", + ) + + decoder = T5FilmDecoder( + input_dims=synth_model.audio_codec.n_dims, + targets_length=synth_model.sequence_length["targets_context"], max_decoder_noise_time=synth_model.model.module.config.max_decoder_noise_time, + d_model=synth_model.model.module.config.emb_dim, + num_layers=synth_model.model.module.config.num_decoder_layers, + num_heads=synth_model.model.module.config.num_heads, + d_kv=synth_model.model.module.config.head_dim, + d_ff=synth_model.model.module.config.mlp_dim, + dropout_rate=synth_model.model.module.config.dropout_rate, + feed_forward_proj="gated-gelu", ) - model = load_checkpoint(t5_checkpoint["target"], model).eval() + notes_encoder = load_notes_encoder(t5_checkpoint["target"]["token_encoder"], notes_encoder) + continuous_encoder = load_continuous_encoder(t5_checkpoint["target"]["continuous_encoder"], continuous_encoder) + decoder = load_decoder(t5_checkpoint["target"]["decoder"], decoder) - pipe = SpectrogramDiffusionPipeline(model, scheduler=scheduler) + pipe = SpectrogramDiffusionPipeline( + notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler + ) if args.save: pipe.save_pretrained(args.output_path) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 850f9f7fba6d..625185f58935 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,2 +1,7 @@ # flake8: noqa -from 
.pipeline_spectrogram_diffusion import ContinuousContextTransformer, SpectrogramDiffusionPipeline +from .pipeline_spectrogram_diffusion import ( + SpectrogramNotesEncoder, + SpectrogramContEncoder, + T5FilmDecoder, + SpectrogramDiffusionPipeline, +) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 1cc0220980fa..fe6b87f6e80a 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -212,7 +212,7 @@ def forward( return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) -class TokenEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): +class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -276,7 +276,7 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class ContinuousEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): +class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( self, @@ -342,7 +342,7 @@ def forward(self, encoder_inputs, encoder_inputs_mask): return self.dropout_post(x), encoder_inputs_mask -class Decoder(ModelMixin, ConfigMixin): +class T5FilmDecoder(ModelMixin, ConfigMixin): @register_to_config def __init__( self, @@ -453,74 +453,51 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time) return spec_out -class ContinuousContextTransformer(ModelMixin, ConfigMixin): - @register_to_config +class SpectrogramDiffusionPipeline(DiffusionPipeline): def __init__( self, - input_dims: int, - max_length: int, - targets_context_length: int, - targets_length: int, - max_decoder_noise_time: float, - vocab_size: int, - d_model: int, - dropout_rate: float, - num_encoder_layers: int, - num_decoder_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str = "gated-gelu", - ): + notes_encoder: SpectrogramNotesEncoder, + continuous_encoder: SpectrogramContEncoder, + decoder: T5FilmDecoder, + scheduler: DDPMScheduler, + ) -> None: super().__init__() - self.token_encoder = TokenEncoder( - max_length=max_length, - vocab_size=vocab_size, - d_model=d_model, - dropout_rate=dropout_rate, - num_layers=num_encoder_layers, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - feed_forward_proj=feed_forward_proj, - ) + # From MELGAN + self.min_value = math.log(1e-5) # Matches MelGAN training. 
+ self.max_value = 4.0 # Largest value for most examples - self.continuous_encoder = ContinuousEncoder( - input_dims=input_dims, - targets_context_length=targets_context_length, - d_model=d_model, - dropout_rate=dropout_rate, - num_layers=num_encoder_layers, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - feed_forward_proj=feed_forward_proj, + self.register_modules( + notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler ) - self.decoder = Decoder( - input_dims=input_dims, - targets_length=targets_length, - max_decoder_noise_time=max_decoder_noise_time, - d_model=d_model, - num_layers=num_decoder_layers, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - dropout_rate=dropout_rate, - feed_forward_proj=feed_forward_proj, - ) + def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): + """Linearly scale features to network outputs range.""" + min_out, max_out = output_range + if clip: + features = torch.clip(features, self.min_value, self.max_value) + # Scale to [0, 1]. + zero_one = (features - self.min_value) / (self.max_value - self.min_value) + # Scale to [min_out, max_out]. + return zero_one * (max_out - min_out) + min_out + + def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): + """Invert by linearly scaling network outputs to features range.""" + min_out, max_out = input_range + outputs = torch.clip(outputs, min_out, max_out) if clip else outputs + # Scale to [0, 1]. + zero_one = (outputs - min_out) / (max_out - min_out) + # Scale to [self.min_value, self.max_value]. + return zero_one * (self.max_value - self.min_value) + self.min_value def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 - tokens_encoded, tokens_mask = self.token_encoder( - encoder_input_tokens=input_tokens, - encoder_inputs_mask=tokens_mask, + tokens_encoded, tokens_mask = self.notes_encoder( + encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask ) continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs, - encoder_inputs_mask=continuous_mask, + encoder_inputs=continuous_inputs, encoder_inputs_mask=continuous_mask ) return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] @@ -536,62 +513,10 @@ def decode(self, encodings_and_masks, input_tokens, noise_time): timesteps = timesteps * torch.ones(input_tokens.shape[0], dtype=timesteps.dtype, device=timesteps.device) logits = self.decoder( - encodings_and_masks=encodings_and_masks, - decoder_input_tokens=input_tokens, - decoder_noise_time=timesteps, + encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, decoder_noise_time=timesteps ) return logits - def forward( - self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, - decoder_input_tokens, - decoder_noise_time, - ): - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, - ) - - return self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=decoder_input_tokens, - noise_time=decoder_noise_time, - ) - - -class SpectrogramDiffusionPipeline(DiffusionPipeline): - def __init__(self, cont_context_trans: ContinuousContextTransformer, scheduler: DDPMScheduler) -> None: - super().__init__() - - # From MELGAN - self.min_value = math.log(1e-5) # Matches MelGAN training. 
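# A standalone sketch of the linear feature scaling used above (the bounds are the
# MelGAN log-magnitude values quoted in this __init__); it is illustrative only and
# simply demonstrates that scale_features and scale_to_features invert each other
# for in-range inputs.
import math
import torch

min_value, max_value = math.log(1e-5), 4.0

def scale_features(features, output_range=(-1.0, 1.0), clip=False):
    min_out, max_out = output_range
    if clip:
        features = torch.clip(features, min_value, max_value)
    zero_one = (features - min_value) / (max_value - min_value)
    return zero_one * (max_out - min_out) + min_out

def scale_to_features(outputs, input_range=(-1.0, 1.0)):
    min_out, max_out = input_range
    zero_one = (outputs - min_out) / (max_out - min_out)
    return zero_one * (max_value - min_value) + min_value

mel = torch.rand(1, 256, 128) * (max_value - min_value) + min_value
assert torch.allclose(scale_to_features(scale_features(mel)), mel, atol=1e-4)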
- self.max_value = 4.0 # Largest value for most examples - - self.register_modules(cont_context_trans=cont_context_trans, scheduler=scheduler) - - def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): - """Linearly scale features to network outputs range.""" - min_out, max_out = output_range - if clip: - features = torch.clip(features, self.min_value, self.max_value) - # Scale to [0, 1]. - zero_one = (features - self.min_value) / (self.max_value - self.min_value) - # Scale to [min_out, max_out]. - return zero_one * (max_out - min_out) + min_out - - def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): - """Invert by linearly scaling network outputs to features range.""" - min_out, max_out = input_range - outputs = torch.clip(outputs, min_out, max_out) if clip else outputs - # Scale to [0, 1]. - zero_one = (outputs - min_out) / (max_out - min_out) - # Scale to [self.min_value, self.max_value]. - return zero_one * (self.max_value - self.min_value) + self.min_value - @torch.no_grad() def __call__( self, @@ -605,7 +530,7 @@ def __call__( target_shape = encoder_continuous_inputs.shape encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) - encodings_and_masks = self.cont_context_trans.encode( + encodings_and_masks = self.encode( input_tokens=encoder_input_tokens, continuous_inputs=encoder_continuous_inputs, continuous_mask=encoder_continuous_mask, @@ -619,7 +544,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) for t in self.progress_bar(self.scheduler.timesteps): - output = self.cont_context_trans.decode( + output = self.decode( encodings_and_masks=encodings_and_masks, input_tokens=x, noise_time=t / num_inference_steps, # rescale to [0, 1) From ff51d45f5bcb87756b98b0f5b98953b0f26edc83 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 1 Dec 2022 19:21:53 +0100 Subject: [PATCH 048/131] fix order --- src/diffusers/pipelines/spectrogram_diffusion/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 625185f58935..df245e763cce 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,7 +1,7 @@ # flake8: noqa from .pipeline_spectrogram_diffusion import ( - SpectrogramNotesEncoder, SpectrogramContEncoder, - T5FilmDecoder, SpectrogramDiffusionPipeline, + SpectrogramNotesEncoder, + T5FilmDecoder, ) From 4a215ddb8ba392940be87bd771f17f4105833023 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Dec 2022 11:37:21 +0100 Subject: [PATCH 049/131] added initial midi to note token data pipeline --- .../pipelines/spectrogram_diffusion/data.py | 439 ++++++++++++++++++ 1 file changed, 439 insertions(+) create mode 100644 src/diffusers/pipelines/spectrogram_diffusion/data.py diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py new file mode 100644 index 000000000000..5a74a20e3c46 --- /dev/null +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -0,0 +1,439 @@ +from typing import Sequence, Tuple, Optional, MutableMapping, List, Callable, Mapping, Any +import dataclasses +import math +from immutabledict import immutabledict + +import numpy as np +import note_seq +import torch +import torch.nn.functional as F + + +SAMPLE_RATE = 16000 +HOP_SIZE = 320 +FRAME_RATE = int(SAMPLE_RATE 
// HOP_SIZE) + +DEFAULT_STEPS_PER_SECOND = 100 +DEFAULT_MAX_SHIFT_SECONDS = 10 +DEFAULT_NUM_VELOCITY_BINS = 1 + +SLAKH_CLASS_PROGRAMS = immutabledict( + { + "Acoustic Piano": 0, + "Electric Piano": 4, + "Chromatic Percussion": 8, + "Organ": 16, + "Acoustic Guitar": 24, + "Clean Electric Guitar": 26, + "Distorted Electric Guitar": 29, + "Acoustic Bass": 32, + "Electric Bass": 33, + "Violin": 40, + "Viola": 41, + "Cello": 42, + "Contrabass": 43, + "Orchestral Harp": 46, + "Timpani": 47, + "String Ensemble": 48, + "Synth Strings": 50, + "Choir and Voice": 52, + "Orchestral Hit": 55, + "Trumpet": 56, + "Trombone": 57, + "Tuba": 58, + "French Horn": 60, + "Brass Section": 61, + "Soprano/Alto Sax": 64, + "Tenor Sax": 66, + "Baritone Sax": 67, + "Oboe": 68, + "English Horn": 69, + "Bassoon": 70, + "Clarinet": 71, + "Pipe": 73, + "Synth Lead": 80, + "Synth Pad": 88, + } +) + + +@dataclasses.dataclass +class NoteRepresentationConfig: + """Configuration note representations.""" + + onsets_only: bool + include_ties: bool + + +@dataclasses.dataclass +class NoteEventData: + pitch: int + velocity: Optional[int] = None + program: Optional[int] = None + is_drum: Optional[bool] = None + instrument: Optional[int] = None + + +@dataclasses.dataclass +class NoteEncodingState: + """Encoding state for note transcription, keeping track of active pitches.""" + + # velocity bin for active pitches and programs + active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict) + + +@dataclasses.dataclass +class EventRange: + type: str + min_value: int + max_value: int + + +@dataclasses.dataclass +class Event: + type: str + value: int + + +class Codec: + """Encode and decode events. + + Useful for declaring what certain ranges of a vocabulary should be used for. + This is intended to be used from Python before encoding or after decoding with + GenericTokenVocabulary. This class is more lightweight and does not include + things like EOS or UNK token handling. + + To ensure that 'shift' events are always the first block of the vocab and + start at 0, that event type is required and specified separately. + """ + + def __init__(self, max_shift_steps: int, steps_per_second: float, event_ranges: List[EventRange]): + """Define Codec. + + Args: + max_shift_steps: Maximum number of shift steps that can be encoded. + steps_per_second: Shift steps will be interpreted as having a duration of + 1 / steps_per_second. + event_ranges: Other supported event types and their ranges. + """ + self.steps_per_second = steps_per_second + self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps) + self._event_ranges = [self._shift_range] + event_ranges + # Ensure all event types have unique names. + assert len(self._event_ranges) == len(set([er.type for er in self._event_ranges])) + + @property + def num_classes(self) -> int: + return sum(er.max_value - er.min_value + 1 for er in self._event_ranges) + + # The next couple methods are simplified special case methods just for shift + # events that are intended to be used from within autograph functions. 
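# A minimal sketch of the vocabulary layout the Codec docstring above describes:
# events are packed into contiguous blocks, with single-step "shift" events always
# forming the first block so that shift indices start at 0. The event ranges below
# are illustrative placeholders, not the real MIDI bounds used by this pipeline.
import dataclasses

@dataclasses.dataclass
class ToyEventRange:
    type: str
    min_value: int
    max_value: int

ranges = [
    ToyEventRange("shift", 0, 1000),
    ToyEventRange("pitch", 21, 108),
    ToyEventRange("velocity", 0, 1),
]

def encode_event(event_type, value):
    offset = 0
    for er in ranges:
        if er.type == event_type:
            return offset + value - er.min_value
        offset += er.max_value - er.min_value + 1
    raise ValueError(f"unknown event type: {event_type}")

num_classes = sum(er.max_value - er.min_value + 1 for er in ranges)
print(num_classes)                # 1091
print(encode_event("shift", 5))   # 5, inside the leading shift block
print(encode_event("pitch", 21))  # 1001, the first index after the shift block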
+ + def is_shift_event_index(self, index: int) -> bool: + return (self._shift_range.min_value <= index) and (index <= self._shift_range.max_value) + + @property + def max_shift_steps(self) -> int: + return self._shift_range.max_value + + def encode_event(self, event: Event) -> int: + """Encode an event to an index.""" + offset = 0 + for er in self._event_ranges: + if event.type == er.type: + if not er.min_value <= event.value <= er.max_value: + raise ValueError( + f"Event value {event.value} is not within valid range " + f"[{er.min_value}, {er.max_value}] for type {event.type}" + ) + return offset + event.value - er.min_value + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event type: {event.type}") + + def event_type_range(self, event_type: str) -> Tuple[int, int]: + """Return [min_id, max_id] for an event type.""" + offset = 0 + for er in self._event_ranges: + if event_type == er.type: + return offset, offset + (er.max_value - er.min_value) + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event type: {event_type}") + + def decode_event_index(self, index: int) -> Event: + """Decode an event index to an Event.""" + offset = 0 + for er in self._event_ranges: + if offset <= index <= offset + er.max_value - er.min_value: + return Event(type=er.type, value=er.min_value + index - offset) + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event index: {index}") + + +def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): + """ + equivalent of tf.signal.frame + """ + signal_length = signal.shape[axis] + if pad_end: + frames_overlap = frame_length - frame_step + rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) + pad_size = int(frame_length - rest_samples) + + if pad_size != 0: + pad_axis = [0] * signal.ndim + pad_axis[axis] = pad_size + signal = F.pad(signal, pad_axis, "constant", pad_value) + frames = signal.unfold(axis, frame_length, frame_step) + return frames + + +def program_to_slakh_program(program): + # this is done very hackily, probably should use a custom mapping + for slakh_program in sorted(SLAKH_CLASS_PROGRAMS.values(), reverse=True): + if program >= slakh_program: + return slakh_program + + +def audio_to_frames( + samples, + hop_size: int, + frame_rate: int, +) -> Tuple[Sequence[Sequence[int]], torch.Tensor]: + """Convert audio samples to non-overlapping frames and frame times.""" + frame_size = hop_size + samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant") + + # Split audio into frames. + frames = frame( + torch.Tensor(samples).unsqueeze(0), + frame_length=frame_size, + frame_step=frame_size, + pad_end=False, # TODO check why its off by 1 here when True + ) + + num_frames = len(samples) // frame_size + + times = np.arange(num_frames) / frame_rate + return frames, times + + +def note_sequence_to_onsets_and_offsets_and_programs( + ns: note_seq.NoteSequence, +) -> Tuple[Sequence[float], Sequence[NoteEventData]]: + """Extract onset & offset times and pitches & programs from a NoteSequence. + + The onset & offset times will not necessarily be in sorted order. + + Args: + ns: NoteSequence from which to extract onsets and offsets. + + Returns: + times: A list of note onset and offset times. + values: A list of NoteEventData objects where velocity is zero for note + offsets. + """ + # Sort by program and pitch and put offsets before onsets as a tiebreaker for + # subsequent stable sort. 
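# The frame()/audio_to_frames pair defined above splits audio into non-overlapping
# HOP_SIZE frames and assigns each frame a start time; a small self-contained sketch
# of that unfold-based framing on random samples (constants as at the top of this file).
import numpy as np
import torch

SAMPLE_RATE, HOP_SIZE = 16000, 320
FRAME_RATE = SAMPLE_RATE // HOP_SIZE               # 50 frames per second

samples = np.random.randn(3 * SAMPLE_RATE + 17)    # ~3 s of audio, not a multiple of HOP_SIZE
samples = np.pad(samples, [0, HOP_SIZE - len(samples) % HOP_SIZE], mode="constant")

frames = torch.Tensor(samples).unsqueeze(0).unfold(1, HOP_SIZE, HOP_SIZE)
times = np.arange(frames.shape[1]) / FRAME_RATE    # frame start times in seconds

print(frames.shape)   # torch.Size([1, 151, 320])
print(times[:3])      # [0.   0.02 0.04]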
+ notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) + times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] + values = [ + NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) + for note in notes + if not note.is_drum + ] + [ + NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) + for note in notes + ] + return times, values + + +def note_sequence_to_onsets_and_offsets_and_programs( + ns: note_seq.NoteSequence, +) -> Tuple[Sequence[float], Sequence[NoteEventData]]: + """Extract onset & offset times and pitches & programs from a NoteSequence. + + The onset & offset times will not necessarily be in sorted order. + + Args: + ns: NoteSequence from which to extract onsets and offsets. + + Returns: + times: A list of note onset and offset times. + values: A list of NoteEventData objects where velocity is zero for note + offsets. + """ + # Sort by program and pitch and put offsets before onsets as a tiebreaker for + # subsequent stable sort. + notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) + times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] + values = [ + NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) + for note in notes + if not note.is_drum + ] + [ + NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) + for note in notes + ] + return times, values + + +def num_velocity_bins_from_codec(codec: Codec): + """Get number of velocity bins from event codec.""" + lo, hi = codec.event_type_range("velocity") + return hi - lo + + +def velocity_to_bin(velocity, num_velocity_bins): + if velocity == 0: + return 0 + else: + return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY) + + +def note_event_data_to_events( + state: Optional[NoteEncodingState], + value: NoteEventData, + codec: Codec, +) -> Sequence[Event]: + """Convert note event data to a sequence of events.""" + if value.velocity is None: + # onsets only, no program or velocity + return [Event("pitch", value.pitch)] + else: + num_velocity_bins = num_velocity_bins_from_codec(codec) + velocity_bin = velocity_to_bin(value.velocity, num_velocity_bins) + if value.program is None: + # onsets + offsets + velocities only, no programs + if state is not None: + state.active_pitches[(value.pitch, 0)] = velocity_bin + return [Event("velocity", velocity_bin), Event("pitch", value.pitch)] + else: + if value.is_drum: + # drum events use a separate vocabulary + return [Event("velocity", velocity_bin), Event("drum", value.pitch)] + else: + # program + velocity + pitch + if state is not None: + state.active_pitches[(value.pitch, value.program)] = velocity_bin + return [ + Event("program", value.program), + Event("velocity", velocity_bin), + Event("pitch", value.pitch), + ] + + +def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: + """Output program and pitch events for active notes plus a final tie event.""" + events = [] + for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]): + if state.active_pitches[(pitch, program)]: + events += [Event("program", program), Event("pitch", pitch)] + events.append(Event("tie", 0)) + return events + + +def encode_and_index_events( + state, event_times, event_values, codec, frame_times, encode_event_fn, encoding_state_to_events_fn=None +): + 
"""Encode a sequence of timed events and index to audio frame times. + + Encodes time shifts as repeated single step shifts for later run length + encoding. + + Optionally, also encodes a sequence of "state events", keeping track of the + current encoding state at each audio frame. This can be used e.g. to prepend + events representing the current state to a targets segment. + + Args: + state: Initial event encoding state. + event_times: Sequence of event times. + event_values: Sequence of event values. + encode_event_fn: Function that transforms event value into a sequence of one + or more Event objects. + codec: An Codec object that maps Event objects to indices. + frame_times: Time for every audio frame. + encoding_state_to_events_fn: Function that transforms encoding state into a + sequence of one or more Event objects. + + Returns: + events: Encoded events and shifts. + event_start_indices: Corresponding start event index for every audio frame. + Note: one event can correspond to multiple audio indices due to sampling + rate differences. This makes splitting sequences tricky because the same + event can appear at the end of one sequence and the beginning of + another. + event_end_indices: Corresponding end event index for every audio frame. Used + to ensure when slicing that one chunk ends where the next begins. Should + always be true that event_end_indices[i] = event_start_indices[i + 1]. + state_events: Encoded "state" events representing the encoding state before + each event. + state_event_indices: Corresponding state event index for every audio frame. + """ + indices = np.argsort(event_times, kind="stable") + event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices] + event_values = [event_values[i] for i in indices] + + events = [] + state_events = [] + event_start_indices = [] + state_event_indices = [] + + cur_step = 0 + cur_event_idx = 0 + cur_state_event_idx = 0 + + def fill_event_start_indices_to_cur_step(): + while ( + len(event_start_indices) < len(frame_times) + and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second + ): + event_start_indices.append(cur_event_idx) + state_event_indices.append(cur_state_event_idx) + + for event_step, event_value in zip(event_steps, event_values): + while event_step > cur_step: + events.append(codec.encode_event(Event(type="shift", value=1))) + cur_step += 1 + fill_event_start_indices_to_cur_step() + cur_event_idx = len(events) + cur_state_event_idx = len(state_events) + if encoding_state_to_events_fn: + # Dump state to state events *before* processing the next event, because + # we want to capture the state prior to the occurrence of the event. + for e in encoding_state_to_events_fn(state): + state_events.append(codec.encode_event(e)) + + for e in encode_event_fn(state, event_value, codec): + events.append(codec.encode_event(e)) + + # After the last event, continue filling out the event_start_indices array. + # The inequality is not strict because if our current step lines up exactly + # with (the start of) an audio frame, we need to add an additional shift event + # to "cover" that frame. + while cur_step / codec.steps_per_second <= frame_times[-1]: + events.append(codec.encode_event(Event(type="shift", value=1))) + cur_step += 1 + fill_event_start_indices_to_cur_step() + cur_event_idx = len(events) + + # Now fill in event_end_indices. We need this extra array to make sure that + # when we slice events, each slice ends exactly where the subsequent slice + # begins. 
+ event_end_indices = event_start_indices[1:] + [len(events)] + + events = np.array(events) + state_events = np.array(state_events) + event_start_indices = np.array(event_start_indices) + event_end_indices = np.array(event_end_indices) + state_event_indices = np.array(state_event_indices) + + return { + "inputs": events.astype(np.int32), + "event_start_indices": event_start_indices.astype(np.int32), + "event_end_indices": event_end_indices.astype(np.int32), + "state_events": state_events.astype(np.int32), + "state_event_indices": state_event_indices.astype(np.int32), + } From d8544cb502d5298c4a531d9e0ed1fbec543da180 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Dec 2022 11:45:23 +0100 Subject: [PATCH 050/131] added int to int tokenizer --- .../pipelines/spectrogram_diffusion/data.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py index 5a74a20e3c46..8b20ed117f17 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -95,6 +95,24 @@ class Event: value: int +class Tokenizer: + def __init__(self, regular_ids: int): + # The special tokens: 0=PAD, 1=EOS, and 2=UNK + self._num_special_tokens = 3 + self._num_regular_tokens = regular_ids + + def encode(self, token_ids): + encoded = [] + for token_id in token_ids: + if not 0 <= token_id < self._num_regular_tokens: + raise ValueError( + f"token_id {token_id} does not fall within valid range of " f"[0, {self._num_regular_tokens})" + ) + encoded.append(token_id + self._num_special_tokens) + + return encoded + + class Codec: """Encode and decode events. From 5f628432107a9d31feb5566a367bcb665dee3f44 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 8 Dec 2022 11:46:58 +0100 Subject: [PATCH 051/131] remove duplicate --- .../pipelines/spectrogram_diffusion/data.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py index 8b20ed117f17..47ea7980898d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -268,36 +268,6 @@ def note_sequence_to_onsets_and_offsets_and_programs( return times, values -def note_sequence_to_onsets_and_offsets_and_programs( - ns: note_seq.NoteSequence, -) -> Tuple[Sequence[float], Sequence[NoteEventData]]: - """Extract onset & offset times and pitches & programs from a NoteSequence. - - The onset & offset times will not necessarily be in sorted order. - - Args: - ns: NoteSequence from which to extract onsets and offsets. - - Returns: - times: A list of note onset and offset times. - values: A list of NoteEventData objects where velocity is zero for note - offsets. - """ - # Sort by program and pitch and put offsets before onsets as a tiebreaker for - # subsequent stable sort. 
- notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) - times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] - values = [ - NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) - for note in notes - if not note.is_drum - ] + [ - NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) - for note in notes - ] - return times, values - - def num_velocity_bins_from_codec(codec: Codec): """Get number of velocity bins from event codec.""" lo, hi = codec.event_type_range("velocity") From 505e78a310ef96057cf0e7d9cc2348e08f148e26 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 10:18:46 +0100 Subject: [PATCH 052/131] added logic for segments --- .../pipelines/spectrogram_diffusion/data.py | 46 +++++++++++++------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/data.py index 47ea7980898d..8b5fd1afdfdc 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/data.py @@ -9,6 +9,9 @@ import torch.nn.functional as F +INPUT_FEATURE_LENGTHS = 2048 +TARGET_FEATURE_LENGTHS = 256 + SAMPLE_RATE = 16000 HOP_SIZE = 320 FRAME_RATE = int(SAMPLE_RATE // HOP_SIZE) @@ -110,6 +113,12 @@ def encode(self, token_ids): ) encoded.append(token_id + self._num_special_tokens) + # Add EOS token + encoded.append(1) + + # Pad to till INPUT_FEATURE_LENGTHS + encoded = encoded + [0] * (INPUT_FEATURE_LENGTHS - len(encoded)) + return encoded @@ -274,6 +283,11 @@ def num_velocity_bins_from_codec(codec: Codec): return hi - lo +# segment an array into segments of length n +def segment(a, n): + return [a[i : i + n] for i in range(0, len(a), n)] + + def velocity_to_bin(velocity, num_velocity_bins): if velocity == 0: return 0 @@ -412,16 +426,22 @@ def fill_event_start_indices_to_cur_step(): # begins. 
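# A small sketch of the two preprocessing details introduced here: the tokenizer
# reserves ids 0/1/2 for PAD/EOS/UNK and shifts every regular id up by three,
# appending EOS and right-padding to a fixed input length, while segment() chops
# the per-frame index arrays into fixed-size chunks. Lengths are shrunk for
# readability (the file above uses 2048 and 256).
import numpy as np

PAD, EOS, NUM_SPECIAL_TOKENS = 0, 1, 3
INPUT_LENGTH = 16

def encode(token_ids):
    encoded = [token_id + NUM_SPECIAL_TOKENS for token_id in token_ids]
    encoded.append(EOS)
    return encoded + [PAD] * (INPUT_LENGTH - len(encoded))

print(encode([4, 7, 0]))   # [7, 10, 3, 1, 0, 0, ...]: shifted ids, EOS, then padding

def segment(a, n):
    return [a[i : i + n] for i in range(0, len(a), n)]

print([len(s) for s in segment(np.arange(10), 4)])   # [4, 4, 2]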
event_end_indices = event_start_indices[1:] + [len(events)] - events = np.array(events) - state_events = np.array(state_events) - event_start_indices = np.array(event_start_indices) - event_end_indices = np.array(event_end_indices) - state_event_indices = np.array(state_event_indices) - - return { - "inputs": events.astype(np.int32), - "event_start_indices": event_start_indices.astype(np.int32), - "event_end_indices": event_end_indices.astype(np.int32), - "state_events": state_events.astype(np.int32), - "state_event_indices": state_event_indices.astype(np.int32), - } + events = np.array(events).astype(np.int32) + state_events = np.array(state_events).astype(np.int32) + event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + + outputs = [] + for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): + outputs.append( + { + "inputs": events, + "event_start_indices": start_indices, + "event_end_indices": end_indices, + "state_events": state_events, + "state_event_indices": event_indices, + } + ) + + return outputs From 52f7896a1ee5c9a5208b1aca24a8fd3b1e3e4e3b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 10:49:12 +0100 Subject: [PATCH 053/131] add melgan to pipeline --- scripts/convert_music_spectrogram_to_diffusers.py | 10 ++++++++-- .../pipeline_spectrogram_diffusion.py | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 2d011662dc74..d9ef3340b2e0 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline +from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline, OnnxRuntimeModel from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from music_spectrogram_diffusion import inference from t5x import checkpoints @@ -179,8 +179,14 @@ def main(args): continuous_encoder = load_continuous_encoder(t5_checkpoint["target"]["continuous_encoder"], continuous_encoder) decoder = load_decoder(t5_checkpoint["target"]["decoder"], decoder) + melgan = OnnxRuntimeModel.from_pretrained("kashif/soundstream_mel_decoder") + pipe = SpectrogramDiffusionPipeline( - notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler + notes_encoder=notes_encoder, + continuous_encoder=continuous_encoder, + decoder=decoder, + scheduler=scheduler, + melgan=melgan, ) if args.save: pipe.save_pretrained(args.output_path) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index fe6b87f6e80a..20ff8cc5851f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -18,6 +18,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding +from 
...onnx_utils import OnnxRuntimeModel from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput from ...schedulers import DDPMScheduler @@ -460,6 +461,7 @@ def __init__( continuous_encoder: SpectrogramContEncoder, decoder: T5FilmDecoder, scheduler: DDPMScheduler, + melgan: OnnxRuntimeModel, ) -> None: super().__init__() @@ -468,7 +470,11 @@ def __init__( self.max_value = 4.0 # Largest value for most examples self.register_modules( - notes_encoder=notes_encoder, continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler + notes_encoder=notes_encoder, + continuous_encoder=continuous_encoder, + decoder=decoder, + scheduler=scheduler, + melgan=melgan, ) def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): From 1e267765ae93e3d680d1956546c1299f812d9a03 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 12:45:16 +0100 Subject: [PATCH 054/131] move autoregressive gen into pipeline --- src/diffusers/pipeline_utils.py | 14 -- .../{data.py => midi_utils.py} | 14 +- .../pipeline_spectrogram_diffusion.py | 140 ++++++++++++++---- 3 files changed, 119 insertions(+), 49 deletions(-) rename src/diffusers/pipelines/spectrogram_diffusion/{data.py => midi_utils.py} (98%) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 57627c80df41..01bcc6a33803 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -118,20 +118,6 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray -@dataclass -class MelPipelineOutput(BaseOutput): - """ - Output class for Mel pipelines. - - Args: - mels (`np.ndarray`) - List of denoised samples of shape `(batch_size, time, num_channels)`. Numpy array present the denoised mel - samples of the diffusion pipeline. - """ - - mels: np.ndarray - - def is_safetensors_compatible(info) -> bool: filenames = set(sibling.rfilename for sibling in info.siblings) pt_filenames = set(filename for filename in filenames if filename.endswith(".bin")) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/data.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py similarity index 98% rename from src/diffusers/pipelines/spectrogram_diffusion/data.py rename to src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index 8b5fd1afdfdc..d94b7e1e8777 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/data.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -9,8 +9,8 @@ import torch.nn.functional as F -INPUT_FEATURE_LENGTHS = 2048 -TARGET_FEATURE_LENGTHS = 256 +INPUT_FEATURE_LENGTH = 2048 +TARGET_FEATURE_LENGTH = 256 SAMPLE_RATE = 16000 HOP_SIZE = 320 @@ -116,8 +116,8 @@ def encode(self, token_ids): # Add EOS token encoded.append(1) - # Pad to till INPUT_FEATURE_LENGTHS - encoded = encoded + [0] * (INPUT_FEATURE_LENGTHS - len(encoded)) + # Pad to till INPUT_FEATURE_LENGTH + encoded = encoded + [0] * (INPUT_FEATURE_LENGTH - len(encoded)) return encoded @@ -428,9 +428,9 @@ def fill_event_start_indices_to_cur_step(): events = np.array(events).astype(np.int32) state_events = np.array(state_events).astype(np.int32) - event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) - event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) - state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTHS) + event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + event_end_indices = 
segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) outputs = [] for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 20ff8cc5851f..88d8c2fc3f26 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,9 +1,11 @@ import math from typing import Optional +import numpy as np import torch import torch.nn as nn +import note_seq from transformers.modeling_utils import ModuleUtilsMixin from transformers.models.t5.modeling_t5 import ( T5Attention, @@ -19,9 +21,31 @@ from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding from ...onnx_utils import OnnxRuntimeModel -from ...pipeline_utils import DiffusionPipeline, MelPipelineOutput +from ...pipeline_utils import DiffusionPipeline, AudioPipelineOutput from ...schedulers import DDPMScheduler +from .midi_utils import ( + program_to_slakh_program, + audio_to_frames, + SAMPLE_RATE, + HOP_SIZE, + FRAME_RATE, + DEFAULT_MAX_SHIFT_SECONDS, + DEFAULT_STEPS_PER_SECOND, + DEFAULT_NUM_VELOCITY_BINS, + TARGET_FEATURE_LENGTH, + note_sequence_to_onsets_and_offsets_and_programs, + Codec, + EventRange, + encode_and_index_events, + NoteEncodingState, + note_event_data_to_events, + note_encoding_state_to_events, + NoteRepresentationConfig, + note_representation_processor_chain, + Tokenizer, +) + class FiLMLayer(nn.Module): def __init__(self, in_features, out_features): @@ -468,6 +492,7 @@ def __init__( # From MELGAN self.min_value = math.log(1e-5) # Matches MelGAN training. 
self.max_value = 4.0 # Largest value for most examples + self.n_dims = 128 self.register_modules( notes_encoder=notes_encoder, @@ -526,43 +551,102 @@ def decode(self, encodings_and_masks, input_tokens, noise_time): @torch.no_grad() def __call__( self, - encoder_input_tokens, - encoder_continuous_inputs, - encoder_continuous_mask, + midi_file, generator: Optional[torch.Generator] = None, num_inference_steps: int = 1000, return_dict: bool = True, ): - target_shape = encoder_continuous_inputs.shape - encoder_continuous_inputs = self.scale_features(encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) - - encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens, - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, + ns = note_seq.midi_file_to_note_sequence(midi_file) + ns_sus = note_seq.apply_sustain_control_changes(ns) + + for note in ns_sus.notes: + if not note.is_drum: + note.program = program_to_slakh_program(note.program) + + samples = np.zeros(int(ns_sus.total_time * SAMPLE_RATE)) + + _, frame_times = audio_to_frames(samples, HOP_SIZE, FRAME_RATE) + times, values = note_sequence_to_onsets_and_offsets_and_programs(ns_sus) + + codec = Codec( + max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND, + steps_per_second=DEFAULT_STEPS_PER_SECOND, + event_ranges=[ + EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS), + EventRange("tie", 0, 0), + EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM), + EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + ], + ) + tokenizer = Tokenizer(codec.num_classes) + + events = encode_and_index_events( + state=NoteEncodingState(), + event_times=times, + event_values=values, + frame_times=frame_times, + codec=codec, + encode_event_fn=note_event_data_to_events, + encoding_state_to_events_fn=note_encoding_state_to_events, ) - # Sample gaussian noise to begin loop - x = torch.randn(target_shape, generator=generator) - x = x.to(self.device) + note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True) + events = [note_representation_processor_chain(event, codec, note_representation_config) for event in events] + input_tokens = [tokenizer.encode(event["inputs"]) for event in events] - # set step values - self.scheduler.set_timesteps(num_inference_steps) + pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims]) + full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) - for t in self.progress_bar(self.scheduler.timesteps): - output = self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=x, - noise_time=t / num_inference_steps, # rescale to [0, 1) - ) + for i, encoder_input_tokens in enumerate(input_tokens): + encoder_continuous_inputs = pred_mel[:1] + if i == 0: + # The first chunk has no previous context. + encoder_continuous_mask = np.zeros((1, TARGET_FEATURE_LENGTH)) + else: + # The full song pipeline does not feed in a context feature, so the mask + # will be all 0s after the feature converter. Because we know we're + # feeding in a full context chunk from the previous prediction, set it + # to all 1s. 
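# The loop above generates the song segment by segment: each chunk of note tokens
# gets its own diffusion run, and the predicted mel chunk becomes the context for
# the next chunk (all-zeros mask for the very first chunk, all-ones afterwards).
# A schematic sketch of just that outer loop, with the diffusion step replaced by a
# placeholder denoise() stub and toy token segments.
import numpy as np

TARGET_FEATURE_LENGTH, N_DIMS = 256, 128

def denoise(tokens, context, context_mask):
    # Stand-in for the encode/decode + scheduler loop; returns a dummy mel chunk.
    return np.zeros((1, TARGET_FEATURE_LENGTH, N_DIMS), dtype=np.float32)

token_segments = [np.zeros((1, 2048), dtype=np.int32) for _ in range(3)]

pred_mel = np.zeros((1, TARGET_FEATURE_LENGTH, N_DIMS), dtype=np.float32)
full_pred_mel = np.zeros((1, 0, N_DIMS), dtype=np.float32)

for i, tokens in enumerate(token_segments):
    context_mask = np.full((1, TARGET_FEATURE_LENGTH), i > 0, dtype=bool)
    pred_mel = denoise(tokens, pred_mel[:1], context_mask)
    full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1)

print(full_pred_mel.shape)   # (1, 768, 128)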
+ encoder_continuous_mask = np.ones((1, TARGET_FEATURE_LENGTH)) + + target_shape = encoder_continuous_inputs.shape + encoder_continuous_inputs = self.scale_features( + encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True + ) + + encodings_and_masks = self.encode( + input_tokens=encoder_input_tokens.to(self.device), + continuous_inputs=encoder_continuous_inputs.to(self.device), + continuous_mask=encoder_continuous_mask.to(self.device), + ) + + # Sample gaussian noise to begin loop + x = torch.randn(target_shape, generator=generator) + x = x.to(self.device) + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + + # Denoising diffusion loop + for t in self.progress_bar(self.scheduler.timesteps): + output = self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=x, + noise_time=t / num_inference_steps, # rescale to [0, 1) + ) + + # Compute previous output: x_t -> x_t-1 + x = self.scheduler.step(output, t, x, generator=generator).prev_sample + + mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) + pred_mel = mel.cpu().numpy() - # 2. compute previous output: x_t -> x_t-1 - x = self.scheduler.step(output, t, x, generator=generator).prev_sample + full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) - mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) - mel = mel.cpu().numpy() + full_pred_audio = self.melgan(input_features=full_pred_mel.astype(np.float32)) if not return_dict: - return (mel,) + return (full_pred_audio,) - return MelPipelineOutput(mels=mel) + return AudioPipelineOutput(audios=full_pred_audio) From a643c8b2e6cafbc457626296781fcc474526d756 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 12:56:53 +0100 Subject: [PATCH 055/131] added note_representation_processor_chain --- .../convert_music_spectrogram_to_diffusers.py | 2 +- .../spectrogram_diffusion/midi_utils.py | 213 ++++++++++++++++-- .../pipeline_spectrogram_diffusion.py | 39 ++-- 3 files changed, 216 insertions(+), 38 deletions(-) diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index d9ef3340b2e0..1090e5c31fc7 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from diffusers import DDPMScheduler, SpectrogramDiffusionPipeline, OnnxRuntimeModel +from diffusers import DDPMScheduler, OnnxRuntimeModel, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from music_spectrogram_diffusion import inference from t5x import checkpoints diff --git a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index d94b7e1e8777..0a30bc807d2f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -1,13 +1,29 @@ -from typing import Sequence, Tuple, Optional, MutableMapping, List, Callable, Mapping, Any +# Copyright 2022 The Music Spectrogram Diffusion Authors. +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import dataclasses import math -from immutabledict import immutabledict +from typing import Any, Callable, List, Mapping, MutableMapping, Optional, Sequence, Tuple import numpy as np -import note_seq import torch import torch.nn.functional as F +import note_seq +from immutabledict import immutabledict + INPUT_FEATURE_LENGTH = 2048 TARGET_FEATURE_LENGTH = 256 @@ -109,7 +125,7 @@ def encode(self, token_ids): for token_id in token_ids: if not 0 <= token_id < self._num_regular_tokens: raise ValueError( - f"token_id {token_id} does not fall within valid range of " f"[0, {self._num_regular_tokens})" + f"token_id {token_id} does not fall within valid range of [0, {self._num_regular_tokens})" ) encoded.append(token_id + self._num_special_tokens) @@ -125,13 +141,12 @@ def encode(self, token_ids): class Codec: """Encode and decode events. - Useful for declaring what certain ranges of a vocabulary should be used for. - This is intended to be used from Python before encoding or after decoding with - GenericTokenVocabulary. This class is more lightweight and does not include - things like EOS or UNK token handling. + Useful for declaring what certain ranges of a vocabulary should be used for. This is intended to be used from + Python before encoding or after decoding with GenericTokenVocabulary. This class is more lightweight and does not + include things like EOS or UNK token handling. - To ensure that 'shift' events are always the first block of the vocab and - start at 0, that event type is required and specified separately. + To ensure that 'shift' events are always the first block of the vocab and start at 0, that event type is required + and specified separately. 
""" def __init__(self, max_shift_steps: int, steps_per_second: float, event_ranges: List[EventRange]): @@ -199,6 +214,39 @@ def decode_event_index(self, index: int) -> Event: raise ValueError(f"Unknown event index: {index}") +@dataclasses.dataclass +class ProgramGranularity: + # both tokens_map_fn and program_map_fn should be idempotent + tokens_map_fn: Callable[[Sequence[int], Codec], Sequence[int]] + program_map_fn: Callable[[int], int] + + +def drop_programs(tokens, codec: Codec): + """Drops program change events from a token sequence.""" + min_program_id, max_program_id = codec.event_type_range("program") + return tokens[(tokens < min_program_id) | (tokens > max_program_id)] + + +def programs_to_midi_classes(tokens, codec): + """Modifies program events to be the first program in the MIDI class.""" + min_program_id, max_program_id = codec.event_type_range("program") + is_program = (tokens >= min_program_id) & (tokens <= max_program_id) + return tf.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) + + +PROGRAM_GRANULARITIES = { + # "flat" granularity; drop program change tokens and set NoteSequence + # programs to zero + "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), + # map each program to the first program in its MIDI class + "midi_class": ProgramGranularity( + tokens_map_fn=programs_to_midi_classes, program_map_fn=lambda program: 8 * (program // 8) + ), + # leave programs as is + "full": ProgramGranularity(tokens_map_fn=lambda tokens, codec: tokens, program_map_fn=lambda program: program), +} + + def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): """ equivalent of tf.signal.frame @@ -258,8 +306,8 @@ def note_sequence_to_onsets_and_offsets_and_programs( ns: NoteSequence from which to extract onsets and offsets. Returns: - times: A list of note onset and offset times. - values: A list of NoteEventData objects where velocity is zero for note + times: A list of note onset and offset times. values: A list of NoteEventData objects where velocity is zero for + note offsets. """ # Sort by program and pitch and put offsets before onsets as a tiebreaker for @@ -342,12 +390,10 @@ def encode_and_index_events( ): """Encode a sequence of timed events and index to audio frame times. - Encodes time shifts as repeated single step shifts for later run length - encoding. + Encodes time shifts as repeated single step shifts for later run length encoding. - Optionally, also encodes a sequence of "state events", keeping track of the - current encoding state at each audio frame. This can be used e.g. to prepend - events representing the current state to a targets segment. + Optionally, also encodes a sequence of "state events", keeping track of the current encoding state at each audio + frame. This can be used e.g. to prepend events representing the current state to a targets segment. Args: state: Initial event encoding state. @@ -361,15 +407,13 @@ def encode_and_index_events( sequence of one or more Event objects. Returns: - events: Encoded events and shifts. - event_start_indices: Corresponding start event index for every audio frame. - Note: one event can correspond to multiple audio indices due to sampling - rate differences. This makes splitting sequences tricky because the same - event can appear at the end of one sequence and the beginning of + events: Encoded events and shifts. event_start_indices: Corresponding start event index for every audio frame. 
+ Note: one event can correspond to multiple audio indices due to sampling rate differences. This makes + splitting sequences tricky because the same event can appear at the end of one sequence and the beginning of another. event_end_indices: Corresponding end event index for every audio frame. Used - to ensure when slicing that one chunk ends where the next begins. Should - always be true that event_end_indices[i] = event_start_indices[i + 1]. + to ensure when slicing that one chunk ends where the next begins. Should always be true that + event_end_indices[i] = event_start_indices[i + 1]. state_events: Encoded "state" events representing the encoding state before each event. state_event_indices: Corresponding state event index for every audio frame. @@ -445,3 +489,124 @@ def fill_event_start_indices_to_cur_step(): ) return outputs + + +def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"): + """Extract target sequence corresponding to audio token segment.""" + features = features.copy() + start_idx = features["event_start_indices"][0] + end_idx = features["event_end_indices"][-1] + + features[feature_key] = features[feature_key][start_idx:end_idx] + + if state_events_end_token is not None: + # Extract the state events corresponding to the audio start token, and + # prepend them to the targets array. + state_event_start_idx = features["state_event_indices"][0] + state_event_end_idx = state_event_start_idx + 1 + while features["state_events"][state_event_end_idx - 1] != state_events_end_token: + state_event_end_idx += 1 + features[feature_key] = np.concatenate( + [ + features["state_events"][state_event_start_idx:state_event_end_idx], + features[feature_key], + ], + axis=0, + ) + + return features + + +def map_midi_programs( + feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs" +) -> Mapping[str, Any]: + """Apply MIDI program map to token sequences.""" + granularity = PROGRAM_GRANULARITIES[granularity_type] + + feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec) + return feature + + +def run_length_encode_shifts_fn( + features, + codec: Codec, + feature_key: str = "inputs", + state_change_event_types: Sequence[str] = (), +) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: + """Return a function that run-length encodes shifts for a given codec. + + Args: + codec: The Codec to use for shift events. + feature_key: The feature key for which to run-length encode shifts. + state_change_event_types: A list of event types that represent state + changes; tokens corresponding to these event types will be interpreted as state changes and redundant ones + will be removed. + + Returns: + A preprocessing function that run-length encodes single-step shifts. + """ + state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types] + + def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]: + """Combine leading/interior shifts, trim trailing shifts. + + Args: + features: Dict of features to process. + + Returns: + A dict of features. 
+ """ + events = features[feature_key] + + shift_steps = 0 + total_shift_steps = 0 + output = np.array([], dtype=np.int32) + + current_state = np.zeros(len(state_change_event_ranges), dtype=np.int32) + + for event in events: + if codec.is_shift_event_index(event): + shift_steps += 1 + total_shift_steps += 1 + + else: + # If this event is a state change and has the same value as the current + # state, we can skip it entirely. + is_redundant = False + for i, (min_index, max_index) in enumerate(state_change_event_ranges): + if (min_index <= event) and (event <= max_index): + if current_state[i] == event: + is_redundant = True + current_state[i] = event + if is_redundant: + continue + + # Once we've reached a non-shift event, RLE all previous shift events + # before outputting the non-shift event. + if shift_steps > 0: + shift_steps = total_shift_steps + while shift_steps > 0: + output_steps = np.minimum(codec.max_shift_steps, shift_steps) + output = np.concatenate([output, [output_steps]], axis=0) + shift_steps -= output_steps + output = np.concatenate([output, [event]], axis=0) + + features[feature_key] = output + return features + + return run_length_encode_shifts(features) + + +def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig): + tie_token = codec.encode_event(Event("tie", 0)) + state_events_end_token = tie_token if note_representation_config.include_ties else None + + features = extract_sequence_with_indices( + features, state_events_end_token=state_events_end_token, feature_key="inputs" + ) + + features = map_midi_programs(features, codec) + + features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"]) + + return features diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 88d8c2fc3f26..da419c76e8cd 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -1,3 +1,17 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import math from typing import Optional @@ -21,29 +35,28 @@ from ...modeling_utils import ModelMixin from ...models.embeddings import get_timestep_embedding from ...onnx_utils import OnnxRuntimeModel -from ...pipeline_utils import DiffusionPipeline, AudioPipelineOutput +from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline from ...schedulers import DDPMScheduler - from .midi_utils import ( - program_to_slakh_program, - audio_to_frames, - SAMPLE_RATE, - HOP_SIZE, - FRAME_RATE, DEFAULT_MAX_SHIFT_SECONDS, - DEFAULT_STEPS_PER_SECOND, DEFAULT_NUM_VELOCITY_BINS, + DEFAULT_STEPS_PER_SECOND, + FRAME_RATE, + HOP_SIZE, + SAMPLE_RATE, TARGET_FEATURE_LENGTH, - note_sequence_to_onsets_and_offsets_and_programs, Codec, EventRange, - encode_and_index_events, NoteEncodingState, - note_event_data_to_events, - note_encoding_state_to_events, NoteRepresentationConfig, - note_representation_processor_chain, Tokenizer, + audio_to_frames, + encode_and_index_events, + note_encoding_state_to_events, + note_event_data_to_events, + note_representation_processor_chain, + note_sequence_to_onsets_and_offsets_and_programs, + program_to_slakh_program, ) From 202b8105922e24e5e5ead1514df4bcda2649e21c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 13:21:54 +0100 Subject: [PATCH 056/131] fix dtypes --- .../pipeline_spectrogram_diffusion.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index da419c76e8cd..8343a262fef0 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -608,20 +608,20 @@ def __call__( events = [note_representation_processor_chain(event, codec, note_representation_config) for event in events] input_tokens = [tokenizer.encode(event["inputs"]) for event in events] - pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims]) + pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) for i, encoder_input_tokens in enumerate(input_tokens): - encoder_continuous_inputs = pred_mel[:1] + encoder_continuous_inputs = torch.from_numpy(pred_mel[:1].copy()).to(self.device) if i == 0: # The first chunk has no previous context. - encoder_continuous_mask = np.zeros((1, TARGET_FEATURE_LENGTH)) + encoder_continuous_mask = np.zeros((1, TARGET_FEATURE_LENGTH), dtype=np.bool) else: # The full song pipeline does not feed in a context feature, so the mask # will be all 0s after the feature converter. Because we know we're # feeding in a full context chunk from the previous prediction, set it # to all 1s. 
- encoder_continuous_mask = np.ones((1, TARGET_FEATURE_LENGTH)) + encoder_continuous_mask = np.ones((1, TARGET_FEATURE_LENGTH), dtype=np.bool) target_shape = encoder_continuous_inputs.shape encoder_continuous_inputs = self.scale_features( @@ -629,9 +629,9 @@ def __call__( ) encodings_and_masks = self.encode( - input_tokens=encoder_input_tokens.to(self.device), - continuous_inputs=encoder_continuous_inputs.to(self.device), - continuous_mask=encoder_continuous_mask.to(self.device), + input_tokens=torch.IntTensor([encoder_input_tokens]).to(self.device), + continuous_inputs=encoder_continuous_inputs, + continuous_mask=torch.from_numpy(encoder_continuous_mask.copy()).to(self.device), ) # Sample gaussian noise to begin loop @@ -656,6 +656,7 @@ def __call__( pred_mel = mel.cpu().numpy() full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) + print("Generated segment", i) full_pred_audio = self.melgan(input_features=full_pred_mel.astype(np.float32)) From 085d766a8f72c7ae3d5abe5d1cb5ace0ec2e982f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 13:46:57 +0100 Subject: [PATCH 057/131] remove immutabledict req --- .../spectrogram_diffusion/midi_utils.py | 75 +++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index 0a30bc807d2f..215ead82c90d 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -22,7 +22,6 @@ import torch.nn.functional as F import note_seq -from immutabledict import immutabledict INPUT_FEATURE_LENGTH = 2048 @@ -36,44 +35,42 @@ DEFAULT_MAX_SHIFT_SECONDS = 10 DEFAULT_NUM_VELOCITY_BINS = 1 -SLAKH_CLASS_PROGRAMS = immutabledict( - { - "Acoustic Piano": 0, - "Electric Piano": 4, - "Chromatic Percussion": 8, - "Organ": 16, - "Acoustic Guitar": 24, - "Clean Electric Guitar": 26, - "Distorted Electric Guitar": 29, - "Acoustic Bass": 32, - "Electric Bass": 33, - "Violin": 40, - "Viola": 41, - "Cello": 42, - "Contrabass": 43, - "Orchestral Harp": 46, - "Timpani": 47, - "String Ensemble": 48, - "Synth Strings": 50, - "Choir and Voice": 52, - "Orchestral Hit": 55, - "Trumpet": 56, - "Trombone": 57, - "Tuba": 58, - "French Horn": 60, - "Brass Section": 61, - "Soprano/Alto Sax": 64, - "Tenor Sax": 66, - "Baritone Sax": 67, - "Oboe": 68, - "English Horn": 69, - "Bassoon": 70, - "Clarinet": 71, - "Pipe": 73, - "Synth Lead": 80, - "Synth Pad": 88, - } -) +SLAKH_CLASS_PROGRAMS = { + "Acoustic Piano": 0, + "Electric Piano": 4, + "Chromatic Percussion": 8, + "Organ": 16, + "Acoustic Guitar": 24, + "Clean Electric Guitar": 26, + "Distorted Electric Guitar": 29, + "Acoustic Bass": 32, + "Electric Bass": 33, + "Violin": 40, + "Viola": 41, + "Cello": 42, + "Contrabass": 43, + "Orchestral Harp": 46, + "Timpani": 47, + "String Ensemble": 48, + "Synth Strings": 50, + "Choir and Voice": 52, + "Orchestral Hit": 55, + "Trumpet": 56, + "Trombone": 57, + "Tuba": 58, + "French Horn": 60, + "Brass Section": 61, + "Soprano/Alto Sax": 64, + "Tenor Sax": 66, + "Baritone Sax": 67, + "Oboe": 68, + "English Horn": 69, + "Bassoon": 70, + "Clarinet": 71, + "Pipe": 73, + "Synth Lead": 80, + "Synth Pad": 88, +} @dataclasses.dataclass From 3edc9e19de85e4c3d9a6c5720ed8f3ba80473906 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 9 Dec 2022 14:11:59 +0100 Subject: [PATCH 058/131] initial doc --- .../api/pipelines/spectrogram_diffusion.mdx | 32 
+++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 docs/source/api/pipelines/spectrogram_diffusion.mdx diff --git a/docs/source/api/pipelines/spectrogram_diffusion.mdx b/docs/source/api/pipelines/spectrogram_diffusion.mdx new file mode 100644 index 000000000000..e38b43043e51 --- /dev/null +++ b/docs/source/api/pipelines/spectrogram_diffusion.mdx @@ -0,0 +1,32 @@ + + + # Multi-instrument Music Synthesis with Spectrogram Diffusion + + ## Overview + +[Spectrogram Diffusion](https://arxiv.org/abs/2206.05408) by Hawthorne et al. + +An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fréchet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes. + +The original codebase of this implementation can be found [here](https://github.com/magenta/music-spectrogram-diffusion). 
+ +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_spectrogram_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion) | *Unconditional Audio Generation* | - | + + +## SpectrogramDiffusionPipeline +[[autodoc]] SpectrogramDiffusionPipeline + - __call__ From 5472ef576664e36cefe06311bcb632faa753f590 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 14 Dec 2022 16:50:13 +0100 Subject: [PATCH 059/131] use np.where --- src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py index 215ead82c90d..5d0114a1549f 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -228,7 +228,7 @@ def programs_to_midi_classes(tokens, codec): """Modifies program events to be the first program in the MIDI class.""" min_program_id, max_program_id = codec.event_type_range("program") is_program = (tokens >= min_program_id) & (tokens <= max_program_id) - return tf.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) + return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) PROGRAM_GRANULARITIES = { From 87b5914d987ef33966b2cac57a23b8bf6b421f42 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 15:56:19 +0100 Subject: [PATCH 060/131] require note_seq --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f1aafd85658f..14fdd469cb39 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,7 @@ "k-diffusion", "librosa", "modelcards>=0.1.4", + "note_seq", "numpy", "parameterized", "pytest", From cf24a45976d09baabdf458592e196f85f1053da9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 15:57:09 +0100 Subject: [PATCH 061/131] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 14fdd469cb39..b251d853333a 100644 --- a/setup.py +++ b/setup.py @@ -94,7 +94,7 @@ "k-diffusion", "librosa", "modelcards>=0.1.4", - "note_seq", + "note-seq", "numpy", "parameterized", "pytest", From 00465c4d975a0e2698ca0e2b54aacaec0c4860c8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 16:48:58 +0100 Subject: [PATCH 062/131] update dependency --- src/diffusers/dependency_versions_table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 1ef1edc14629..fc46ec3730d5 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -18,6 +18,7 @@ "k-diffusion": "k-diffusion", "librosa": "librosa", "modelcards": "modelcards>=0.1.4", + "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", "pytest": "pytest", From cd097b488894f92993573c60dce89e926c2a4b09 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 19 Dec 2022 16:55:15 +0100 Subject: [PATCH 063/131] added note-seq to test --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index b251d853333a..51e2b91101b6 100644 --- a/setup.py +++ b/setup.py @@ -186,6 +186,7 @@ def run(self): "datasets", "k-diffusion", "librosa", + "note-seq", "parameterized", "pytest", "pytest-timeout", From 
04ac770efaeb0b128a5d0840cdbb68f98f8432b6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 20 Dec 2022 17:50:40 +0100 Subject: [PATCH 064/131] added is_note_seq_available --- src/diffusers/__init__.py | 9 +++++++++ src/diffusers/pipelines/__init__.py | 10 +++++++++- src/diffusers/utils/__init__.py | 1 + .../utils/dummy_torch_and_note_seq_objects.py | 19 +++++++++++++++++++ src/diffusers/utils/import_utils.py | 18 ++++++++++++++++++ 5 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/utils/dummy_torch_and_note_seq_objects.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 2c8ac5e9a466..685d248c649f 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -8,6 +8,7 @@ is_inflect_available, is_k_diffusion_available, is_librosa_available, + is_note_seq_available, is_onnx_available, is_scipy_available, is_torch_available, @@ -144,6 +145,14 @@ else: from .pipelines import AudioDiffusionPipeline, Mel +try: + if not (is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_note_seq_objects import * # noqa F403 +else: + from .pipelines import SpectrogramDiffusionPipeline + try: if not is_flax_available(): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b2936c9fa08a..50bb4abb02ff 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -3,6 +3,7 @@ is_flax_available, is_k_diffusion_available, is_librosa_available, + is_note_seq_available, is_onnx_available, is_torch_available, is_transformers_available, @@ -23,7 +24,6 @@ from .pndm import PNDMPipeline from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline - from .spectrogram_diffusion import SpectrogramDiffusionPipeline from .stochastic_karras_ve import KarrasVePipeline try: @@ -34,6 +34,14 @@ else: from .audio_diffusion import AudioDiffusionPipeline, Mel +try: + if not (is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_note_seq_objects import * # noqa F403 +else: + from .spectrogram_diffusion import SpectrogramDiffusionPipeline + try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index e5a4d323e3eb..e6336f037e9e 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -33,6 +33,7 @@ is_k_diffusion_available, is_librosa_available, is_modelcards_available, + is_note_seq_available, is_onnx_available, is_safetensors_available, is_scipy_available, diff --git a/src/diffusers/utils/dummy_torch_and_note_seq_objects.py b/src/diffusers/utils/dummy_torch_and_note_seq_objects.py new file mode 100644 index 000000000000..288bec68ef2a --- /dev/null +++ b/src/diffusers/utils/dummy_torch_and_note_seq_objects.py @@ -0,0 +1,19 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
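+# (When either torch or note_seq is unavailable, this dummy class is imported in
+# place of the real SpectrogramDiffusionPipeline, so `from diffusers import
+# SpectrogramDiffusionPipeline` still succeeds; a clear installation error is only
+# raised via `requires_backends` when the class is actually used.)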
+# flake8: noqa + +from ..utils import DummyObject, requires_backends + + +class SpectrogramDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "note_seq"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "note_seq"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "note_seq"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "note_seq"]) diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index ad3ab69f66b8..7dad57443eda 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -217,6 +217,13 @@ except importlib_metadata.PackageNotFoundError: _k_diffusion_available = False +_note_seq_available = importlib.util.find_spec("note_seq") is not None +try: + _note_seq_version = importlib_metadata.version("note_seq") + logger.debug(f"Successfully imported note-seq version {_note_seq_version}") +except importlib_metadata.PackageNotFoundError: + _note_seq_available = False + def is_torch_available(): return _torch_available @@ -274,6 +281,10 @@ def is_k_diffusion_available(): return _k_diffusion_available +def is_note_seq_available(): + return _note_seq_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -328,6 +339,12 @@ def is_k_diffusion_available(): install k-diffusion` """ +# docstyle-ignore +NOTE_SEQ_IMPORT_ERROR = """ +{0} requires the note-seq library but it was not found in your environment. You can install it with pip: `pip +install note-seq` +""" + BACKENDS_MAPPING = OrderedDict( [ @@ -340,6 +357,7 @@ def is_k_diffusion_available(): ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), + ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), ] ) From 2afaf2768e9013c378b851ceb2662a90bf9c2f33 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 20 Dec 2022 19:04:52 +0100 Subject: [PATCH 065/131] fix import --- src/diffusers/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 685d248c649f..37edc5378a6d 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -55,7 +55,6 @@ PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline, - SpectrogramDiffusionPipeline, ) from .schedulers import ( DDIMScheduler, From 3acb123a08bd26936b96f92d08e719173a680aae Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Dec 2022 15:48:18 +0100 Subject: [PATCH 066/131] added toc --- docs/source/_toctree.yml | 2 ++ src/diffusers/utils/dummy_pt_objects.py | 15 --------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 52ad170dc33a..9ade1b88d3e9 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -132,6 +132,8 @@ title: "RePaint" - local: api/pipelines/audio_diffusion title: "Audio Diffusion" + - local: api/pipelines/spectrogram_diffusion + title: "Spectrogram Diffusion" title: "Pipelines" - sections: - local: api/schedulers/overview diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 1e4e9d070448..63a7d258a902 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -332,21 +332,6 @@ 
def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) -class SpectrogramDiffusionPipeline(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - class DDIMScheduler(metaclass=DummyObject): _backends = ["torch"] From b9d0842a9d8d32be2fc3d8a2c3c70fd394a3b02d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 30 Dec 2022 17:46:22 +0100 Subject: [PATCH 067/131] added example usage --- .../api/pipelines/spectrogram_diffusion.mdx | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/source/api/pipelines/spectrogram_diffusion.mdx b/docs/source/api/pipelines/spectrogram_diffusion.mdx index e38b43043e51..a5117d90b067 100644 --- a/docs/source/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/api/pipelines/spectrogram_diffusion.mdx @@ -18,7 +18,7 @@ specific language governing permissions and limitations under the License. An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fréchet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes. -The original codebase of this implementation can be found [here](https://github.com/magenta/music-spectrogram-diffusion). +The original codebase of this implementation can be found at [magenta/music-spectrogram-diffusion)](https://github.com/magenta/music-spectrogram-diffusion). ## Available Pipelines: @@ -27,6 +27,20 @@ The original codebase of this implementation can be found [here](https://github. 
| [pipeline_spectrogram_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion) | *Unconditional Audio Generation* | - | +## Example usage + +```python +from diffusers import SpectrogramDiffusionPipeline + +pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") +pipe = pipe.to("cuda") + +output = pipe("beethoven_hammerklavier_2.mid") + +audio = output.audios[0] +``` + ## SpectrogramDiffusionPipeline [[autodoc]] SpectrogramDiffusionPipeline - - __call__ + - all + - __call__ From f3b4ad4c62f13fa651205afe214298531535f57c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 18 Jan 2023 13:24:32 +0100 Subject: [PATCH 068/131] undo for now --- docs/source/_toctree.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 9ade1b88d3e9..52ad170dc33a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -132,8 +132,6 @@ title: "RePaint" - local: api/pipelines/audio_diffusion title: "Audio Diffusion" - - local: api/pipelines/spectrogram_diffusion - title: "Spectrogram Diffusion" title: "Pipelines" - sections: - local: api/schedulers/overview From 50908b82874c4ca031372b9455b34476c2fbfb0c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 18 Jan 2023 13:31:19 +0100 Subject: [PATCH 069/131] moved docs --- docs/source/en/_toctree.yml | 2 ++ docs/source/{ => en}/api/pipelines/spectrogram_diffusion.mdx | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) rename docs/source/{ => en}/api/pipelines/spectrogram_diffusion.mdx (97%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2c0d94fcc16b..3ee18bd7ecfb 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -120,6 +120,8 @@ title: Safe Stable Diffusion - local: api/pipelines/score_sde_ve title: Score SDE VE + - local: api/pipelines/spectrogram_diffusion + title: "Spectrogram Diffusion" - sections: - local: api/pipelines/stable_diffusion/overview title: Overview diff --git a/docs/source/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx similarity index 97% rename from docs/source/api/pipelines/spectrogram_diffusion.mdx rename to docs/source/en/api/pipelines/spectrogram_diffusion.mdx index a5117d90b067..816d729e4c27 100644 --- a/docs/source/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -1,4 +1,4 @@ - - # Multi-instrument Music Synthesis with Spectrogram Diffusion +# Multi-instrument Music Synthesis with Spectrogram Diffusion - ## Overview +## Overview [Spectrogram Diffusion](https://arxiv.org/abs/2206.05408) by Curtis Hawthorne, Ian Simon, Adam Roberts, Neil Zeghidour, Josh Gardner, Ethan Manilow, and Jesse Engel. 
@@ -41,6 +41,7 @@ from diffusers import SpectrogramDiffusionPipeline pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") pipe = pipe.to("cuda") +# Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid output = pipe("beethoven_hammerklavier_2.mid") audio = output.audios[0] From 2a38f7648e53f30a6db385fde0e1421ef975864b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 13:54:15 +0100 Subject: [PATCH 115/131] Update src/diffusers/models/t5_film_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/t5_film_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/t5_film_transformer.py b/src/diffusers/models/t5_film_transformer.py index dcefaec760ae..f9bcb06ebc0b 100644 --- a/src/diffusers/models/t5_film_transformer.py +++ b/src/diffusers/models/t5_film_transformer.py @@ -17,7 +17,7 @@ from torch import nn from ..configuration_utils import ConfigMixin, register_to_config -from ..models.attention_processor import Attention +from .attention_processor import Attention from .embeddings import get_timestep_embedding from .modeling_utils import ModelMixin From 96111b2306e87318cbb1058d587bd689275c1337 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 13:54:47 +0100 Subject: [PATCH 116/131] Update src/diffusers/models/t5_film_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/t5_film_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/t5_film_transformer.py b/src/diffusers/models/t5_film_transformer.py index f9bcb06ebc0b..1c41e656a9db 100644 --- a/src/diffusers/models/t5_film_transformer.py +++ b/src/diffusers/models/t5_film_transformer.py @@ -1,4 +1,4 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
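The documentation example touched in the surrounding patches returns raw audio from the pipeline. As a minimal follow-up sketch (not part of the docs page itself), the generated array can be written to a WAV file; this assumes `output` is the result of the `pipe(processor(...))` call shown in the example and reuses the 16 kHz sample rate that the integration tests use:

```python
import scipy.io.wavfile

# output.audios[0] carries a leading batch dimension; index it away before
# writing, as the integration tests do, and save at 16 kHz.
audio = output.audios[0]
scipy.io.wavfile.write("beethoven_hammerklavier_2.wav", 16_000, audio[0])
```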
From 17dbe1d2bc3c07dc5b0dabdb26d0117c7354cdb7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:11 +0100 Subject: [PATCH 117/131] Update docs/source/en/api/pipelines/spectrogram_diffusion.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index 0ab8286bfc80..2f33bf1aaea4 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -38,7 +38,7 @@ As depicted above the model takes as input a MIDI file and tokenizes it into a s ```python from diffusers import SpectrogramDiffusionPipeline -pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") +pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to("cuda") # Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid From dd9f8ca51c61e8dfc60dded09c7e654a25365985 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:28 +0100 Subject: [PATCH 118/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index b44d1735a949..c2e3990b22ec 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -174,7 +174,7 @@ def test_callback(self): # so that music can be played live device = torch_device - pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion", melgan=None) + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") melgan = OnnxRuntimeModel.from_pretrained("kashif/soundstream_mel_decoder") pipe = pipe.to(device) From 654c79669c1644539cf5e36c125c23ac6eabe569 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:43 +0100 Subject: [PATCH 119/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index c2e3990b22ec..5698bb6ef22f 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -175,7 +175,8 @@ def test_callback(self): device = torch_device pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - melgan = OnnxRuntimeModel.from_pretrained("kashif/soundstream_mel_decoder") + melgan = pipe.melgan + pipe.melgan = None pipe = pipe.to(device) pipe.set_progress_bar_config(disable=None) From 9a8a93dd7c3b2fe9a4638c9da4a85919160b03a7 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:05:54 +0100 Subject: [PATCH 120/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- 
.../spectrogram_diffusion/test_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 5698bb6ef22f..cd11427dd7b0 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -217,7 +217,7 @@ def test_spectrogram_fast(self): def test_spectrogram(self): device = torch_device - pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to(device) pipe.set_progress_bar_config(disable=None) From ebb8e9a77a95777da6bf9eabe8161ea28b473448 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:06:08 +0100 Subject: [PATCH 121/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py Co-authored-by: Patrick von Platen --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index cd11427dd7b0..587b166de01a 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -198,7 +198,7 @@ def callback(step, mel_output): def test_spectrogram_fast(self): device = torch_device - pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to(device) pipe.set_progress_bar_config(disable=None) processor = MidiProcessor() From 3a944769c11d17fc35f6877cc45308b1d47f6365 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:28:24 +0100 Subject: [PATCH 122/131] add MidiProcessor --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index 2f33bf1aaea4..b2ed410d2896 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -36,13 +36,15 @@ As depicted above the model takes as input a MIDI file and tokenizes it into a s ## Example usage ```python -from diffusers import SpectrogramDiffusionPipeline +from diffusers import SpectrogramDiffusionPipeline, MidiProcessor -pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") +pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") pipe = pipe.to("cuda") # Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid -output = pipe("beethoven_hammerklavier_2.mid") +processor = MidiProcessor() + +output = pipe(processor("beethoven_hammerklavier_2.mid")) audio = output.audios[0] ``` From 7c43be8936c0ebf23af2bfcfe08c189db00ff9c1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:31:05 +0100 Subject: [PATCH 123/131] format --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx 
b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index b2ed410d2896..e9fb1c282c23 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -40,10 +40,9 @@ from diffusers import SpectrogramDiffusionPipeline, MidiProcessor pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") pipe = pipe.to("cuda") - -# Download via: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid processor = MidiProcessor() +# Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid output = pipe(processor("beethoven_hammerklavier_2.mid")) audio = output.audios[0] From 6dcd3f7a204c224a1ee4ad8c2631fd2c9d224f36 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 14:42:40 +0100 Subject: [PATCH 124/131] fix org --- docs/source/en/api/pipelines/spectrogram_diffusion.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx index e9fb1c282c23..c98300fe791f 100644 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.mdx +++ b/docs/source/en/api/pipelines/spectrogram_diffusion.mdx @@ -38,7 +38,7 @@ As depicted above the model takes as input a MIDI file and tokenizes it into a s ```python from diffusers import SpectrogramDiffusionPipeline, MidiProcessor -pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion") +pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe = pipe.to("cuda") processor = MidiProcessor() From 17b7481962ecb2f97dcb3bd2aee1daee6ed51a66 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 21 Mar 2023 15:24:11 +0100 Subject: [PATCH 125/131] Apply suggestions from code review --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 587b166de01a..e7ee2784ccdb 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -234,6 +234,3 @@ def test_spectrogram(self): audio = output.audios[0] assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2 - audio = output.audios[0] - rate = 16_000 - scipy.io.wavfile.write("/home/patrick_huggingface_co/audios/beet.wav", rate, audio[0]) From 458e7b77421af4bb755ca2a5bc8a78d1029e7af0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 21 Mar 2023 15:24:32 +0100 Subject: [PATCH 126/131] Update tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py --- .../spectrogram_diffusion/test_spectrogram_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index e7ee2784ccdb..f8ddc66c05dc 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -233,4 +233,3 @@ def test_spectrogram(self): audio = output.audios[0] assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2 - From 4f27f66eb9c46a3c1593c252d16807890d9afb6f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:31:14 +0100 Subject: [PATCH 127/131] make style --- 
.../spectrogram_diffusion/test_spectrogram_diffusion.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index f8ddc66c05dc..ed9df3a56b1d 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -17,21 +17,17 @@ import unittest import numpy as np -import scipy import torch from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device -from diffusers.utils.testing_utils import is_onnx_available, require_note_seq, require_onnxruntime +from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime from ...pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS from ...test_pipelines_common import PipelineTesterMixin -if is_onnx_available(): - from diffusers import OnnxRuntimeModel - torch.backends.cuda.matmul.allow_tf32 = False From 76a28c1981f3c20a6152a78cf7da6e4a3b1a453b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:47:43 +0100 Subject: [PATCH 128/131] pin protobuf to <4 --- setup.py | 1 + src/diffusers/dependency_versions_table.py | 1 + 2 files changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 5cc48c6caa19..6a37ede4a5ab 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ "note-seq", "numpy", "parameterized", + "protobuf >=3.20.3,<4", "pytest", "pytest-timeout", "pytest-xdist", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 1ac669a36753..787ce508b08b 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,6 +22,7 @@ "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", + "protobuf ": "protobuf >=3.20.3,<4", "pytest": "pytest", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", From 7339d379e84f3fd2a994fa1a3703449c19949e62 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:52:26 +0100 Subject: [PATCH 129/131] fix formatting --- setup.py | 2 +- src/diffusers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 6a37ede4a5ab..0ad2ed6c3b3c 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ "note-seq", "numpy", "parameterized", - "protobuf >=3.20.3,<4", + "protobuf>=3.20.3,<4", "pytest", "pytest-timeout", "pytest-xdist", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 787ce508b08b..4db1afba2fcd 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,7 +22,7 @@ "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", - "protobuf ": "protobuf >=3.20.3,<4", + "protobuf ": "protobuf>=3.20.3,<4", "pytest": "pytest", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", From f71b15508e2934d62af645d831ff81bdfbd81cbd Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 15:54:27 +0100 Subject: [PATCH 130/131] white space --- src/diffusers/dependency_versions_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 4db1afba2fcd..1269cf1578a6 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,7 +22,7 @@ "note-seq": "note-seq", "numpy": "numpy", "parameterized": "parameterized", - "protobuf ": "protobuf>=3.20.3,<4", + "protobuf": "protobuf>=3.20.3,<4", "pytest": "pytest", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", From e5225a3920f203b43563c872655e8ce43acabaac Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 21 Mar 2023 16:22:27 +0100 Subject: [PATCH 131/131] tensorboard needs protobuf --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0ad2ed6c3b3c..972f9a5b4a24 100644 --- a/setup.py +++ b/setup.py @@ -184,7 +184,7 @@ def run(self): extras = {} extras["quality"] = deps_list("black", "isort", "ruff", "hf-doc-builder") extras["docs"] = deps_list("hf-doc-builder") -extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "Jinja2") +extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2") extras["test"] = deps_list( "compel", "datasets",