From 24a07ab6e6dd339941da0b8334227262bc2fab8f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 01:30:16 +0100 Subject: [PATCH 01/31] tts : implement mimi decoder --- .gitignore | 1 + common/common.cpp | 28 + common/common.h | 22 + examples/tts/CMakeLists.txt | 6 + examples/tts/README-mimi.md | 50 ++ examples/tts/convert_mimi_to_gguf.py | 191 +++++++ examples/tts/mimi.cpp | 770 +++++++++++++++++++++++++++ 7 files changed, 1068 insertions(+) create mode 100644 examples/tts/README-mimi.md create mode 100644 examples/tts/convert_mimi_to_gguf.py create mode 100644 examples/tts/mimi.cpp diff --git a/.gitignore b/.gitignore index 2c67ad7f7c609..41fe1f31271d2 100644 --- a/.gitignore +++ b/.gitignore @@ -107,6 +107,7 @@ examples/server/*.gz.hpp !examples/*/*/*.kts !examples/sycl/*.bat !examples/sycl/*.sh +/*.wav # Server Web UI temporary files node_modules diff --git a/common/common.cpp b/common/common.cpp index 18ffb4e738aee..30870980a148d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2055,3 +2055,31 @@ common_grammar_trigger common_grammar_trigger::from_json(const json & in) { } return out; } + +// +// Audio utils +// + +bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str()); + return false; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), 
//
// Audio utils
//

// Canonical 44-byte RIFF/WAVE header for uncompressed PCM audio.
//
// save_wav16() writes this struct to disk byte-for-byte, so the member order
// and sizes ARE the file format: do not reorder, insert or resize members.
// Every field has a default so that a default-constructed header never
// contains indeterminate bytes.
struct wav_header {
    char     riff[4]         = {'R', 'I', 'F', 'F'};
    uint32_t chunk_size      = 0;  // file size minus 8; filled in by the writer
    char     wave[4]         = {'W', 'A', 'V', 'E'};
    char     fmt[4]          = {'f', 'm', 't', ' '};
    uint32_t fmt_chunk_size  = 16;
    uint16_t audio_format    = 1;  // PCM
    uint16_t num_channels    = 1;  // Mono
    uint32_t sample_rate     = 0;  // filled in by the writer
    uint32_t byte_rate       = 0;  // sample_rate * num_channels * bytes per sample
    uint16_t block_align     = 0;  // num_channels * bytes per sample
    uint16_t bits_per_sample = 16;
    char     data[4]         = {'d', 'a', 't', 'a'};
    uint32_t data_size       = 0;  // payload size in bytes; filled in by the writer
};

// The struct is dumped raw into the file; fail the build if a compiler ever
// inserts padding and silently corrupts the header.
static_assert(sizeof(wav_header) == 44, "wav_header must match the 44-byte RIFF/WAVE PCM header");

// Save mono float samples as a 16-bit PCM WAV file; returns false on I/O failure.
bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate);
import gguf
import argparse
import logging
import torch
from typing import Union
from pathlib import Path
from torch import Tensor
from transformers import MimiModel

logger = logging.getLogger("mimi")


class MimiModelConverter:
    """Load a Mimi checkpoint through `transformers` and re-emit its tensors as GGUF.

    All tensor conversion happens in the constructor; `write()` only flushes
    the already-prepared tensors to `fname_out`.
    """
    mimi_model: MimiModel
    gguf_writer: gguf.GGUFWriter
    fname_out: Path
    ftype: gguf.LlamaFileType

    def __init__(self,
                 pretrained_model_name_or_path: Union[Path, str],
                 fname_out: Path,
                 ftype: gguf.LlamaFileType,
                 is_big_endian: bool,):
        """Load the model and queue every state-dict tensor on the GGUF writer.

        Args:
            pretrained_model_name_or_path: forwarded unchanged to
                `MimiModel.from_pretrained`.
            fname_out: output path, used later by `write()`.
            ftype: target file type; selects the quantization of quantizable tensors.
            is_big_endian: write the GGUF file in big-endian byte order.
        """
        self.mimi_model = MimiModel.from_pretrained(pretrained_model_name_or_path)
        self.fname_out = fname_out
        self.ftype = ftype
        endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        # the output path is supplied later, in write()
        self.gguf_writer = gguf.GGUFWriter(
            path=None,
            arch="if you see this, you are using the wrong file",
            endianess=endianess)

        assert self.mimi_model.config.architectures[0] == "MimiModel"

        # load tensors
        for name, data_torch in self.mimi_model.state_dict().items():
            # convert any unsupported data types to float32
            old_dtype = data_torch.dtype
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            self.add_tensor(name, data_torch, old_dtype)

    def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype):
        """Transform one checkpoint tensor and queue it on the GGUF writer.

        Args:
            name: tensor name as it appears in the model's state dict.
            data_torch: tensor data (already float16 or float32).
            old_dtype: dtype before any conversion; used for logging only.

        Raises:
            ValueError: if `self.ftype` is not one of F32 / F16 / BF16 / Q8_0.
        """
        # 1-D tensors and biases always stay F32
        is_1d = len(data_torch.shape) == 1
        is_bias = ".bias" in name
        can_quantize = not is_1d and not is_bias
        data_qtype = gguf.GGMLQuantizationType.F32

        n_head = self.mimi_model.config.num_attention_heads
        n_kv_head = self.mimi_model.config.num_key_value_heads
        # Q/K projections get their rows re-ordered, see undo_permute()
        if name.endswith(("q_proj.weight", "q_proj.bias")):
            data_torch = self.undo_permute(data_torch, n_head, n_head)
        if name.endswith(("k_proj.weight", "k_proj.bias")):
            data_torch = self.undo_permute(data_torch, n_head, n_kv_head)

        # process codebook
        if ".codebook.initialized" in name:
            # "initialized" tensor
            # The "initialized" entry is only used as a hook: its data is replaced by
            # the codebook computed from the sibling embed_sum / cluster_usage tensors,
            # and it is stored under the name without the ".initialized" suffix.
            state_dict = self.mimi_model.state_dict()
            embed_sum = state_dict[name.replace(".initialized", ".embed_sum")]
            cluster_usage = state_dict[name.replace(".initialized", ".cluster_usage")]
            # see modeling_mimi.py --> MimiEuclideanCodebook
            data_torch = embed_sum / cluster_usage.clamp(min=self.mimi_model.config.norm_eps)[:, None]
            name = name.replace(".initialized", "")

        # ignore processed tensors
        if ".cluster_usage" in name or ".embed_sum" in name:
            return

        # transpose some tensors
        # conv biases: shape (n,) -> (1, n) -> (n, 1)
        if ".conv.bias" in name:
            data_torch = data_torch.view((1, data_torch.shape[0]))
            data_torch = data_torch.transpose(0, 1)

        # change view 3d to 2d
        if "quantizer" in name and "_proj." in name:
            assert data_torch.shape[2] == 1
            data_torch = data_torch.view((data_torch.shape[0], data_torch.shape[1]))

        # shorten name, otherwise it will be too long for ggml to read
        name = name.replace("_residual_vector_quantizer", "_rvq")

        if can_quantize:
            if self.ftype == gguf.LlamaFileType.ALL_F32:
                data_qtype = gguf.GGMLQuantizationType.F32
            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                data_qtype = gguf.GGMLQuantizationType.F16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                data_qtype = gguf.GGMLQuantizationType.BF16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                data_qtype = gguf.GGMLQuantizationType.Q8_0
            else:
                raise ValueError(f"Unsupported file type: {self.ftype}")

        # Conv kernels are always F16
        # (this overrides whatever the requested file type selected above)
        if ".conv.weight" in name:
            data_qtype = gguf.GGMLQuantizationType.F16

        data = data_torch.numpy()

        try:
            data = gguf.quants.quantize(data, data_qtype)
        except Exception as e:
            # any quantization failure is retried once as F16; a second failure propagates
            logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
            data_qtype = gguf.GGMLQuantizationType.F16
            data = gguf.quants.quantize(data, data_qtype)

        # reverse shape to make it similar to the internal ggml dimension order
        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
        logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

        self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)

    def write(self):
        """Write header, key/value metadata and all queued tensors to `fname_out`, then close."""
        self.gguf_writer.write_header_to_file(path=self.fname_out)
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file(progress=True)
        self.gguf_writer.close()

    @staticmethod
    def undo_permute(weights: Tensor, n_head: int, n_head_kv: int):
        """Re-order the rows of a Q/K projection tensor, head by head.

        Dimension 0 is viewed as (heads, 2, rows_per_head / 2), axes 1 and 2 are
        swapped, and the result is reshaped back to the original shape. When
        `n_head_kv` differs from `n_head`, `n_head_kv` is used as the head count.
        NOTE(review): presumably this converts between the two rotary-embedding
        row layouts (split halves vs. interleaved pairs) -- confirm against the
        RoPE mode used by the C++ decoder.
        """
        if n_head_kv is not None and n_head != n_head_kv:
            n_head = n_head_kv
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))
def parse_args() -> argparse.Namespace:
    """Parse the converter's command line.

    Returns:
        Namespace with `outfile` (Path), `outtype` (one of f32/f16/bf16/q8_0),
        `bigendian` (bool), `model` (str) and `verbose` (bool).
    """
    parser = argparse.ArgumentParser(
        description="Convert Mimi safetensors model to GGUF",)
    parser.add_argument(
        "--outfile", type=Path, default="kyutai-mimi.gguf",
        help="path to write to",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
        help="output format",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    # Kept as a plain string on purpose: the value may be a Hugging Face model ID
    # such as "kyutai/mimi". Routing it through pathlib.Path would rewrite the
    # separator on Windows ("kyutai\\mimi") and break the hub lookup. Local
    # directories work unchanged when passed as strings.
    parser.add_argument(
        "model", type=str,
        help="directory or model ID containing model file (if model ID is specified, download from Hugging Face hub)",
        nargs="?",
        default="kyutai/mimi",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )

    # `model` is optional with a default, so it can never be None here
    return parser.parse_args()


def main() -> None:
    """Entry point: parse arguments, convert the model and write the GGUF file."""
    args = parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    dir_model = args.model

    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
    }

    logger.info(f"Loading model: {dir_model}")

    # no gradients are needed for a pure weight conversion
    with torch.inference_mode():
        converter = MimiModelConverter(
            pretrained_model_name_or_path=dir_model,
            fname_out=args.outfile,
            ftype=ftype_map[args.outtype],
            is_big_endian=args.bigendian,
        )
        converter.write()


if __name__ == '__main__':
    main()
/**
 * Implementation of Kyutai's Mimi model using GGML.
 * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp
 *
 * NOTE: only decoder is working for now.
 *
 * Background:
 * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc
 * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times
 *
 * How it works?
 * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code
 * 2. The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale
 * 3. The upscaled code is passed to transformer, it converts N frames to N frames
 * 4. The output embeddings are then passed to SEANet (mimi_encoder_decoder) to get the final waveform
 * 5. Waveform is written to a file
 */

// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json
//
// Hyper-parameters are hard-coded here instead of being read from the GGUF
// file; the single global instance `mimi_config` is read directly by the
// graph-building code in this file.
struct mimi_config_t {
    bool causal = true;                  // selects causal (left-only) padding in the conv helpers
    int max_position_embeddings = 8000;
    int num_hidden_layers = 8;           // number of transformer layers loaded by main()
    int n_embd = 512;
    int n_ffn = 2048;
    int n_head = 8;
    int n_head_kv = 8;
    int n_rot = 64;
    float norm_eps = 1e-5;               // epsilon passed to ggml_norm
    float rope_theta = 10000.0f;
    int sliding_window = 250;
    std::array<int, 4> upsampling_ratio = {8, 6, 5, 4};   // stride of each transposed conv in the SEANet decoder
    std::array<int, 4> downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio
    // vector quantizer
    float frame_rate = 12.5;
    int audio_channels = 1;
    int codebook_size = 2048;
    int codebook_dim = 256;              // width of the accumulated codebook embeddings
    int n_semantic_components = 1;       // codebooks in the semantic RVQ
    int n_acoustic_components = 31;      // codebooks in the acoustic RVQ
    // decode
    float trim_right_ratio = 1.0f;       // share of transposed-conv padding trimmed from the right (causal mode)
} mimi_config;
ggml_backend_sched_ptr sched; + + ggml_cgraph * gf = nullptr; + std::vector buf_compute_meta; + int max_nodes = 16 * 1024; + + std::unordered_map tensors; + + mimi_ggml_ctx() { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + auto buft = ggml_backend_get_default_buffer_type(backend); + sched.reset( + ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false) + ); + buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + } + + void load_gguf(const char * fname) { + ggml_context * meta = nullptr; + + gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf = gguf_init_from_file(fname, params); + + // load tensors + const int n_tensors = gguf_get_n_tensors(ctx_gguf); + + std::vector read_buf; + ggml_init_params ggml_params = { + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ctx_data = ggml_init(ggml_params); + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + ggml_free(meta); + throw std::runtime_error("cannot open model file for loading tensors"); + } + + // add tensors to context + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * t = ggml_get_tensor(meta, name); + ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); + ggml_set_name(cur, name); + tensors.insert({name, cur}); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); + // printf("%s: Loading tensor \"%s\"\n", __func__, 
name); + fin.seekg(offset, std::ios::beg); + if (!fin) { + ggml_free(meta); + throw std::runtime_error(string_format("failed to seek for tensor: %s", name)); + } + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); + fin.close(); + + ggml_free(meta); + } + + /** + * Build a cgraph using the given builder function. + * + * The built cgraph will be stored in `ctx.gf` + */ + void build_graph(std::function builder_fn) { + ggml_free(ctx_gf); + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_gf = ggml_init(params); + ggml_backend_sched_reset(sched.get()); + gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); + + builder_fn(ctx_gf, gf); + ggml_backend_sched_alloc_graph(sched.get(), gf); + } + + ggml_status compute() { + ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); + return status; + } + + void set_tensor_data(const std::string & name, const void * data) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); + } + + std::pair> get_tensor_data(const std::string & name) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + return 
std::make_pair(t, data); + } + + ggml_tensor * get_weight(const char *fmt, ...) { + std::vector str(128); + va_list va; + va_start(va, fmt); + vsnprintf(str.data(), 128, fmt, va); + va_end(va); + auto it = tensors.find(str.data()); + if (it == tensors.end()) { + throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); + } + return it->second; + } + + ~mimi_ggml_ctx() { + ggml_free(ctx_data); + gguf_free(ctx_gguf); + ggml_backend_buffer_free(buf); + } +}; + +/////////////////////////////////////////////////////////////////////////// +// extension to ggml.h +// TODO: add these ops to the library (ofc with a more optimized kernel) + + +// mode: (0) constant, (1) reflect, (2) replicate, (3) circular +// value is only used in "constant" +// only "constant" with 0.0f and "replicate" are implemented here +static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, + int64_t pad_left, int64_t pad_right, float value = 0.0f) { + GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f + GGML_ASSERT(mode == 0 || mode == 2); + if (pad_left > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, tmp, x, 0); + } + if (pad_right > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + int64_t last = x->ne[0] - 1; + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, x, tmp, 0); + } + return x; +} + + + + +/////////////////////////////////////////////////////////////////////////// 
///////////////////////////////////////////////////////////////////////////
// MimiConv and MimiConvTranspose

// Integer division rounded toward +infinity: ceil(a / b), for b != 0.
//
// C++ integer division truncates toward zero, so the quotient only needs to
// be bumped when the division is inexact AND the exact result is positive
// (remainder and divisor have the same sign). Without the sign check a
// negative numerator is rounded the wrong way, e.g. -3 / 2 would give 0
// instead of -1; that case is reachable when an input is shorter than the
// convolution stride.
static int64_t div_ceil(int64_t a, int64_t b) {
    const int64_t quot = a / b;
    const int64_t rem  = a % b;
    const bool round_up = rem != 0 && ((rem < 0) == (b < 0));
    return quot + (round_up ? 1 : 0);
}
(float)p_total / mimi_config.trim_right_ratio + : p_total / 2; + int64_t p_left = p_total - p_right; + + ggml_tensor * out = nullptr; + + if (depthwise) { + for (int64_t ir = 0; ir < n_rows; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, x, + x->ne[0], ir*x->ne[0]*ggml_element_size(x)); + ggml_tensor * krn = ggml_view_1d(ctx0, kernel, + kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); + row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); + // unpad (remove p_right and p_left columns) + row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); + + // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc + out = out ? ggml_concat(ctx0, out, row, 1) : row; + } + + } else { + out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); + // unpad + out = ggml_view_2d(ctx0, out, + out->ne[0] - p_total, out->ne[1], + out->nb[1], p_left*ggml_element_size(out)); + } + + if (bias) { + out = ggml_add(ctx0, out, bias); + } + + return out; +} + + + +/////////////////////////////////////////////////////////////////////////// + +// based on MimiEncoder +// SEANet encoder as used by Mimi. 
+struct mimi_encoder_decoder { + mimi_ggml_ctx & ctx; + struct layer { + bool is_elu = false; + bool is_resnet = false; + bool is_transposed_conv = false; + ggml_tensor * conv_0_w; + ggml_tensor * conv_0_b; + ggml_tensor * conv_1_w; + ggml_tensor * conv_1_b; + int stride = 1; + }; + std::vector layers; + + std::array repeated_pattern = {1, 4, 7, 10}; + + mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) { + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), + }); + for (int i = 0; i < (int)repeated_pattern.size(); ++i) { + int i_start = repeated_pattern[i]; + // upsampling layers + layers.push_back({ + .is_elu = true, // layer (i_start) + }); + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), + .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), + .stride = mimi_config.upsampling_ratio[i], + .is_transposed_conv = true, + }); + // residual layers + layers.push_back({ + .is_resnet = true, + .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), + .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), + .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), + .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), + }); + } + layers.push_back({ + .is_elu = true, // layer 13 + }); + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), + }); + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) { + ggml_tensor * x = input; + + for (auto & layer : layers) { + if (layer.is_elu) { + x = ggml_elu(ctx0, x); + } else if (layer.is_resnet) { + ggml_tensor * residual = x; + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, 
layer.conv_1_w, layer.conv_1_b, 1, 1); + x = ggml_add(ctx0, x, residual); + } else { + x = layer.is_transposed_conv + ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false) + : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1); + } + } + + return x; + } +}; + +struct mimi_transformer { + struct layer { + ggml_tensor * inp_norm_w; + ggml_tensor * inp_norm_b; + + ggml_tensor * attn_q; + ggml_tensor * attn_k; + ggml_tensor * attn_v; + ggml_tensor * attn_o; + ggml_tensor * attn_post_norm_w; + ggml_tensor * attn_post_norm_b; + ggml_tensor * attn_layer_scale; + + ggml_tensor * ffn_up; + ggml_tensor * ffn_down; + ggml_tensor * mlp_layer_scale; + }; + std::vector layers; + + mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) { + for (int il = 0; il < n_layers; il++) { + layers.push_back({ + .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il), + .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il), + + .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il), + .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il), + .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il), + .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il), + .attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il), + .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il), + .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il), + + .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il), + .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il), + .mlp_layer_scale = 
ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il), + }); + } + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) { + int n_tokens = input->ne[1]; + ggml_tensor * x = input; + + auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { + x = ggml_norm(ctx0, x, mimi_config.norm_eps); + x = ggml_mul(ctx0, x, w); + x = ggml_add(ctx0, x, b); + return x; + }; + + ggml_tensor * residual = input; + + for (auto & layer : layers) { + residual = x; + + // input layer norm + x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); + + // self attention + { + ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x); + ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); + ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x); + + int n_embd_head = mimi_config.n_embd / mimi_config.n_head; + q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens); + + int n_rot = n_embd_head; + q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0); + q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); + + k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0); + k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp + kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head)); + ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens); + kq = ggml_soft_max_inplace(ctx0, kq_masked); + + v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head); + kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens); + + x = ggml_mul_mat(ctx0, 
layer.attn_o, kqv); + } + + // residual + x = ggml_mul(ctx0, x, layer.attn_layer_scale); + x = ggml_add(ctx0, x, residual); + + residual = x; + x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); + + // mlp + { + x = ggml_mul_mat(ctx0, layer.ffn_up, x); + x = ggml_gelu(ctx0, x); + x = ggml_mul_mat(ctx0, layer.ffn_down, x); + } + + // residual + x = ggml_mul(ctx0, x, layer.mlp_layer_scale); + x = ggml_add(ctx0, x, residual); + } + + return x; + } +}; + +struct mimi_residual_vector_quantizer { + struct component { + ggml_tensor * codebook; + }; + + ggml_tensor * semantic_inp_proj; + std::vector semantic_components; + ggml_tensor * semantic_out_proj; + + ggml_tensor * acoustic_inp_proj; + std::vector acoustic_components; + ggml_tensor * acoustic_out_proj; + + mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) { + semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight"); + semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_semantic_components; i++) { + semantic_components.push_back({ + .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i), + }); + } + acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); + acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_acoustic_components; i++) { + acoustic_components.push_back({ + .codebook = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i), + }); + } + } + + // the input has shape [n_codes, n_codes_per_embd] + // first row is semantic, the rest are acoustic + // example: [ [semantic], [acoustic1], [acoustic2], ... 
] + ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) { + GGML_ASSERT(input->type == GGML_TYPE_I32); + + size_t n_semantic = semantic_components.size(); + int64_t n_codes_per_embd = (n_semantic + acoustic_components.size()); + int64_t n_codes = input->ne[0] / n_codes_per_embd; + + GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0); + + ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + out_s = ggml_scale(ctx0, out_s, 0.0f); // clear + out_a = ggml_scale(ctx0, out_a, 0.0f); // clear + + for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input)); + if (ir < n_semantic) { + // semantic + ggml_tensor * codebook = semantic_components[ir].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_s = ggml_add(ctx0, out_s, embd); + } else { + // acoustic + ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_a = ggml_add(ctx0, out_a, embd); + } + } + + out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s); + out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a); + + return ggml_add(ctx0, out_s, out_a); + } +}; + + + +/////////////////////////////////////////////////////////////////////////// +// main program + +int main(int argc, const char ** argv) { + if (argc < 3) { + fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]); + fprintf(stderr, " Format of codes.txt file: one code per line\n"); + fprintf(stderr, " Replace codes.txt with dummy0 and dummy1 for testing\n"); + fprintf(stderr, " dummy0: using code 1, 2, 3,..., 96, used for logits matching\n"); + fprintf(stderr, " dummy1: using code that will outputs 'hey hello there' sound\n"); + return 1; + } + + const char * model_path = argv[1]; + const char * 
codes_path = argv[2]; + const char * out_path = argc < 4 ? "output.wav" : argv[3]; + + mimi_ggml_ctx ctx; + ctx.load_gguf(model_path); + + // initialize components + mimi_encoder_decoder decoder(ctx); + mimi_transformer transformer(ctx, "decoder", mimi_config.num_hidden_layers); + mimi_residual_vector_quantizer quantizer(ctx); + + // load codes + std::vector codes; + if (strcmp(codes_path, "dummy0") == 0) { + printf("Using dummy0 codes\n"); + codes.resize(32 * 3); // [n_codes = 3, n_codes_per_embd = 32] + int n = 0; + for (int c = 0; c < 32; c++) { + for (int r = 0; r < 3; r++) { + codes[r*32 + c] = n++; + } + } + } else if (strcmp(codes_path, "dummy1") == 0) { + printf("Using dummy1 codes\n"); + codes = { + 1263 ,1597 ,1596 ,1477 ,1540 ,1720 ,1433 ,118 ,1066 ,1968 ,1096 ,232 ,418 ,566 ,1653 ,2010 , + 1029 ,1874 ,77 ,1803 ,123 ,908 ,97 ,1616 ,595 ,1170 ,1654 ,1211 ,1967 ,1579 ,1846 ,1462 , + 1962 ,175 ,1539 ,742 ,1065 ,1226 ,19 ,955 ,528 ,1031 ,659 ,1687 ,1173 ,1802 ,1031 ,1714 , + 1986 ,582 ,367 ,112 ,1245 ,1386 ,759 ,532 ,1472 ,1790 ,802 ,1213 ,1543 ,1916 ,1251 ,309 , + 1962 ,1280 ,1943 ,878 ,1588 ,1989 ,568 ,1463 ,1814 ,1095 ,103 ,583 ,976 ,998 ,871 ,587 , + 247 ,1698 ,1817 ,1024 ,268 ,597 ,45 ,1608 ,1880 ,2047 ,759 ,1578 ,1612 ,49 ,1031 ,1076 , + 927 ,1202 ,1601 ,1719 ,1670 ,412 ,568 ,1838 ,341 ,1265 ,1279 ,830 ,1997 ,32 ,1369 ,1686 , + 1307 ,419 ,1143 ,324 ,325 ,572 ,1597 ,1920 ,795 ,915 ,610 ,2000 ,819 ,718 ,1235 ,282 , + 1912 ,1911 ,141 ,1069 ,1485 ,642 ,1370 ,732 ,284 ,1407 ,1591 ,1002 ,939 ,671 ,951 ,1411 , + 1887 ,460 ,1588 ,1636 ,1312 ,232 ,969 ,1513 ,1336 ,1185 ,1660 ,4 ,926 ,1243 ,1077 ,1379 , + 704 ,85 ,257 ,1302 ,1029 ,1717 ,899 ,1345 ,355 ,1915 ,1007 ,315 ,1283 ,779 ,415 ,335 , + 1848 ,1786 ,469 ,295 ,380 ,1736 ,393 ,765 ,1921 ,836 ,374 ,1649 ,52 ,1633 ,759 ,548 , + 1922 ,47 ,564 ,893 ,34 ,131 ,1063 ,1657 ,474 ,1960 ,1255 ,1275 ,92 ,976 ,1217 ,483 , + 105 ,1746 ,1158 ,1557 ,1001 ,512 ,1668 ,1255 ,1045 ,1596 ,613 ,1272 ,1366 ,1147 ,411 ,831 , + 349 
,692 ,1435 ,2005 ,1465 ,37 ,892 ,95 ,460 ,557 ,1315 ,259 ,1978 ,1838 ,1232 ,2003 , + 1197 ,111 ,1953 ,1297 ,1843 ,671 ,1687 ,91 ,1788 ,1138 ,1896 ,399 ,615 ,758 ,1423 ,365 , + 288 ,632 ,876 ,875 ,1156 ,345 ,1189 ,638 ,1527 ,1981 ,1925 ,333 ,1353 ,473 ,1913 ,1443 , + 1634 ,1373 ,803 ,420 ,192 ,1440 ,1593 ,1925 ,784 ,831 ,552 ,807 ,1942 ,1289 ,612 ,511 , + 968 ,1091 ,30 ,828 ,1611 ,1241 ,1985 ,596 ,273 ,529 ,1182 ,302 ,726 ,1942 ,733 ,1590 , + 1564 ,214 ,1156 ,1722 ,1215 ,1837 ,1729 ,1823 ,672 ,116 ,340 ,396 ,721 ,462 ,1615 ,1380 , + 1459 ,1553 ,636 ,586 ,1148 ,1147 ,1941 ,471 ,876 ,127 ,1938 ,2002 ,1563 ,1121 ,857 ,1179 , + 1983 ,1324 ,1726 ,1445 ,295 ,270 ,896 ,1947 ,1740 ,1211 ,128 ,1266 ,734 ,715 ,1562 ,285 , + 1139 ,304 ,526 ,653 ,1270 ,320 ,484 ,22 ,687 ,1065 ,489 ,827 ,993 ,1654 ,431 ,1552 , + 1418 ,1604 ,455 ,841 ,412 ,848 ,475 ,540 ,1903 ,575 ,584 ,300 ,1079 ,189 ,1481 ,893 , + 228 ,1577 ,429 ,635 ,106 ,1536 ,176 ,348 ,1733 ,1570 ,537 ,1840 ,798 ,410 ,1714 ,1318 , + 487 ,332 ,1109 ,1744 ,283 ,692 ,681 ,1744 ,1008 ,1715 ,1956 ,1066 ,1768 ,1645 ,139 ,1967 , + 897 ,132 ,1010 ,1932 ,277 ,1536 ,1541 ,952 ,19 ,88 ,1663 ,1232 ,1681 ,1878 ,1241 ,1805 , + 89 ,1401 ,544 ,1061 ,1166 ,267 ,1351 ,1998 ,1623 ,1898 ,425 ,1320 ,2006 ,865 ,1981 ,823 , + 1243 ,471 ,485 ,1765 ,391 ,1281 ,1607 ,1418 ,116 ,1702 ,1725 ,512 ,1088 ,1375 ,1994 ,1738 , + 725 ,1471 ,811 ,1251 ,1156 ,1664 ,898 ,1511 ,1872 ,1717 ,444 ,1005 ,254 ,103 ,202 ,1769 , + 1511 ,433 ,284 ,721 ,1741 ,56 ,615 ,916 ,887 ,1253 ,916 ,535 ,1666 ,1713 ,741 ,873 , + 447 ,492 ,388 ,321 ,1860 ,1456 ,1658 ,1682 ,848 ,462 ,2034 ,1368 ,1609 ,1887 ,510 ,1516 , + }; + } else { + std::ifstream fin(codes_path); + if (!fin) { + fprintf(stderr, "Error: cannot open codes file: %s\n", codes_path); + return 1; + } + std::string line; + while (std::getline(fin, line)) { + // Skip empty lines + if (line.empty()) continue; + try { + int code = std::stoi(line); + codes.push_back(code); + } catch (const std::exception& e) { + 
fprintf(stderr, "Error parsing code: %s\n", line.c_str()); + return 1; + } + } + if (codes.empty()) { + fprintf(stderr, "Error: no codes found in file: %s\n", codes_path); + return 1; + } + + printf("Loaded %d codes from %s\n", (int)codes.size(), codes_path); + } + + // build cgraph + int n_pos = -1; + int n_codes = codes.size(); + int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd"); + + ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) { + ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); + ggml_set_name(inp_dec, "inp_dec"); + ggml_set_input(inp_dec); + + // RVQ + ggml_tensor * embeddings = quantizer.decode(ctx_gf, inp_dec); + + // upsample + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx.get_weight("upsample.conv.weight"), nullptr, 2, 1, true); + + // transformer + n_pos = embeddings->ne[0]; + ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_dec, "pos_dec"); + ggml_set_input(pos_dec); + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = transformer.forward(ctx_gf, embeddings, pos_dec); + + // SEANET decoder + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + ggml_tensor * output = decoder.forward(ctx_gf, embeddings); + + ggml_set_name(output, "output"); + ggml_set_output(output); + ggml_build_forward_expand(gf, output); + }); + + // position data + std::vector pos_data(1024); + for (int i = 0; i < (int)pos_data.size(); i++) { + pos_data[i] = i; + } + ctx.set_tensor_data("pos_dec", pos_data.data()); + + // code data (need to transpose it) + // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes] + std::vector codes_t(n_codes_per_embd * n_codes); + for (int i = 0; i < n_codes / n_codes_per_embd; 
i++) { + for (int j = 0; j < n_codes_per_embd; j++) { + int src_idx = i * n_codes_per_embd + j; + int dst_idx = j * (n_codes / n_codes_per_embd) + i; + codes_t[dst_idx] = codes[src_idx]; + } + } + ctx.set_tensor_data("inp_dec", codes_t.data()); + + ctx.compute(); + + auto output = ctx.get_tensor_data("output"); + auto output_tensor = output.first; + auto output_data = output.second; + printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]); + + // print first 20 values + for (int i = 0; i < 20; i++) { + printf("%2.4f, ", ((float *)output_data.data())[i]); + } + printf("...\n"); + + // write to wav + std::vector wav_data(output_data.size() / sizeof(float)); + for (size_t i = 0; i < wav_data.size(); i++) { + wav_data[i] = ((float *)output_data.data())[i]; + } + printf("Writing to %s\n", out_path); + save_wav16(out_path, wav_data, 24000); +} From efeaa5712cb6489b9a704daf670d043e5e758347 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 09:06:00 +0100 Subject: [PATCH 02/31] fix llama-tts --- examples/tts/tts.cpp | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 4cc42e1674ccc..b3461b5d273ef 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -71,46 +71,6 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -struct wav_header { - char riff[4] = {'R', 'I', 'F', 'F'}; - uint32_t chunk_size; - char wave[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - uint32_t fmt_chunk_size = 16; - uint16_t audio_format = 1; // PCM - uint16_t num_channels = 1; // Mono - uint32_t sample_rate; - uint32_t byte_rate; - uint16_t block_align; - uint16_t bits_per_sample = 16; - char data[4] = {'d', 'a', 't', 'a'}; - uint32_t data_size; -}; - -static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { - std::ofstream file(fname, std::ios::binary); - if (!file) { - LOG_ERR("%s: Failed to 
open file '%s' for writing.\n", __func__, fname.c_str()); - return false; - } - - wav_header header; - header.sample_rate = sample_rate; - header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); - header.block_align = header.num_channels * (header.bits_per_sample / 8); - header.data_size = data.size() * (header.bits_per_sample / 8); - header.chunk_size = 36 + header.data_size; - - file.write(reinterpret_cast(&header), sizeof(header)); - - for (const auto & sample : data) { - int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); - file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); - } - - return file.good(); -} - static void fill_hann_window(int length, bool periodic, float * output) { int offset = -1; if (periodic) { From a98f19918d7e6cff600d1bf0db15ea9cb9bff0da Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 09:51:10 +0100 Subject: [PATCH 03/31] put mimi_model into a shared header --- examples/tts/CMakeLists.txt | 2 +- examples/tts/README-mimi.md | 2 +- examples/tts/mimi-model.cpp | 720 ++++++++++++++++++++++++++++++++++++ examples/tts/mimi-model.h | 32 ++ examples/tts/mimi.cpp | 677 +-------------------------------- 5 files changed, 762 insertions(+), 671 deletions(-) create mode 100644 examples/tts/mimi-model.cpp create mode 100644 examples/tts/mimi-model.h diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index f76d834b18fec..39e0a92c5acb4 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -5,7 +5,7 @@ target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-mimi) -add_executable(${TARGET} mimi.cpp) +add_executable(${TARGET} mimi.cpp mimi-model.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff 
--git a/examples/tts/README-mimi.md b/examples/tts/README-mimi.md index b46f5f77b95d0..6576a118291ad 100644 --- a/examples/tts/README-mimi.md +++ b/examples/tts/README-mimi.md @@ -24,7 +24,7 @@ cmake --build build -j --target llama-mimi # output: output.wav -# alternatively, use "dummy1" to get a "hey hello there" sample output file +# alternatively, use "dummy1" to get a "wah hello there" sample output file ./build/bin/llama-mimi kyutai-mimi.gguf dummy1 ``` diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp new file mode 100644 index 0000000000000..31ff86256ae10 --- /dev/null +++ b/examples/tts/mimi-model.cpp @@ -0,0 +1,720 @@ +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-cpu.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" + +#include "common.h" +#include "mimi-model.h" + +#include +#include +#include +#include +#include +#include +#include + +/** + * Implementation of Kyutai's Mimi model using GGML. + * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp + * + * NOTE: only decoder is working for now. + * + * Background: + * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc + * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times + * + * How it works? + * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code + * 2. The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale + * 3. The upscaled code is passed to transformer, it converts N frames to N frames + * 4. The output embeddings is then passed to SEANet (mimi_encoder_decoder) to get the final waveform + * 5. 
Waveform is written to a file + */ + +// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json +struct mimi_config_t { + bool causal = true; + int sample_rate = 24000; + int max_position_embeddings = 8000; + int num_hidden_layers = 8; + int n_embd = 512; + int n_ffn = 2048; + int n_head = 8; + int n_head_kv = 8; + int n_rot = 64; + float norm_eps = 1e-5; + float rope_theta = 10000.0f; + int sliding_window = 250; + std::array upsampling_ratio = {8, 6, 5, 4}; + std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio + // vector quantizer + float frame_rate = 12.5; + int audio_channels = 1; + int codebook_size = 2048; + int codebook_dim = 256; + int n_semantic_components = 1; + int n_acoustic_components = 31; + // decode + float trim_right_ratio = 1.0f; + int n_codes_per_frame = (sliding_window / 2) * (n_semantic_components + n_acoustic_components); +} mimi_config; + +// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h +struct mimi_ggml_ctx { + gguf_context * ctx_gguf = nullptr; + ggml_context * ctx_data = nullptr; + ggml_context * ctx_gf = nullptr; + + // CPU-only for now, as many kernels are missing and we actually get less performance with GPU + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buf = nullptr; + ggml_backend_sched_ptr sched; + + ggml_cgraph * gf = nullptr; + std::vector buf_compute_meta; + int max_nodes = 16 * 1024; + + std::unordered_map tensors; + + mimi_ggml_ctx() { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + auto buft = ggml_backend_get_default_buffer_type(backend); + sched.reset( + ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false) + ); + buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + } + + void load_gguf(const char * fname) { + ggml_context * meta = nullptr; + + gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf = gguf_init_from_file(fname, params); + 
+ // load tensors + const int n_tensors = gguf_get_n_tensors(ctx_gguf); + + std::vector read_buf; + ggml_init_params ggml_params = { + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ctx_data = ggml_init(ggml_params); + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + ggml_free(meta); + throw std::runtime_error("cannot open model file for loading tensors"); + } + + // add tensors to context + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * t = ggml_get_tensor(meta, name); + ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); + ggml_set_name(cur, name); + tensors.insert({name, cur}); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); + // printf("%s: Loading tensor \"%s\"\n", __func__, name); + fin.seekg(offset, std::ios::beg); + if (!fin) { + ggml_free(meta); + throw std::runtime_error(string_format("failed to seek for tensor: %s", name)); + } + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); + fin.close(); + + 
ggml_free(meta); + } + + /** + * Build a cgraph using the given builder function. + * + * The built cgraph will be stored in `ctx.gf` + */ + void build_graph(std::function builder_fn) { + ggml_free(ctx_gf); + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_gf = ggml_init(params); + ggml_backend_sched_reset(sched.get()); + gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); + + builder_fn(ctx_gf, gf); + ggml_backend_sched_alloc_graph(sched.get(), gf); + } + + ggml_status compute() { + ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); + return status; + } + + void set_tensor_data(const std::string & name, const void * data) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); + } + + std::pair> get_tensor_data(const std::string & name) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + return std::make_pair(t, data); + } + + ggml_tensor * get_weight(const char *fmt, ...) 
{ + std::vector str(128); + va_list va; + va_start(va, fmt); + vsnprintf(str.data(), 128, fmt, va); + va_end(va); + auto it = tensors.find(str.data()); + if (it == tensors.end()) { + throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); + } + return it->second; + } + + ~mimi_ggml_ctx() { + ggml_free(ctx_data); + gguf_free(ctx_gguf); + ggml_backend_buffer_free(buf); + } +}; + +/////////////////////////////////////////////////////////////////////////// +// extension to ggml.h +// TODO: add these ops to the library (ofc with a more optimized kernel) + + +// mode: (0) constant, (1) reflect, (2) replicate, (3) circular +// value is only used in "constant" +// only "constant" with 0.0f and "replicate" are implemented here +static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, + int64_t pad_left, int64_t pad_right, float value = 0.0f) { + GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f + GGML_ASSERT(mode == 0 || mode == 2); + if (pad_left > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, tmp, x, 0); + } + if (pad_right > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + int64_t last = x->ne[0] - 1; + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, x, tmp, 0); + } + return x; +} + + + + +/////////////////////////////////////////////////////////////////////////// +// MimiConv and MimiConvTranspose + +static int64_t div_ceil(int64_t a, 
int64_t b) { + return a / b + (a % b ? 1 : 0); +} + +static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x, + ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) { + int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; + int64_t p_total = kernel_size - stride; // padding total + int64_t p_half = p_total / 2; + + int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride); + int64_t ideal_len = n_frames * stride + kernel_size - p_total; + int64_t p_extra = ideal_len - x->ne[0]; + + int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra; + int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half); + + x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right); + + x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation); + if (bias) { + x = ggml_add(ctx0, x, bias); + } + ggml_set_name(x, "mimi_conv_1d"); + return x; +} + +static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x, + ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) { + GGML_ASSERT(x->ne[1] == kernel->ne[2]); + int64_t n_rows = x->ne[1]; + int64_t kernel_size = kernel->ne[0]; + int64_t p_total = kernel_size - stride; // padding total + + int64_t p_right = mimi_config.causal + ? (float)p_total / mimi_config.trim_right_ratio + : p_total / 2; + int64_t p_left = p_total - p_right; + + ggml_tensor * out = nullptr; + + if (depthwise) { + for (int64_t ir = 0; ir < n_rows; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, x, + x->ne[0], ir*x->ne[0]*ggml_element_size(x)); + ggml_tensor * krn = ggml_view_1d(ctx0, kernel, + kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); + row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); + // unpad (remove p_right and p_left columns) + row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); + + // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc + out = out ? 
ggml_concat(ctx0, out, row, 1) : row; + } + + } else { + out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); + // unpad + out = ggml_view_2d(ctx0, out, + out->ne[0] - p_total, out->ne[1], + out->nb[1], p_left*ggml_element_size(out)); + } + + if (bias) { + out = ggml_add(ctx0, out, bias); + } + + return out; +} + + + +/////////////////////////////////////////////////////////////////////////// + +// based on MimiEncoder +// SEANet encoder as used by Mimi. +struct mimi_encoder_decoder { + mimi_ggml_ctx & ctx; + struct layer { + bool is_elu = false; + bool is_resnet = false; + bool is_transposed_conv = false; + ggml_tensor * conv_0_w; + ggml_tensor * conv_0_b; + ggml_tensor * conv_1_w; + ggml_tensor * conv_1_b; + int stride = 1; + }; + std::vector layers; + + std::array repeated_pattern = {1, 4, 7, 10}; + + mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) { + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), + }); + for (int i = 0; i < (int)repeated_pattern.size(); ++i) { + int i_start = repeated_pattern[i]; + // upsampling layers + layers.push_back({ + .is_elu = true, // layer (i_start) + }); + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), + .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), + .stride = mimi_config.upsampling_ratio[i], + .is_transposed_conv = true, + }); + // residual layers + layers.push_back({ + .is_resnet = true, + .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), + .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), + .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), + .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), + }); + } + layers.push_back({ + .is_elu = true, // layer 13 + }); + layers.push_back({ + .conv_0_w = 
ctx.get_weight("decoder.layers.14.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), + }); + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) { + ggml_tensor * x = input; + + for (auto & layer : layers) { + if (layer.is_elu) { + x = ggml_elu(ctx0, x); + } else if (layer.is_resnet) { + ggml_tensor * residual = x; + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1); + x = ggml_add(ctx0, x, residual); + } else { + x = layer.is_transposed_conv + ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false) + : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1); + } + } + + return x; + } +}; + +struct mimi_transformer { + struct layer { + ggml_tensor * inp_norm_w; + ggml_tensor * inp_norm_b; + + ggml_tensor * attn_q; + ggml_tensor * attn_k; + ggml_tensor * attn_v; + ggml_tensor * attn_o; + ggml_tensor * attn_post_norm_w; + ggml_tensor * attn_post_norm_b; + ggml_tensor * attn_layer_scale; + + ggml_tensor * ffn_up; + ggml_tensor * ffn_down; + ggml_tensor * mlp_layer_scale; + }; + std::vector layers; + + mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) { + for (int il = 0; il < n_layers; il++) { + layers.push_back({ + .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il), + .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il), + + .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il), + .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il), + .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il), + .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il), + .attn_post_norm_w = 
ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il), + .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il), + .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il), + + .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il), + .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il), + .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il), + }); + } + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) { + int n_tokens = input->ne[1]; + ggml_tensor * x = input; + + auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { + x = ggml_norm(ctx0, x, mimi_config.norm_eps); + x = ggml_mul(ctx0, x, w); + x = ggml_add(ctx0, x, b); + return x; + }; + + ggml_tensor * residual = input; + + for (auto & layer : layers) { + residual = x; + + // input layer norm + x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); + + // self attention + { + ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x); + ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); + ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x); + + int n_embd_head = mimi_config.n_embd / mimi_config.n_head; + q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens); + + int n_rot = n_embd_head; + q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0); + q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); + + k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0); + k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp + kq = 
ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head)); + ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens); + kq = ggml_soft_max_inplace(ctx0, kq_masked); + + v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head); + kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens); + + x = ggml_mul_mat(ctx0, layer.attn_o, kqv); + } + + // residual + x = ggml_mul(ctx0, x, layer.attn_layer_scale); + x = ggml_add(ctx0, x, residual); + + residual = x; + x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); + + // mlp + { + x = ggml_mul_mat(ctx0, layer.ffn_up, x); + x = ggml_gelu(ctx0, x); + x = ggml_mul_mat(ctx0, layer.ffn_down, x); + } + + // residual + x = ggml_mul(ctx0, x, layer.mlp_layer_scale); + x = ggml_add(ctx0, x, residual); + } + + return x; + } +}; + +struct mimi_residual_vector_quantizer { + struct component { + ggml_tensor * codebook; + }; + + ggml_tensor * semantic_inp_proj; + std::vector semantic_components; + ggml_tensor * semantic_out_proj; + + ggml_tensor * acoustic_inp_proj; + std::vector acoustic_components; + ggml_tensor * acoustic_out_proj; + + mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) { + semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight"); + semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_semantic_components; i++) { + semantic_components.push_back({ + .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i), + }); + } + acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); + acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_acoustic_components; i++) { + acoustic_components.push_back({ + .codebook = 
ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i), + }); + } + } + + // the input has shape [n_codes, n_codes_per_embd] + // first row is semantic, the rest are acoustic + // example: [ [semantic], [acoustic1], [acoustic2], ... ] + ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) { + GGML_ASSERT(input->type == GGML_TYPE_I32); + + size_t n_semantic = semantic_components.size(); + int64_t n_codes_per_embd = (n_semantic + acoustic_components.size()); + int64_t n_codes = input->ne[0] / n_codes_per_embd; + + GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0); + + ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + out_s = ggml_scale(ctx0, out_s, 0.0f); // clear + out_a = ggml_scale(ctx0, out_a, 0.0f); // clear + + for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input)); + if (ir < n_semantic) { + // semantic + ggml_tensor * codebook = semantic_components[ir].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_s = ggml_add(ctx0, out_s, embd); + } else { + // acoustic + ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_a = ggml_add(ctx0, out_a, embd); + } + } + + out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s); + out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a); + + return ggml_add(ctx0, out_s, out_a); + } +}; + + +mimi_model::mimi_model(const char * fname, bool verbose) : verbose(verbose) { + ctx.reset(new mimi_ggml_ctx()); + ctx->load_gguf(fname); + + // initialize components + seanet_dec .reset(new mimi_encoder_decoder(*ctx)); + transformer_dec.reset(new mimi_transformer(*ctx, "decoder", mimi_config.num_hidden_layers)); + quantizer .reset(new mimi_residual_vector_quantizer(*ctx)); +} + 
+mimi_model::~mimi_model() {
+}
+
+std::vector<float> mimi_model::decode_frame(const std::vector<int> & codes, int & n_past) {
+    // build cgraph
+    int n_pos = -1;
+    int n_codes = codes.size();
+    int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
+    GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd");
+
+    ctx->build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) {
+        ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes);
+        ggml_set_name(inp_dec, "inp_dec");
+        ggml_set_input(inp_dec);
+
+        // RVQ
+        ggml_tensor * embeddings = quantizer->decode(ctx_gf, inp_dec);
+
+        // upsample
+        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
+        embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx->get_weight("upsample.conv.weight"), nullptr, 2, 1, true);
+
+        // transformer
+        n_pos = embeddings->ne[0];
+        ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos);
+        ggml_set_name(pos_dec, "pos_dec");
+        ggml_set_input(pos_dec);
+        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
+        embeddings = transformer_dec->forward(ctx_gf, embeddings, pos_dec);
+
+        // SEANET decoder
+        embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
+        ggml_tensor * output = seanet_dec->forward(ctx_gf, embeddings);
+
+        ggml_set_name(output, "output");
+        ggml_set_output(output);
+        ggml_build_forward_expand(gf, output);
+    });
+
+    // position data
+    GGML_ASSERT(n_pos <= mimi_config.sliding_window);
+    std::vector<int> pos_data(n_pos);
+    for (int i = 0; i < (int)pos_data.size(); i++) {
+        pos_data[i] = i + n_past;
+    }
+    n_past += n_pos;
+    if (verbose) {
+        printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past);
+    }
+    ctx->set_tensor_data("pos_dec", pos_data.data());
+
+    // code data (need to transpose it)
+    // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes]
+    std::vector<int> codes_t(n_codes_per_embd * n_codes);
+    for (int i = 0; i < n_codes / n_codes_per_embd; i++) {
+        for (int j = 0; j < n_codes_per_embd; j++) {
+            int src_idx = i * n_codes_per_embd + j;
+            int dst_idx = j * (n_codes / n_codes_per_embd) + i;
+            codes_t[dst_idx] = codes[src_idx];
+        }
+    }
+    ctx->set_tensor_data("inp_dec", codes_t.data());
+
+    ctx->compute();
+
+    auto output = ctx->get_tensor_data("output");
+    // auto output_tensor = output.first;
+    auto output_data = output.second;
+    // printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]);
+
+    std::vector<float> wav_data(output_data.size() / sizeof(float));
+    for (size_t i = 0; i < wav_data.size(); i++) {
+        wav_data[i] = ((float *)output_data.data())[i];
+    }
+
+    return wav_data;
+}
+
+std::vector<float> mimi_model::decode(const std::vector<int> & codes) {
+    std::vector<float> output;
+
+    if (verbose) {
+        printf("%s: n_codes: %zu\n", __func__, codes.size());
+    }
+
+    int64_t t_start = ggml_time_ms();
+    int n_frames = 0;
+
+    int n_past = 0;
+    for (size_t i = 0; i < codes.size(); i += mimi_config.n_codes_per_frame) {
+        size_t remaining = std::min((size_t)mimi_config.n_codes_per_frame, codes.size() - i);
+        std::vector<int> frame(codes.begin() + i, codes.begin() + i + remaining);
+
+        auto wav_data = decode_frame(frame, n_past);
+        output.insert(output.end(), wav_data.begin(), wav_data.end());
+
+        n_frames++;
+    }
+
+    int64_t t_end = ggml_time_ms();
+    if (verbose) {
+        printf("%s: n_frames: %d, time: %" PRId64 "ms, per_frame: %" PRId64 "ms\n", __func__, n_frames, t_end - t_start, (t_end - t_start) / std::max(n_frames, 1)); // n_frames is 0 for empty input: avoid dividing by zero
+    }
+
+    return output;
+}
+
+int mimi_model::get_sample_rate() const {
+    return mimi_config.sample_rate;
+}
diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h
new file mode 100644
index 0000000000000..d48c19b5476e3
--- /dev/null
+++ b/examples/tts/mimi-model.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "ggml.h"
+#include <memory>
+#include <vector>
+
+struct mimi_ggml_ctx;
+struct mimi_encoder_decoder;
+struct mimi_transformer;
+struct mimi_residual_vector_quantizer;
+
+struct mimi_model {
+    bool verbose = false;
+    std::unique_ptr<mimi_ggml_ctx> ctx;
+
+    std::unique_ptr<mimi_encoder_decoder> seanet_dec;
+    std::unique_ptr<mimi_transformer> transformer_dec;
+    std::unique_ptr<mimi_residual_vector_quantizer> quantizer;
+
+    mimi_model(const char * fname, bool verbose = false);
+    ~mimi_model();
+
+    int get_sample_rate() const;
+
+    std::vector<float> decode(const std::vector<int> & codes);
+
+    // TODO: implement encoding pass
+    // std::vector<int> encode(const std::vector<float> & wav_data);
+
+private:
+    std::vector<float> decode_frame(const std::vector<int> & codes, int & n_past);
+};
diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 2c5833faa277b..052f546b43a23 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -1,610 +1,17 @@ -#include "ggml.h" -#include "ggml-cpp.h" -#include "ggml-cpu.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "gguf.h" - #include "common.h" +#include "mimi-model.h" -#include #include -#include #include -#include -#include - -/** - * Implementation of Kyutai's Mimi model using GGML. - * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp - * - * NOTE: only decoder is working for now. - * - * Background: - * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc - * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times - * - * How it works? - * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code - * 2. The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale - * 3. The upscaled code is passed to transformer, it converts N frames to N frames - * 4. The output embeddings is then passed to SEANet (mimi_encoder_decoder) to get the final waveform - * 5.
Waveform is written to a file - */ - -// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json -struct mimi_config_t { - bool causal = true; - int max_position_embeddings = 8000; - int num_hidden_layers = 8; - int n_embd = 512; - int n_ffn = 2048; - int n_head = 8; - int n_head_kv = 8; - int n_rot = 64; - float norm_eps = 1e-5; - float rope_theta = 10000.0f; - int sliding_window = 250; - std::array upsampling_ratio = {8, 6, 5, 4}; - std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio - // vector quantizer - float frame_rate = 12.5; - int audio_channels = 1; - int codebook_size = 2048; - int codebook_dim = 256; - int n_semantic_components = 1; - int n_acoustic_components = 31; - // decode - float trim_right_ratio = 1.0f; -} mimi_config; - -// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h -struct mimi_ggml_ctx { - gguf_context * ctx_gguf = nullptr; - ggml_context * ctx_data = nullptr; - ggml_context * ctx_gf = nullptr; - - // CPU-only for now, as many kernels are missing and we actually get less performance with GPU - ggml_backend_t backend = nullptr; - ggml_backend_buffer_t buf = nullptr; - ggml_backend_sched_ptr sched; - - ggml_cgraph * gf = nullptr; - std::vector buf_compute_meta; - int max_nodes = 16 * 1024; - - std::unordered_map tensors; - - mimi_ggml_ctx() { - backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - auto buft = ggml_backend_get_default_buffer_type(backend); - sched.reset( - ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false) - ); - buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); - } - - void load_gguf(const char * fname) { - ggml_context * meta = nullptr; - - gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, - }; - - ctx_gguf = gguf_init_from_file(fname, params); - - // load tensors - const int n_tensors = gguf_get_n_tensors(ctx_gguf); - - std::vector read_buf; - ggml_init_params 
ggml_params = { - /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - ctx_data = ggml_init(ggml_params); - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - ggml_free(meta); - throw std::runtime_error("cannot open model file for loading tensors"); - } - - // add tensors to context - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - ggml_tensor * t = ggml_get_tensor(meta, name); - ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); - ggml_set_name(cur, name); - tensors.insert({name, cur}); - } - - // alloc memory and offload data - ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); - buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); - // printf("%s: Loading tensor \"%s\"\n", __func__, name); - fin.seekg(offset, std::ios::beg); - if (!fin) { - ggml_free(meta); - throw std::runtime_error(string_format("failed to seek for tensor: %s", name)); - } - int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buft_is_host(buft)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } - printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); - fin.close(); - - ggml_free(meta); - } - - /** - * Build a cgraph using the given builder function. 
- * - * The built cgraph will be stored in `ctx.gf` - */ - void build_graph(std::function builder_fn) { - ggml_free(ctx_gf); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx_gf = ggml_init(params); - ggml_backend_sched_reset(sched.get()); - gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); - - builder_fn(ctx_gf, gf); - ggml_backend_sched_alloc_graph(sched.get(), gf); - } - - ggml_status compute() { - ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); - return status; - } - - void set_tensor_data(const std::string & name, const void * data) { - ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); - if (!t) { - throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); - } - ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); - } - - std::pair> get_tensor_data(const std::string & name) { - ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); - if (!t) { - throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); - } - std::vector data(ggml_nbytes(t)); - ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); - return std::make_pair(t, data); - } - - ggml_tensor * get_weight(const char *fmt, ...) 
{ - std::vector str(128); - va_list va; - va_start(va, fmt); - vsnprintf(str.data(), 128, fmt, va); - va_end(va); - auto it = tensors.find(str.data()); - if (it == tensors.end()) { - throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); - } - return it->second; - } - - ~mimi_ggml_ctx() { - ggml_free(ctx_data); - gguf_free(ctx_gguf); - ggml_backend_buffer_free(buf); - } -}; - -/////////////////////////////////////////////////////////////////////////// -// extension to ggml.h -// TODO: add these ops to the library (ofc with a more optimized kernel) - - -// mode: (0) constant, (1) reflect, (2) replicate, (3) circular -// value is only used in "constant" -// only "constant" with 0.0f and "replicate" are implemented here -static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, - int64_t pad_left, int64_t pad_right, float value = 0.0f) { - GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f - GGML_ASSERT(mode == 0 || mode == 2); - if (pad_left > 0) { - ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); - if (mode == 0) { - tmp = ggml_scale(ctx0, tmp, value); - } else if (mode == 2) { - ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column - tmp = ggml_repeat(ctx0, elem, tmp); - } - x = ggml_concat(ctx0, tmp, x, 0); - } - if (pad_right > 0) { - ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); - if (mode == 0) { - tmp = ggml_scale(ctx0, tmp, value); - } else if (mode == 2) { - int64_t last = x->ne[0] - 1; - ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column - tmp = ggml_repeat(ctx0, elem, tmp); - } - x = ggml_concat(ctx0, x, tmp, 0); - } - return x; -} - - - - -/////////////////////////////////////////////////////////////////////////// -// MimiConv and MimiConvTranspose - -static int64_t div_ceil(int64_t a, 
int64_t b) { - return a / b + (a % b ? 1 : 0); -} - -static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x, - ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) { - int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; - int64_t p_total = kernel_size - stride; // padding total - int64_t p_half = p_total / 2; - - int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride); - int64_t ideal_len = n_frames * stride + kernel_size - p_total; - int64_t p_extra = ideal_len - x->ne[0]; - - int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra; - int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half); - - x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right); - - x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation); - if (bias) { - x = ggml_add(ctx0, x, bias); - } - ggml_set_name(x, "mimi_conv_1d"); - return x; -} - -static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x, - ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) { - GGML_ASSERT(x->ne[1] == kernel->ne[2]); - int64_t n_rows = x->ne[1]; - int64_t kernel_size = kernel->ne[0]; - int64_t p_total = kernel_size - stride; // padding total - - int64_t p_right = mimi_config.causal - ? (float)p_total / mimi_config.trim_right_ratio - : p_total / 2; - int64_t p_left = p_total - p_right; - - ggml_tensor * out = nullptr; - - if (depthwise) { - for (int64_t ir = 0; ir < n_rows; ir++) { - ggml_tensor * row = ggml_view_1d(ctx0, x, - x->ne[0], ir*x->ne[0]*ggml_element_size(x)); - ggml_tensor * krn = ggml_view_1d(ctx0, kernel, - kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); - row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); - // unpad (remove p_right and p_left columns) - row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); - - // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc - out = out ? 
ggml_concat(ctx0, out, row, 1) : row; - } - - } else { - out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); - // unpad - out = ggml_view_2d(ctx0, out, - out->ne[0] - p_total, out->ne[1], - out->nb[1], p_left*ggml_element_size(out)); - } - - if (bias) { - out = ggml_add(ctx0, out, bias); - } - - return out; -} - -/////////////////////////////////////////////////////////////////////////// - -// based on MimiEncoder -// SEANet encoder as used by Mimi. -struct mimi_encoder_decoder { - mimi_ggml_ctx & ctx; - struct layer { - bool is_elu = false; - bool is_resnet = false; - bool is_transposed_conv = false; - ggml_tensor * conv_0_w; - ggml_tensor * conv_0_b; - ggml_tensor * conv_1_w; - ggml_tensor * conv_1_b; - int stride = 1; - }; - std::vector layers; - - std::array repeated_pattern = {1, 4, 7, 10}; - - mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) { - layers.push_back({ - .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), - .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), - }); - for (int i = 0; i < (int)repeated_pattern.size(); ++i) { - int i_start = repeated_pattern[i]; - // upsampling layers - layers.push_back({ - .is_elu = true, // layer (i_start) - }); - layers.push_back({ - .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), - .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), - .stride = mimi_config.upsampling_ratio[i], - .is_transposed_conv = true, - }); - // residual layers - layers.push_back({ - .is_resnet = true, - .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), - .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), - .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), - .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), - }); - } - layers.push_back({ - .is_elu = true, // layer 13 - }); - layers.push_back({ - .conv_0_w = 
ctx.get_weight("decoder.layers.14.conv.weight"), - .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), - }); - } - - ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) { - ggml_tensor * x = input; - - for (auto & layer : layers) { - if (layer.is_elu) { - x = ggml_elu(ctx0, x); - } else if (layer.is_resnet) { - ggml_tensor * residual = x; - x = ggml_elu(ctx0, x); - x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); - x = ggml_elu(ctx0, x); - x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1); - x = ggml_add(ctx0, x, residual); - } else { - x = layer.is_transposed_conv - ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false) - : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1); - } - } - - return x; - } -}; - -struct mimi_transformer { - struct layer { - ggml_tensor * inp_norm_w; - ggml_tensor * inp_norm_b; - - ggml_tensor * attn_q; - ggml_tensor * attn_k; - ggml_tensor * attn_v; - ggml_tensor * attn_o; - ggml_tensor * attn_post_norm_w; - ggml_tensor * attn_post_norm_b; - ggml_tensor * attn_layer_scale; - - ggml_tensor * ffn_up; - ggml_tensor * ffn_down; - ggml_tensor * mlp_layer_scale; - }; - std::vector layers; - - mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) { - for (int il = 0; il < n_layers; il++) { - layers.push_back({ - .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il), - .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il), - - .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il), - .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il), - .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il), - .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il), - .attn_post_norm_w = 
ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il), - .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il), - .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il), - - .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il), - .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il), - .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il), - }); - } - } - - ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) { - int n_tokens = input->ne[1]; - ggml_tensor * x = input; - - auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { - x = ggml_norm(ctx0, x, mimi_config.norm_eps); - x = ggml_mul(ctx0, x, w); - x = ggml_add(ctx0, x, b); - return x; - }; - - ggml_tensor * residual = input; - - for (auto & layer : layers) { - residual = x; - - // input layer norm - x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); - - // self attention - { - ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x); - ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); - ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x); - - int n_embd_head = mimi_config.n_embd / mimi_config.n_head; - q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens); - k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens); - v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens); - - int n_rot = n_embd_head; - q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0); - q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); - - k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0); - k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); - - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp - kq = 
ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head)); - ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens); - kq = ggml_soft_max_inplace(ctx0, kq_masked); - - v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); - - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head); - kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens); - - x = ggml_mul_mat(ctx0, layer.attn_o, kqv); - } - - // residual - x = ggml_mul(ctx0, x, layer.attn_layer_scale); - x = ggml_add(ctx0, x, residual); - - residual = x; - x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); - - // mlp - { - x = ggml_mul_mat(ctx0, layer.ffn_up, x); - x = ggml_gelu(ctx0, x); - x = ggml_mul_mat(ctx0, layer.ffn_down, x); - } - - // residual - x = ggml_mul(ctx0, x, layer.mlp_layer_scale); - x = ggml_add(ctx0, x, residual); - } - - return x; - } -}; - -struct mimi_residual_vector_quantizer { - struct component { - ggml_tensor * codebook; - }; - - ggml_tensor * semantic_inp_proj; - std::vector semantic_components; - ggml_tensor * semantic_out_proj; - - ggml_tensor * acoustic_inp_proj; - std::vector acoustic_components; - ggml_tensor * acoustic_out_proj; - - mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) { - semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight"); - semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight"); - for (int i = 0; i < mimi_config.n_semantic_components; i++) { - semantic_components.push_back({ - .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i), - }); - } - acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); - acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); - for (int i = 0; i < mimi_config.n_acoustic_components; i++) { - acoustic_components.push_back({ - .codebook = 
ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i), - }); - } - } - - // the input has shape [n_codes, n_codes_per_embd] - // first row is semantic, the rest are acoustic - // example: [ [semantic], [acoustic1], [acoustic2], ... ] - ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) { - GGML_ASSERT(input->type == GGML_TYPE_I32); - - size_t n_semantic = semantic_components.size(); - int64_t n_codes_per_embd = (n_semantic + acoustic_components.size()); - int64_t n_codes = input->ne[0] / n_codes_per_embd; - - GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0); - - ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); - ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); - out_s = ggml_scale(ctx0, out_s, 0.0f); // clear - out_a = ggml_scale(ctx0, out_a, 0.0f); // clear - - for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) { - ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input)); - if (ir < n_semantic) { - // semantic - ggml_tensor * codebook = semantic_components[ir].codebook; - ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); - out_s = ggml_add(ctx0, out_s, embd); - } else { - // acoustic - ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook; - ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); - out_a = ggml_add(ctx0, out_a, embd); - } - } - - out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s); - out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a); - - return ggml_add(ctx0, out_s, out_a); - } -}; - - - -/////////////////////////////////////////////////////////////////////////// -// main program - int main(int argc, const char ** argv) { if (argc < 3) { fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]); fprintf(stderr, " Format of codes.txt file: one code per line\n"); fprintf(stderr, " Replace codes.txt with dummy0 and dummy1 for testing\n"); 
fprintf(stderr, " dummy0: using code 1, 2, 3,..., 96, used for logits matching\n"); - fprintf(stderr, " dummy1: using code that will outputs 'hey hello there' sound\n"); + fprintf(stderr, " dummy1: using code that will outputs 'wah hello there' sound\n"); return 1; } @@ -612,14 +19,6 @@ int main(int argc, const char ** argv) { const char * codes_path = argv[2]; const char * out_path = argc < 4 ? "output.wav" : argv[3]; - mimi_ggml_ctx ctx; - ctx.load_gguf(model_path); - - // initialize components - mimi_encoder_decoder decoder(ctx); - mimi_transformer transformer(ctx, "decoder", mimi_config.num_hidden_layers); - mimi_residual_vector_quantizer quantizer(ctx); - // load codes std::vector codes; if (strcmp(codes_path, "dummy0") == 0) { @@ -693,78 +92,18 @@ int main(int argc, const char ** argv) { printf("Loaded %d codes from %s\n", (int)codes.size(), codes_path); } - // build cgraph - int n_pos = -1; - int n_codes = codes.size(); - int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; - GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd"); - - ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) { - ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); - ggml_set_name(inp_dec, "inp_dec"); - ggml_set_input(inp_dec); - - // RVQ - ggml_tensor * embeddings = quantizer.decode(ctx_gf, inp_dec); - - // upsample - embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); - embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx.get_weight("upsample.conv.weight"), nullptr, 2, 1, true); - - // transformer - n_pos = embeddings->ne[0]; - ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_dec, "pos_dec"); - ggml_set_input(pos_dec); - embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); - embeddings = transformer.forward(ctx_gf, embeddings, pos_dec); - - // SEANET decoder - embeddings = 
ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); - ggml_tensor * output = decoder.forward(ctx_gf, embeddings); - - ggml_set_name(output, "output"); - ggml_set_output(output); - ggml_build_forward_expand(gf, output); - }); - - // position data - std::vector pos_data(1024); - for (int i = 0; i < (int)pos_data.size(); i++) { - pos_data[i] = i; - } - ctx.set_tensor_data("pos_dec", pos_data.data()); - - // code data (need to transpose it) - // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes] - std::vector codes_t(n_codes_per_embd * n_codes); - for (int i = 0; i < n_codes / n_codes_per_embd; i++) { - for (int j = 0; j < n_codes_per_embd; j++) { - int src_idx = i * n_codes_per_embd + j; - int dst_idx = j * (n_codes / n_codes_per_embd) + i; - codes_t[dst_idx] = codes[src_idx]; - } - } - ctx.set_tensor_data("inp_dec", codes_t.data()); - - ctx.compute(); - - auto output = ctx.get_tensor_data("output"); - auto output_tensor = output.first; - auto output_data = output.second; - printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]); + mimi_model model(model_path, true); + std::vector wav_data = model.decode(codes); // print first 20 values + printf("Number of output samples: %d\n", (int)wav_data.size()); + printf("First 20 samples:\n"); for (int i = 0; i < 20; i++) { - printf("%2.4f, ", ((float *)output_data.data())[i]); + printf("%2.4f, ", wav_data[i]); } printf("...\n"); // write to wav - std::vector wav_data(output_data.size() / sizeof(float)); - for (size_t i = 0; i < wav_data.size(); i++) { - wav_data[i] = ((float *)output_data.data())[i]; - } printf("Writing to %s\n", out_path); - save_wav16(out_path, wav_data, 24000); + save_wav16(out_path, wav_data, model.get_sample_rate()); } From 891273cf3a678ea4fb4845c35f60af49360b0dbf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 13:08:42 +0100 Subject: [PATCH 04/31] mimi : non-transposed input codes --- examples/tts/mimi-model.cpp | 14 +++---- 
examples/tts/mimi-model.h | 1 + examples/tts/mimi.cpp | 78 +++++++++++++++++++------------------ 3 files changed, 48 insertions(+), 45 deletions(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 31ff86256ae10..92bb47a8365d7 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -24,7 +24,8 @@ * * Background: * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc - * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times + * - Audio codes must be in the order: N semantic codes followed by (N*31) acoustic codes + * (In other words, input matrix has shape 32 cols x N rows) * * How it works? * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code @@ -653,23 +654,22 @@ std::vector mimi_model::decode_frame(const std::vector & codes, int for (int i = 0; i < (int)pos_data.size(); i++) { pos_data[i] = i + n_past; } - n_past += n_pos; if (verbose) { printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past); } + n_past += n_pos; ctx->set_tensor_data("pos_dec", pos_data.data()); - // code data (need to transpose it) - // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes] - std::vector codes_t(n_codes_per_embd * n_codes); + // code data + /*std::vector codes_t(n_codes_per_embd * n_codes); for (int i = 0; i < n_codes / n_codes_per_embd; i++) { for (int j = 0; j < n_codes_per_embd; j++) { int src_idx = i * n_codes_per_embd + j; int dst_idx = j * (n_codes / n_codes_per_embd) + i; codes_t[dst_idx] = codes[src_idx]; } - } - ctx->set_tensor_data("inp_dec", codes_t.data()); + }*/ + ctx->set_tensor_data("inp_dec", codes.data()); ctx->compute(); diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h index d48c19b5476e3..c26fd3bc08e9f 100644 --- a/examples/tts/mimi-model.h +++ b/examples/tts/mimi-model.h @@ -22,6 +22,7 @@ struct mimi_model { int get_sample_rate() const; + // layout 
of codes: N semantic codes followed by (N*31) acoustic codes std::vector decode(const std::vector & codes); // TODO: implement encoding pass diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 052f546b43a23..421c9e418ecc6 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -5,6 +5,11 @@ #include +/** + * This file is used for testing and showcase how to use "mimi_model" class. + * Please keep it simple and easy to understand. + */ + int main(int argc, const char ** argv) { if (argc < 3) { fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]); @@ -23,48 +28,45 @@ int main(int argc, const char ** argv) { std::vector codes; if (strcmp(codes_path, "dummy0") == 0) { printf("Using dummy0 codes\n"); - codes.resize(32 * 3); // [n_codes = 3, n_codes_per_embd = 32] - int n = 0; - for (int c = 0; c < 32; c++) { - for (int r = 0; r < 3; r++) { - codes[r*32 + c] = n++; - } + codes.resize(32 * 3); // [n_codes_per_embd = 32, n_codes = 3] + for (int i = 0; i < (int)codes.size(); i++) { + codes[i] = i; } } else if (strcmp(codes_path, "dummy1") == 0) { printf("Using dummy1 codes\n"); codes = { - 1263 ,1597 ,1596 ,1477 ,1540 ,1720 ,1433 ,118 ,1066 ,1968 ,1096 ,232 ,418 ,566 ,1653 ,2010 , - 1029 ,1874 ,77 ,1803 ,123 ,908 ,97 ,1616 ,595 ,1170 ,1654 ,1211 ,1967 ,1579 ,1846 ,1462 , - 1962 ,175 ,1539 ,742 ,1065 ,1226 ,19 ,955 ,528 ,1031 ,659 ,1687 ,1173 ,1802 ,1031 ,1714 , - 1986 ,582 ,367 ,112 ,1245 ,1386 ,759 ,532 ,1472 ,1790 ,802 ,1213 ,1543 ,1916 ,1251 ,309 , - 1962 ,1280 ,1943 ,878 ,1588 ,1989 ,568 ,1463 ,1814 ,1095 ,103 ,583 ,976 ,998 ,871 ,587 , - 247 ,1698 ,1817 ,1024 ,268 ,597 ,45 ,1608 ,1880 ,2047 ,759 ,1578 ,1612 ,49 ,1031 ,1076 , - 927 ,1202 ,1601 ,1719 ,1670 ,412 ,568 ,1838 ,341 ,1265 ,1279 ,830 ,1997 ,32 ,1369 ,1686 , - 1307 ,419 ,1143 ,324 ,325 ,572 ,1597 ,1920 ,795 ,915 ,610 ,2000 ,819 ,718 ,1235 ,282 , - 1912 ,1911 ,141 ,1069 ,1485 ,642 ,1370 ,732 ,284 ,1407 ,1591 ,1002 ,939 ,671 ,951 ,1411 , - 1887 ,460 ,1588 ,1636 ,1312 
,232 ,969 ,1513 ,1336 ,1185 ,1660 ,4 ,926 ,1243 ,1077 ,1379 , - 704 ,85 ,257 ,1302 ,1029 ,1717 ,899 ,1345 ,355 ,1915 ,1007 ,315 ,1283 ,779 ,415 ,335 , - 1848 ,1786 ,469 ,295 ,380 ,1736 ,393 ,765 ,1921 ,836 ,374 ,1649 ,52 ,1633 ,759 ,548 , - 1922 ,47 ,564 ,893 ,34 ,131 ,1063 ,1657 ,474 ,1960 ,1255 ,1275 ,92 ,976 ,1217 ,483 , - 105 ,1746 ,1158 ,1557 ,1001 ,512 ,1668 ,1255 ,1045 ,1596 ,613 ,1272 ,1366 ,1147 ,411 ,831 , - 349 ,692 ,1435 ,2005 ,1465 ,37 ,892 ,95 ,460 ,557 ,1315 ,259 ,1978 ,1838 ,1232 ,2003 , - 1197 ,111 ,1953 ,1297 ,1843 ,671 ,1687 ,91 ,1788 ,1138 ,1896 ,399 ,615 ,758 ,1423 ,365 , - 288 ,632 ,876 ,875 ,1156 ,345 ,1189 ,638 ,1527 ,1981 ,1925 ,333 ,1353 ,473 ,1913 ,1443 , - 1634 ,1373 ,803 ,420 ,192 ,1440 ,1593 ,1925 ,784 ,831 ,552 ,807 ,1942 ,1289 ,612 ,511 , - 968 ,1091 ,30 ,828 ,1611 ,1241 ,1985 ,596 ,273 ,529 ,1182 ,302 ,726 ,1942 ,733 ,1590 , - 1564 ,214 ,1156 ,1722 ,1215 ,1837 ,1729 ,1823 ,672 ,116 ,340 ,396 ,721 ,462 ,1615 ,1380 , - 1459 ,1553 ,636 ,586 ,1148 ,1147 ,1941 ,471 ,876 ,127 ,1938 ,2002 ,1563 ,1121 ,857 ,1179 , - 1983 ,1324 ,1726 ,1445 ,295 ,270 ,896 ,1947 ,1740 ,1211 ,128 ,1266 ,734 ,715 ,1562 ,285 , - 1139 ,304 ,526 ,653 ,1270 ,320 ,484 ,22 ,687 ,1065 ,489 ,827 ,993 ,1654 ,431 ,1552 , - 1418 ,1604 ,455 ,841 ,412 ,848 ,475 ,540 ,1903 ,575 ,584 ,300 ,1079 ,189 ,1481 ,893 , - 228 ,1577 ,429 ,635 ,106 ,1536 ,176 ,348 ,1733 ,1570 ,537 ,1840 ,798 ,410 ,1714 ,1318 , - 487 ,332 ,1109 ,1744 ,283 ,692 ,681 ,1744 ,1008 ,1715 ,1956 ,1066 ,1768 ,1645 ,139 ,1967 , - 897 ,132 ,1010 ,1932 ,277 ,1536 ,1541 ,952 ,19 ,88 ,1663 ,1232 ,1681 ,1878 ,1241 ,1805 , - 89 ,1401 ,544 ,1061 ,1166 ,267 ,1351 ,1998 ,1623 ,1898 ,425 ,1320 ,2006 ,865 ,1981 ,823 , - 1243 ,471 ,485 ,1765 ,391 ,1281 ,1607 ,1418 ,116 ,1702 ,1725 ,512 ,1088 ,1375 ,1994 ,1738 , - 725 ,1471 ,811 ,1251 ,1156 ,1664 ,898 ,1511 ,1872 ,1717 ,444 ,1005 ,254 ,103 ,202 ,1769 , - 1511 ,433 ,284 ,721 ,1741 ,56 ,615 ,916 ,887 ,1253 ,916 ,535 ,1666 ,1713 ,741 ,873 , - 447 ,492 ,388 ,321 ,1860 ,1456 ,1658 
,1682 ,848 ,462 ,2034 ,1368 ,1609 ,1887 ,510 ,1516 , + 1049 ,1415 ,1962 ,914 ,1372 ,704 ,1922 ,2036 ,288 ,968 ,193 ,1139 ,897 ,897 ,1243 ,1511 , + 1597 ,175 ,1280 ,1202 ,1911 ,85 ,47 ,692 ,632 ,251 ,1553 ,1735 ,1577 ,132 ,471 ,433 , + 1325 ,1539 ,1943 ,1601 ,141 ,257 ,564 ,1435 ,876 ,1096 ,636 ,61 ,1497 ,1010 ,485 ,284 , + 839 ,776 ,878 ,1719 ,1069 ,1302 ,893 ,2005 ,875 ,908 ,586 ,2001 ,186 ,1932 ,1765 ,721 , + 592 ,1046 ,1588 ,1670 ,1485 ,1141 ,34 ,1465 ,1156 ,1938 ,435 ,753 ,1418 ,277 ,391 ,1741 , + 1440 ,117 ,723 ,412 ,642 ,1717 ,131 ,37 ,345 ,112 ,1979 ,2034 ,1822 ,1536 ,1281 ,56 , + 1341 ,803 ,568 ,568 ,1370 ,1995 ,1063 ,892 ,273 ,895 ,1226 ,354 ,1726 ,1541 ,1607 ,615 , + 985 ,1499 ,1736 ,1838 ,702 ,1345 ,1657 ,511 ,1774 ,1787 ,945 ,1927 ,947 ,952 ,1418 ,916 , + 1239 ,1457 ,1021 ,341 ,284 ,882 ,474 ,1559 ,1923 ,273 ,1330 ,1406 ,1782 ,19 ,116 ,887 , + 1146 ,1307 ,983 ,1237 ,1407 ,1350 ,1960 ,1255 ,878 ,1979 ,1500 ,1939 ,1415 ,88 ,1702 ,1253 , + 1778 ,2 ,10 ,1279 ,999 ,1549 ,1049 ,373 ,1355 ,1200 ,1466 ,1009 ,75 ,2042 ,1725 ,916 , + 1636 ,1135 ,833 ,830 ,1758 ,2015 ,1275 ,1675 ,287 ,744 ,89 ,430 ,1724 ,1232 ,1692 ,535 , + 1485 ,1287 ,973 ,1815 ,314 ,2020 ,424 ,1085 ,982 ,1994 ,1563 ,1269 ,1769 ,1681 ,1082 ,1666 , + 1622 ,1039 ,1209 ,32 ,679 ,732 ,976 ,1462 ,805 ,402 ,1150 ,170 ,1529 ,2013 ,350 ,1175 , + 757 ,1124 ,1091 ,1369 ,1061 ,415 ,1217 ,1135 ,1360 ,1578 ,1205 ,1785 ,1835 ,1241 ,14 ,716 , + 480 ,716 ,681 ,1686 ,1624 ,335 ,865 ,1356 ,1688 ,307 ,366 ,541 ,1262 ,1167 ,59 ,269 , + 1899 ,1798 ,1606 ,1307 ,1549 ,1814 ,114 ,483 ,958 ,1919 ,1179 ,898 ,834 ,1526 ,386 ,447 , + 1481 ,201 ,779 ,419 ,430 ,1451 ,1000 ,156 ,1062 ,615 ,1353 ,414 ,1214 ,1487 ,882 ,32 , + 840 ,1517 ,334 ,1143 ,823 ,454 ,725 ,1298 ,1325 ,649 ,1737 ,913 ,685 ,761 ,2010 ,63 , + 1397 ,1299 ,765 ,1158 ,1809 ,1299 ,1585 ,1776 ,625 ,1539 ,830 ,1563 ,461 ,308 ,1438 ,321 , + 82 ,886 ,1836 ,325 ,1976 ,761 ,359 ,1136 ,1720 ,2036 ,904 ,719 ,526 ,1567 ,145 ,1860 , + 1565 ,1786 ,1400 ,1696 ,232 ,1736 ,512 
,518 ,1895 ,1854 ,1584 ,1393 ,1869 ,1702 ,789 ,1986 , + 116 ,521 ,150 ,1597 ,727 ,1916 ,815 ,1826 ,1382 ,653 ,1596 ,286 ,1373 ,177 ,1397 ,1009 , + 1449 ,353 ,877 ,93 ,266 ,1853 ,1255 ,872 ,1974 ,556 ,1885 ,857 ,992 ,5 ,1921 ,1849 , + 1038 ,1912 ,464 ,795 ,747 ,56 ,124 ,431 ,1868 ,609 ,855 ,1522 ,912 ,1709 ,1507 ,1062 , + 1015 ,1357 ,1487 ,4 ,253 ,1871 ,933 ,215 ,1228 ,633 ,1306 ,2024 ,1453 ,900 ,457 ,471 , + 436 ,1311 ,870 ,1032 ,134 ,984 ,1983 ,1103 ,1627 ,1627 ,414 ,1845 ,583 ,1699 ,1458 ,2018 , + 150 ,450 ,1114 ,369 ,267 ,1273 ,1136 ,1578 ,1063 ,1820 ,120 ,779 ,652 ,1266 ,1929 ,1213 , + 159 ,297 ,1703 ,819 ,93 ,247 ,1366 ,144 ,1617 ,1428 ,812 ,121 ,1637 ,1620 ,289 ,1557 , + 1414 ,971 ,476 ,1685 ,428 ,1802 ,653 ,1290 ,614 ,1663 ,1528 ,1344 ,798 ,1027 ,1305 ,990 , + 1740 ,1154 ,1839 ,912 ,731 ,602 ,1064 ,1508 ,834 ,1387 ,252 ,745 ,1034 ,1102 ,965 ,696 , + 1971 ,1729 ,666 ,282 ,1993 ,1551 ,1703 ,1124 ,1628 ,1725 ,107 ,808 ,1096 ,1753 ,500 ,677 , }; } else { std::ifstream fin(codes_path); From 6dca237b1c12a5732088001dc483e9495ed7eb16 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 18:19:56 +0100 Subject: [PATCH 05/31] tts : add sesame csm --- examples/tts/CMakeLists.txt | 6 + examples/tts/convert_csm_to_gguf.py | 330 ++++++++++++++++++++++++++++ examples/tts/tts-csm.cpp | 120 ++++++++++ src/llama-arch.cpp | 48 ++-- src/llama-arch.h | 3 + src/llama-model.cpp | 24 +- src/llama-model.h | 5 + 7 files changed, 513 insertions(+), 23 deletions(-) create mode 100644 examples/tts/convert_csm_to_gguf.py create mode 100644 examples/tts/tts-csm.cpp diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index c72bd814c3b31..17d2ea08a074e 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -3,3 +3,9 @@ add_executable(${TARGET} tts.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +set(TARGET 
llama-tts-csm) +add_executable(${TARGET} tts-csm.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/tts/convert_csm_to_gguf.py b/examples/tts/convert_csm_to_gguf.py new file mode 100644 index 0000000000000..ff91098a993ad --- /dev/null +++ b/examples/tts/convert_csm_to_gguf.py @@ -0,0 +1,330 @@ +import os +import sys +import argparse +import logging +import torch +from safetensors.torch import load_file +from typing import Union, Any, Dict +from pathlib import Path +from torch import Tensor +from huggingface_hub import hf_hub_download + +cur_path = sys.path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent.parent.parent / 'gguf-py')) +import gguf + +sys.path = cur_path + +logger = logging.getLogger("csm") + + +# This converts directly one safetensors file to 2 GGUFs +# It is easier to do this way, rather than convert to 2 smaller HF models and then convert to GGUF +# This is because the Sesame model does not have built-in tokenizer + +def get_field_data(reader: gguf.GGUFReader, key: str) -> Any: + field = reader.get_field(key) + return field.contents() if field else None + +# copied from https://github.com/SesameAILabs/csm/blob/main/models.py +class Llama_3_2_1B: + vocab_size=128_256 + num_layers=16 + num_heads=32 + num_kv_heads=8 + embed_dim=2048 + max_seq_len=2048 + intermediate_dim=8192 + attn_dropout=0.0 + norm_eps=1e-5 + rope_base=500_000 + scale_factor=32 + + def write_gguf_metadata(self, fout: gguf.GGUFWriter, fvocab: gguf.GGUFReader): + arch = get_field_data(fvocab, gguf.Keys.General.ARCHITECTURE) + assert arch == "llama" + fout.add_type("model") + fout.add_block_count(self.num_layers) + fout.add_context_length(self.max_seq_len) + fout.add_feed_forward_length(self.intermediate_dim) + fout.add_embedding_length(self.embed_dim) + # attn + fout.add_head_count(self.num_heads) + 
fout.add_head_count_kv(self.num_kv_heads) + fout.add_rope_freq_base(self.rope_base) + # fout.add_rope_scaling_factor(self.scale_factor) # breaks if this is added + fout.add_rope_dimension_count(self.embed_dim // self.num_heads) + fout.add_layer_norm_rms_eps(self.norm_eps) + fout.add_key_length(self.embed_dim // self.num_heads) + fout.add_value_length(self.embed_dim // self.num_heads) + # vocab + fout.add_vocab_size(self.vocab_size) + fout.add_tokenizer_model(get_field_data(fvocab, gguf.Keys.Tokenizer.MODEL)) + fout.add_tokenizer_pre(get_field_data(fvocab, gguf.Keys.Tokenizer.PRE)) + fout.add_token_list(get_field_data(fvocab, gguf.Keys.Tokenizer.LIST)[:self.vocab_size]) + fout.add_token_types(get_field_data(fvocab, gguf.Keys.Tokenizer.TOKEN_TYPE)[:self.vocab_size]) + fout.add_token_merges(get_field_data(fvocab, gguf.Keys.Tokenizer.MERGES)) + fout.add_bos_token_id(get_field_data(fvocab, gguf.Keys.Tokenizer.BOS_ID)) + fout.add_eos_token_id(get_field_data(fvocab, gguf.Keys.Tokenizer.EOS_ID)) + +class Llama_3_2_100M(Llama_3_2_1B): + vocab_size=65_632 #128_256 + num_layers=4 + num_heads=8 + num_kv_heads=2 + embed_dim=1024 + max_seq_len=2048 + intermediate_dim=8192 + attn_dropout=0.0 + norm_eps=1e-5 + rope_base=500_000 + scale_factor=32 + +class CSMModelConverter: + state_dict: Dict[str, Tensor] + gguf_writer_backbone: gguf.GGUFWriter + gguf_writer_decoder: gguf.GGUFWriter + gguf_reader_vocab: gguf.GGUFReader + fname_out: Path + ftype: gguf.LlamaFileType + + projection_tensor: Tensor # projecting from n_embd_backbone (2048) to n_embd_decoder (1024) + + def __init__(self, + safetensors_path: Union[Path, str], + path_to_vocab_gguf: Path, + fname_out: Path, + ftype: gguf.LlamaFileType, + is_big_endian: bool,): + + if "" not in fname_out.name: + raise ValueError("Output file name must contain '' placeholder, for example: 'sesame-csm-.gguf'") + + self.state_dict = load_file(safetensors_path, device="cpu") + self.fname_out = fname_out + self.ftype = ftype + 
self.gguf_reader_vocab = gguf.GGUFReader(path_to_vocab_gguf) + endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + + # backbone + self.gguf_writer_backbone = gguf.GGUFWriter( + path=None, + arch="llama", + endianess=endianess) + + # decoder + self.gguf_writer_decoder = gguf.GGUFWriter( + path=None, + arch="llama", + endianess=endianess) + + Llama_3_2_1B().write_gguf_metadata(self.gguf_writer_backbone, self.gguf_reader_vocab) + Llama_3_2_100M().write_gguf_metadata(self.gguf_writer_decoder, self.gguf_reader_vocab) + + # get projection tensor) + for name, data_torch in self.state_dict.items(): + if name == "projection.weight": + self.projection_tensor = data_torch + break + + # load tensors + for component in ("backbone", "decoder"): + print() + print(f"Converting {component}...") + print() + for name, data_torch in self.state_dict.items(): + # convert any unsupported data types to float32 + old_dtype = data_torch.dtype + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + self.add_tensor(name, data_torch, old_dtype, component) + + def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype, component: str): + is_1d = len(data_torch.shape) == 1 + #is_embd = "_embeddings" in name + can_quantize = not is_1d #and not is_embd + data_qtype = gguf.GGMLQuantizationType.F32 + + is_backbone = False + is_decoder = False + + def rename_transformer(name: str) -> str: + # transformer + name = name.replace(".scale", ".weight") + name = name.replace("attn.k_proj", "attn_k") + name = name.replace("attn.q_proj", "attn_q") + name = name.replace("attn.v_proj", "attn_v") + name = name.replace("attn.output_proj", "attn_output") + name = name.replace("sa_norm", "attn_norm") + name = name.replace("mlp.w1", "ffn_gate") + name = name.replace("mlp.w2", "ffn_down") + name = name.replace("mlp.w3", "ffn_up") + name = name.replace("mlp_norm", "ffn_norm") + return name + + if "audio_embeddings." 
in name: + is_decoder = True + if component == "decoder": + name = name.replace("audio_embeddings.", "token_embd.") + data_torch = torch.mm(data_torch, self.projection_tensor.T) + print("Applied projection to audio_embeddings", data_torch.shape) + + elif "text_embeddings." in name: + is_backbone = True + name = name.replace("text_embeddings.", "token_embd.") + + elif "backbone." in name or "codebook0_head." in name: + is_backbone = True + name = name.replace("backbone.layers.", "blk.") + name = name.replace("backbone.norm.scale", "output_norm.weight") + name = rename_transformer(name) + + elif "decoder." in name: + is_decoder = True + name = name.replace("decoder.layers.", "blk.") + name = name.replace("decoder.norm.scale", "output_norm.weight") + name = rename_transformer(name) + + elif name == "audio_head": + is_decoder = True + name = "audio_head.weight" + + elif name == "projection.weight": + is_decoder = True + name = "inp_proj.weight" + self.projection_tensor = data_torch + + if can_quantize: + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unsupported file type: {self.ftype}") + + data = data_torch.numpy() + + try: + data = gguf.quants.quantize(data, data_qtype) + except Exception as e: + logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + if (is_backbone and component == "backbone") or (is_decoder and component == "decoder"): + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" + 
logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + if component == "backbone": + self.gguf_writer_backbone.add_tensor(name, data, raw_dtype=data_qtype) + elif component == "decoder": + self.gguf_writer_decoder.add_tensor(name, data, raw_dtype=data_qtype) + + def write(self): + self._write_single(self.gguf_writer_backbone, "backbone") + self._write_single(self.gguf_writer_decoder, "decoder") + + def _write_single(self, gguf_writer: gguf.GGUFWriter, component: str): + output_path = str(self.fname_out).replace("", component) + gguf_writer.write_header_to_file(path=Path(output_path)) + gguf_writer.write_kv_data_to_file() + gguf_writer.write_tensors_to_file(progress=True) + gguf_writer.close() + + @staticmethod + def undo_permute(weights: Tensor, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert Sesame model to GGUFs (multiple files)",) + parser.add_argument( + "--outfile", type=Path, default="sesame-csm-.gguf", + help="path to write to, the '' placeholder is required and will be replaced with 'backbone' and 'decoder'", + ) + parser.add_argument( + "--vocab", type=Path, default="models/ggml-vocab-llama-bpe.gguf", + help="path to vocab GGUF", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="path to safetensors or model ID containing model file (if model ID is specified, download from Hugging Face hub)", + nargs="?", + default="sesame/csm-1b:model.safetensors", + ) + parser.add_argument( + 
"--verbose", action="store_true", + help="increase output verbosity", + ) + + args = parser.parse_args() + if args.model is None: + parser.error("the following arguments are required: model") + return args + + +def main() -> None: + args = parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + dir_model = args.model + path_vocab = args.vocab + + dir_parts = str(dir_model).split(":") + if len(dir_parts) == 2: + try: + dir_model = Path(hf_hub_download(dir_parts[0], dir_parts[1])) + except Exception as e: + print("Error downloading model from Hugging Face hub:", e) + print() + print("Please make sure you have access to the model") + print("Hint: you may need to set HF_TOKEN by running: huggingface-cli login") + + if not path_vocab.exists(): + raise FileNotFoundError(f"Vocab file not found: {path_vocab} ; Hint: download it from https://github.com/ggml-org/llama.cpp/blob/master/models/ggml-vocab-llama-bpe.gguf") + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + } + + logger.info(f"Loading model: {dir_model}") + + with torch.inference_mode(): + converter = CSMModelConverter( + safetensors_path=dir_model, + fname_out=args.outfile, + path_to_vocab_gguf=path_vocab, + ftype=ftype_map[args.outtype], + is_big_endian=args.bigendian, + ) + converter.write() + + +if __name__ == '__main__': + main() + diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp new file mode 100644 index 0000000000000..2835265a59530 --- /dev/null +++ b/examples/tts/tts-csm.cpp @@ -0,0 +1,120 @@ +#include "llama.h" +#include "common.h" +#include "log.h" +#include "arg.h" + +#include +#include +#include + +static void print_usage(int, char ** argv) { + LOG("\nexample usage:\n"); + LOG("\n %s TODO ", argv[0]); + LOG("\n"); +} + +// greedy sampling with custom 
n_vocab +static llama_token sample_greedy(const float * logits, int n_vocab) { + llama_token max_idx = -1; + float max_val = -FLT_MAX; + for (int i = 0; i < n_vocab; ++i) { + if (logits[i] > max_val) { + max_val = logits[i]; + max_idx = i; + } + } + return max_idx; +} + +// hook to retrieve the embeddings +static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { + std::vector * embd = (std::vector *) user_data; + + if (t && strcmp(t->name, "result_norm") == 0) { + if (ask) return true; + + auto n_bytes = ggml_nbytes(t); + embd->resize(n_bytes); + ggml_backend_tensor_get(t, embd->data(), 0, n_bytes); + printf("result_norm\n"); + return true; + } + + return false; +} + +int main(int argc, char ** argv) { + common_params params; + + params.model = "sesame-csm-backbone.gguf"; + params.out_file = "output.wav"; + params.prompt = "[0]Hello from Sesame."; + + params.n_predict = 4096; + params.n_batch = 8192; + params.n_ctx = 8192; + + params.sampling.top_k = 4; + params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, }; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { + return 1; + } + + llama_backend_init(); + llama_numa_init(params.numa); + + common_params params_decoder(params); // duplicate the params + string_replace_all(params_decoder.model, "-backbone", "-decoder"); + + std::vector embd; + params.cb_eval = ggml_callback; + params.cb_eval_user_data = &embd; + common_init_result llama_backbone = common_init_from_params(params); + llama_model * model_bb = llama_backbone.model.get(); + llama_context * ctx_bb = llama_backbone.context.get(); + + //common_init_result llama_decoder = common_init_from_params(params_decoder); + //llama_model * model_dc = llama_decoder.model.get(); + //llama_context * ctx_dc = llama_decoder.context.get(); + + if (model_bb == nullptr || ctx_bb == nullptr) { + return ENOENT; + } + + const llama_vocab * vocab = llama_model_get_vocab(model_bb); + llama_tokens prompt_tokens = 
common_tokenize(vocab, params.prompt, false, true); + prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); + prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab)); + + printf("prompt tokens: \n"); + for (size_t i = 0; i < prompt_tokens.size(); ++i) { + printf("%d, ", prompt_tokens[i]); + } + printf("\n"); + + llama_batch batch = llama_batch_init(params.n_batch, 0, 1); + for (size_t i = 0; i < prompt_tokens.size(); ++i) { + common_batch_add(batch, prompt_tokens[i], i, { 0 }, false); + } + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx_bb, batch) != 0) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + return 1; + } + + //auto vocab_dc = llama_model_get_vocab(model_dc); + auto logits = llama_get_logits_ith(ctx_bb, batch.n_tokens - 1); + //printf("next tok: %d\n", sample_greedy(logits, llama_vocab_n_tokens(vocab_dc))); + for (size_t i = 0; i < 10; ++i) { + printf("%4.2f, ", logits[i]); + } + printf("next tok: %d\n", sample_greedy(logits, 65632)); + + for (size_t i = 0; i < 10; ++i) { + printf("%4.2f, ", embd[i]); + } + + return 0; +} diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 9e443d83029f5..268ef3c4f3e58 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -208,27 +208,30 @@ static const std::map> LLM_TENSOR_N { LLM_ARCH_LLAMA, { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" 
}, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_CSM_CBOOK_OUTPUT, "codebook0_head" }, + { LLM_TENSOR_CSM_AUDIO_OUTPUT, "audio_head" }, + { LLM_TENSOR_CSM_INP_PROJ, "inp_proj" }, }, }, { @@ -1570,6 +1573,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CSM_CBOOK_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CSM_AUDIO_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CSM_INP_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, }; 
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index 39e3a2ce0565c..c43e8e7c62c3c 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -347,6 +347,9 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_CSM_CBOOK_OUTPUT, + LLM_TENSOR_CSM_AUDIO_OUTPUT, + LLM_TENSOR_CSM_INP_PROJ, }; enum llm_tensor_layer { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a4f06112d2842..845b901b3e8a1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1622,6 +1622,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } + // csm sesame model + { + // TODO: maybe store these in gguf metadata + int64_t csm_audio_cbook_size = 2051; // audio codebook size + int64_t csm_acoustic_tokens = 31; // == number of acoutic tokens for Mimi + int64_t csm_backbone_n_embd = 2048; // used by decoder (n_embd_decoder != n_embd_backbone) + csm_output_cbook = create_tensor(tn(LLM_TENSOR_CSM_CBOOK_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size}, TENSOR_NOT_REQUIRED); + csm_output_audio = create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {csm_audio_cbook_size, n_embd, csm_acoustic_tokens}, TENSOR_NOT_REQUIRED); + csm_input_proj = create_tensor(tn(LLM_TENSOR_CSM_INP_PROJ, "weight"), {csm_backbone_n_embd, n_embd}, TENSOR_NOT_REQUIRED); + } + for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -4265,8 +4276,17 @@ struct llm_build_llama : public llm_graph_context { cb(cur, "result_norm", -1); res->t_embd = cur; - // lm_head - cur = build_lora_mm(model.output, cur); + if (model.csm_output_cbook) { + // Sesame csm backbone + // hack: because n_cbook < n_vocab, we use the first logits for the codebook output + int64_t n_vocab = model.tok_embd->ne[1]; + int64_t n_codes = model.csm_output_cbook->ne[1]; + cur = 
build_lora_mm(model.csm_output_cbook, cur); + cur = ggml_pad(ctx0, cur, n_vocab - n_codes, 0, 0, 0); + } else { + // lm_head (normal case) + cur = build_lora_mm(model.output, cur); + } // For Granite architecture if (hparams.f_logit_scale) { diff --git a/src/llama-model.h b/src/llama-model.h index 0064d597a9613..6c368d691c0d0 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -337,6 +337,11 @@ struct llama_model { struct ggml_tensor * conv1d = nullptr; struct ggml_tensor * conv1d_b = nullptr; + // sesame csm + struct ggml_tensor * csm_output_cbook = nullptr; // backbone codebook + struct ggml_tensor * csm_output_audio = nullptr; // audio decoder output + struct ggml_tensor * csm_input_proj = nullptr; // audio decoder input projection + std::vector layers; llama_model_params params; From 2d743b6758c37b68543ebc24ba77248e5912a129 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 00:17:44 +0100 Subject: [PATCH 06/31] wip --- examples/tts/convert_csm_to_gguf.py | 32 ++-- examples/tts/tts-csm.cpp | 123 ++++++++++---- src/llama-arch.cpp | 33 +++- src/llama-arch.h | 4 +- src/llama-model.cpp | 250 +++++++++++++++++++++++++--- src/llama-model.h | 4 +- 6 files changed, 376 insertions(+), 70 deletions(-) diff --git a/examples/tts/convert_csm_to_gguf.py b/examples/tts/convert_csm_to_gguf.py index ff91098a993ad..183ea98b7076d 100644 --- a/examples/tts/convert_csm_to_gguf.py +++ b/examples/tts/convert_csm_to_gguf.py @@ -89,8 +89,6 @@ class CSMModelConverter: fname_out: Path ftype: gguf.LlamaFileType - projection_tensor: Tensor # projecting from n_embd_backbone (2048) to n_embd_decoder (1024) - def __init__(self, safetensors_path: Union[Path, str], path_to_vocab_gguf: Path, @@ -110,24 +108,18 @@ def __init__(self, # backbone self.gguf_writer_backbone = gguf.GGUFWriter( path=None, - arch="llama", + arch="llama-csm", endianess=endianess) # decoder self.gguf_writer_decoder = gguf.GGUFWriter( path=None, - arch="llama", + arch="llama-csm", 
endianess=endianess) Llama_3_2_1B().write_gguf_metadata(self.gguf_writer_backbone, self.gguf_reader_vocab) Llama_3_2_100M().write_gguf_metadata(self.gguf_writer_decoder, self.gguf_reader_vocab) - # get projection tensor) - for name, data_torch in self.state_dict.items(): - if name == "projection.weight": - self.projection_tensor = data_torch - break - # load tensors for component in ("backbone", "decoder"): print() @@ -165,10 +157,7 @@ def rename_transformer(name: str) -> str: if "audio_embeddings." in name: is_decoder = True - if component == "decoder": - name = name.replace("audio_embeddings.", "token_embd.") - data_torch = torch.mm(data_torch, self.projection_tensor.T) - print("Applied projection to audio_embeddings", data_torch.shape) + name = name.replace("audio_embeddings.", "audio_embd.") elif "text_embeddings." in name: is_backbone = True @@ -189,11 +178,18 @@ def rename_transformer(name: str) -> str: elif name == "audio_head": is_decoder = True name = "audio_head.weight" + if component == "decoder": + # add padding at the beginning so that build_lora_mm_id can be used + zero_tensor = torch.zeros(1, 1024, 2051) + data_torch = torch.cat([zero_tensor, data_torch], dim=0) + assert data_torch.shape == (32, 1024, 2051) + # then, transpose it + data_torch = data_torch.transpose(1, 2) elif name == "projection.weight": is_decoder = True - name = "inp_proj.weight" - self.projection_tensor = data_torch + is_backbone = True + name = "csm_proj.weight" if can_quantize: if self.ftype == gguf.LlamaFileType.ALL_F32: @@ -203,7 +199,9 @@ def rename_transformer(name: str) -> str: elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: data_qtype = gguf.GGMLQuantizationType.BF16 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 + # decoder is very sensitive to quantization, do not quantize it lower than F16 + data_qtype = gguf.GGMLQuantizationType.Q8_0 if component != "decoder" \ + else gguf.GGMLQuantizationType.F16 else: raise 
ValueError(f"Unsupported file type: {self.ftype}") diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 2835265a59530..34eeb4b4db4d9 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -30,13 +30,12 @@ static llama_token sample_greedy(const float * logits, int n_vocab) { static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { std::vector * embd = (std::vector *) user_data; - if (t && strcmp(t->name, "result_norm") == 0) { + if (t && (strcmp(t->name, "output_csm_proj") == 0 || strcmp(t->name, "output_audio_embd") == 0)) { if (ask) return true; - auto n_bytes = ggml_nbytes(t); - embd->resize(n_bytes); - ggml_backend_tensor_get(t, embd->data(), 0, n_bytes); - printf("result_norm\n"); + embd->resize(ggml_nelements(t)); + ggml_backend_tensor_get(t, embd->data(), 0, ggml_nbytes(t)); + // printf("%s tensor size: %lld, %lld\n", t->name, t->ne[0], t->ne[1]); return true; } @@ -54,9 +53,6 @@ int main(int argc, char ** argv) { params.n_batch = 8192; params.n_ctx = 8192; - params.sampling.top_k = 4; - params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, }; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { return 1; } @@ -64,24 +60,30 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - common_params params_decoder(params); // duplicate the params - string_replace_all(params_decoder.model, "-backbone", "-decoder"); - std::vector embd; params.cb_eval = ggml_callback; params.cb_eval_user_data = &embd; + params.warmup = false; + + common_params params_decoder(params); // duplicate the params + string_replace_all(params_decoder.model, "-backbone", "-decoder"); + common_init_result llama_backbone = common_init_from_params(params); llama_model * model_bb = llama_backbone.model.get(); llama_context * ctx_bb = llama_backbone.context.get(); - //common_init_result llama_decoder = common_init_from_params(params_decoder); - //llama_model * model_dc = 
llama_decoder.model.get(); - //llama_context * ctx_dc = llama_decoder.context.get(); + common_init_result llama_decoder = common_init_from_params(params_decoder); + llama_model * model_dc = llama_decoder.model.get(); + llama_context * ctx_dc = llama_decoder.context.get(); if (model_bb == nullptr || ctx_bb == nullptr) { return ENOENT; } + if (model_dc == nullptr || ctx_dc == nullptr) { + return ENOENT; + } + const llama_vocab * vocab = llama_model_get_vocab(model_bb); llama_tokens prompt_tokens = common_tokenize(vocab, params.prompt, false, true); prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); @@ -93,27 +95,92 @@ int main(int argc, char ** argv) { } printf("\n"); + llama_pos n_past_bb = 0; llama_batch batch = llama_batch_init(params.n_batch, 0, 1); + common_batch_clear(batch); for (size_t i = 0; i < prompt_tokens.size(); ++i) { - common_batch_add(batch, prompt_tokens[i], i, { 0 }, false); + common_batch_add(batch, prompt_tokens[i], n_past_bb++, { 0 }, false); } batch.logits[batch.n_tokens - 1] = true; - if (llama_decode(ctx_bb, batch) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); - return 1; - } + std::vector inp_past_embd(2048, 0.0f); + llama_batch batch_past_embd = llama_batch_init(1, inp_past_embd.size(), 1); - //auto vocab_dc = llama_model_get_vocab(model_dc); - auto logits = llama_get_logits_ith(ctx_bb, batch.n_tokens - 1); - //printf("next tok: %d\n", sample_greedy(logits, llama_vocab_n_tokens(vocab_dc))); - for (size_t i = 0; i < 10; ++i) { - printf("%4.2f, ", logits[i]); - } - printf("next tok: %d\n", sample_greedy(logits, 65632)); + for (int k = 0; k < 4; ++k) { + if (llama_decode(ctx_bb, k == 0 ? batch : batch_past_embd) != 0) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + return 1; + } + + auto vocab_dc = llama_model_get_vocab(model_dc); + auto logits = llama_get_logits_ith(ctx_bb, k == 0 ? 
(batch.n_tokens - 1) : 0); + // for (size_t i = 0; i < 10; ++i) { + // printf("%4.2f, ", logits[i]); + // } + // printf("\n"); + + llama_token latent_token = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); + // printf("latent_token: %d\n", latent_token); + printf("%5d, ", latent_token); + + // for (size_t i = 0; i < 10; ++i) { + // printf("%4.2f, ", embd[i]); + // } + // printf("\n"); + + + + // decode + prompt_tokens.clear(); + prompt_tokens.push_back(latent_token); + inp_past_embd = std::vector(inp_past_embd.size(), 0.0f); + { + llama_kv_self_clear(ctx_dc); + llama_batch batch_embd = llama_batch_init(1, embd.size(), 1); + llama_batch batch_token = llama_batch_init(1, 0, 1); + { + batch_embd.n_tokens = 1; + batch_embd.pos[0] = 0; + batch_embd.seq_id[0][0] = 0; + batch_embd.n_seq_id[0] = 1; + batch_embd.logits[0] = false; + memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float)); + } + llama_decode(ctx_dc, batch_embd); + + llama_token audio_token = latent_token; + for (int i = 0; i < 31; ++i) { + common_batch_clear(batch_token); + // encoder vocab is further divided into 32 codebooks, each with 2051 entries + llama_token inp_tok = audio_token + 2051*i; + common_batch_add(batch_token, inp_tok, i+1, { 0 }, true); + llama_decode(ctx_dc, batch_token); + auto logits = llama_get_logits_ith(ctx_dc, 0); + audio_token = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); + printf("%d,", audio_token); + prompt_tokens.push_back(audio_token); + + GGML_ASSERT(inp_past_embd.size() == embd.size()); + for (size_t i = 0; i < inp_past_embd.size(); ++i) { + inp_past_embd[i] += embd[i]; + } + } + printf("\n"); + + llama_batch_free(batch_embd); + llama_batch_free(batch_token); + } - for (size_t i = 0; i < 10; ++i) { - printf("%4.2f, ", embd[i]); + // prepare for the next iteration + { + batch_past_embd.n_tokens = 1; + batch_past_embd.pos[0] = n_past_bb; + batch_past_embd.seq_id[0][0] = 0; + batch_past_embd.n_seq_id[0] = 1; + batch_past_embd.logits[0] = true; + 
memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float)); + } + n_past_bb++; } return 0; diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 268ef3c4f3e58..fcdff0da60562 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -6,6 +6,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_LLAMA_CSM, "llama-csm" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_GROK, "grok" }, @@ -229,9 +230,36 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, + { + LLM_ARCH_LLAMA_CSM, // like LLM_ARCH_LLAMA, but with extra tensors for Sesame CSM + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_CSM_AUDIO_EMBD, "audio_embd" }, { LLM_TENSOR_CSM_CBOOK_OUTPUT, "codebook0_head" }, { LLM_TENSOR_CSM_AUDIO_OUTPUT, "audio_head" }, - { LLM_TENSOR_CSM_INP_PROJ, "inp_proj" }, + { 
LLM_TENSOR_CSM_PROJ, "csm_proj" }, }, }, { @@ -1573,9 +1601,10 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CSM_AUDIO_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, {LLM_TENSOR_CSM_CBOOK_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CSM_AUDIO_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CSM_INP_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CSM_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index c43e8e7c62c3c..4d39e88f0885b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -10,6 +10,7 @@ enum llm_arch { LLM_ARCH_LLAMA, + LLM_ARCH_LLAMA_CSM, LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, @@ -347,9 +348,10 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_CSM_AUDIO_EMBD, LLM_TENSOR_CSM_CBOOK_OUTPUT, LLM_TENSOR_CSM_AUDIO_OUTPUT, - LLM_TENSOR_CSM_INP_PROJ, + LLM_TENSOR_CSM_PROJ, }; enum llm_tensor_layer { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 845b901b3e8a1..73df168c31aaf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -513,7 +513,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA + || arch == LLM_ARCH_LLAMA_CSM + || arch == LLM_ARCH_DECI + || arch == LLM_ARCH_FALCON + ) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -531,6 +535,7 @@ 
void llama_model::load_hparams(llama_model_loader & ml) { // arch-specific KVs switch (arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_LLAMA_CSM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1622,17 +1627,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - // csm sesame model - { - // TODO: maybe store these in gguf metadata - int64_t csm_audio_cbook_size = 2051; // audio codebook size - int64_t csm_acoustic_tokens = 31; // == number of acoutic tokens for Mimi - int64_t csm_backbone_n_embd = 2048; // used by decoder (n_embd_decoder != n_embd_backbone) - csm_output_cbook = create_tensor(tn(LLM_TENSOR_CSM_CBOOK_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size}, TENSOR_NOT_REQUIRED); - csm_output_audio = create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {csm_audio_cbook_size, n_embd, csm_acoustic_tokens}, TENSOR_NOT_REQUIRED); - csm_input_proj = create_tensor(tn(LLM_TENSOR_CSM_INP_PROJ, "weight"), {csm_backbone_n_embd, n_embd}, TENSOR_NOT_REQUIRED); - } - for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -1676,6 +1670,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; + case LLM_ARCH_LLAMA_CSM: + { + // TODO: maybe store these in gguf metadata + int64_t csm_audio_cbook_size = 2051; // audio codebook size + int64_t csm_acoustic_tokens = 32; // equal to number of acoutic tokens for Mimi + //int64_t csm_n_audio_vocab = csm_audio_cbook_size*csm_acoustic_tokens; + + csm_output_cbook = create_tensor(tn(LLM_TENSOR_CSM_CBOOK_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size}, TENSOR_NOT_REQUIRED); + + bool is_backbone = csm_output_cbook != nullptr; + + csm_output_audio = is_backbone ? nullptr + : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size, csm_acoustic_tokens}, 0); + + tok_embd = is_backbone + ? 
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0) + : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_EMBD, "weight"), {n_embd*2, n_vocab}, 0); + + csm_proj = is_backbone + ? create_tensor(tn(LLM_TENSOR_CSM_PROJ, "weight"), {n_embd, n_embd/2}, 0) + : create_tensor(tn(LLM_TENSOR_CSM_PROJ, "weight"), {n_embd*2, n_embd}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + // output tensor is either audio or code depends on backbone / decoder + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4276,21 +4312,190 @@ struct llm_build_llama : public llm_graph_context { cb(cur, "result_norm", -1); res->t_embd = cur; + // lm_head (normal case) + cur = build_lora_mm(model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + +// llama used by Sesame CSM +struct 
llm_build_llama_csm : public llm_graph_context { + llm_build_llama_csm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + bool is_backbone = model.csm_output_cbook; + bool is_decoder = !is_backbone; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + ggml_tensor * input_embd = inpL; + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + if (is_decoder && inpL->ne[0] != hparams.n_embd) { + inpL = build_lora_mm(model.csm_proj, inpL); + } + + auto * inp_attn = build_attn_inp_kv_unified(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 
n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + if (model.csm_output_cbook) { // Sesame csm backbone - // hack: because n_cbook < n_vocab, we use the first logits for the codebook output + // 
hack: because n_cbook < n_vocab, we use the first logits for the output int64_t n_vocab = model.tok_embd->ne[1]; int64_t n_codes = model.csm_output_cbook->ne[1]; + ggml_tensor * last_h = cur; cur = build_lora_mm(model.csm_output_cbook, cur); cur = ggml_pad(ctx0, cur, n_vocab - n_codes, 0, 0, 0); - } else { - // lm_head (normal case) - cur = build_lora_mm(model.output, cur); - } - // For Granite architecture - if (hparams.f_logit_scale) { - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + // project to csm decoder dim + last_h = build_lora_mm(model.csm_proj, last_h); + cb(last_h, "output_csm_proj", -1); // use callback to retrieve the result + ggml_build_forward_expand(gf, last_h); + + } else if (model.csm_output_audio && ggml_nelements(cur)) { + // Sesame csm decoder + // hack: because n_audio < n_vocab, we use the first logits for the output + cur = build_lora_mm_id(model.csm_output_audio, cur, inp_pos); + int64_t n_vocab = model.tok_embd->ne[1]; + int64_t n_codes = cur->ne[0]; + cur = ggml_pad(ctx0, cur, n_vocab - n_codes, cur->ne[1], 0, 0); + + // also get audio embeddings, which will be passed back to backbone to keep track of generation progress + if (ubatch.token) { + cb(input_embd, "output_audio_embd", -1); + ggml_build_forward_expand(gf, input_embd); + } + + } else { + // otherwise, dummy output } cb(cur, "result_output", -1); @@ -11896,6 +12101,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_LLAMA_CSM: + { + llm = std::make_unique(*this, params, gf); + } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); @@ -12234,6 +12443,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: + case LLM_ARCH_LLAMA_CSM: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: diff --git a/src/llama-model.h b/src/llama-model.h index 
6c368d691c0d0..296f8f16e4712 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -338,9 +338,9 @@ struct llama_model { struct ggml_tensor * conv1d_b = nullptr; // sesame csm - struct ggml_tensor * csm_output_cbook = nullptr; // backbone codebook + struct ggml_tensor * csm_output_cbook = nullptr; // backbone output codebook struct ggml_tensor * csm_output_audio = nullptr; // audio decoder output - struct ggml_tensor * csm_input_proj = nullptr; // audio decoder input projection + struct ggml_tensor * csm_proj = nullptr; // to convert backbone dim to decoder dim std::vector layers; From f9162e7005469fefe9f4a73c1373003cd29aa61f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 01:03:04 +0100 Subject: [PATCH 07/31] wip --- examples/tts/tts-csm.cpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 34eeb4b4db4d9..5b0a23b2141ad 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -106,7 +106,7 @@ int main(int argc, char ** argv) { std::vector inp_past_embd(2048, 0.0f); llama_batch batch_past_embd = llama_batch_init(1, inp_past_embd.size(), 1); - for (int k = 0; k < 4; ++k) { + for (int k = 0; k < 32; ++k) { if (llama_decode(ctx_bb, k == 0 ? 
batch : batch_past_embd) != 0) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; @@ -121,7 +121,7 @@ int main(int argc, char ** argv) { llama_token latent_token = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); // printf("latent_token: %d\n", latent_token); - printf("%5d, ", latent_token); + printf("%d,", latent_token); // for (size_t i = 0; i < 10; ++i) { // printf("%4.2f, ", embd[i]); @@ -149,7 +149,9 @@ int main(int argc, char ** argv) { llama_decode(ctx_dc, batch_embd); llama_token audio_token = latent_token; - for (int i = 0; i < 31; ++i) { + int n_codes = 32; + int sum_codes = 0; + for (int i = 0; i < n_codes; ++i) { common_batch_clear(batch_token); // encoder vocab is further divided into 32 codebooks, each with 2051 entries llama_token inp_tok = audio_token + 2051*i; @@ -157,8 +159,13 @@ int main(int argc, char ** argv) { llama_decode(ctx_dc, batch_token); auto logits = llama_get_logits_ith(ctx_dc, 0); audio_token = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); - printf("%d,", audio_token); - prompt_tokens.push_back(audio_token); + + // discard last code + if (i < n_codes - 1) { + printf("%d,", audio_token); + prompt_tokens.push_back(audio_token); + sum_codes += audio_token; + } GGML_ASSERT(inp_past_embd.size() == embd.size()); for (size_t i = 0; i < inp_past_embd.size(); ++i) { @@ -169,8 +176,22 @@ int main(int argc, char ** argv) { llama_batch_free(batch_embd); llama_batch_free(batch_token); + + if (sum_codes == 0) { + return 0; // done + } } + // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb); + // for (size_t i = 0; i < inp_past_embd.size(); ++i) { + // printf("%4.4f, ", inp_past_embd[i]); + // if (i == 2) { + // printf("... 
"); + // i = inp_past_embd.size() - 4; + // } + // } + // printf("\n"); + // prepare for the next iteration { batch_past_embd.n_tokens = 1; From eae5f0e1ced91eaffc5f148147c982b3d38877e2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 10:50:35 +0200 Subject: [PATCH 08/31] add mimi_model::transpose_input --- examples/tts/mimi-model.cpp | 27 ++++++++++++++++++--------- examples/tts/mimi-model.h | 5 +++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 92bb47a8365d7..ded56ff317d63 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -617,7 +617,7 @@ std::vector mimi_model::decode_frame(const std::vector & codes, int int n_pos = -1; int n_codes = codes.size(); int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; - GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd"); + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd"); ctx->build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) { ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); @@ -661,14 +661,6 @@ std::vector mimi_model::decode_frame(const std::vector & codes, int ctx->set_tensor_data("pos_dec", pos_data.data()); // code data - /*std::vector codes_t(n_codes_per_embd * n_codes); - for (int i = 0; i < n_codes / n_codes_per_embd; i++) { - for (int j = 0; j < n_codes_per_embd; j++) { - int src_idx = i * n_codes_per_embd + j; - int dst_idx = j * (n_codes / n_codes_per_embd) + i; - codes_t[dst_idx] = codes[src_idx]; - } - }*/ ctx->set_tensor_data("inp_dec", codes.data()); ctx->compute(); @@ -715,6 +707,23 @@ std::vector mimi_model::decode(const std::vector & codes) { return output; } +std::vector mimi_model::transpose_input(const std::vector & codes) { + int n_codes = codes.size(); + int n_codes_per_embd = 
mimi_config.n_semantic_components + mimi_config.n_acoustic_components; + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd"); + + std::vector codes_T(n_codes_per_embd * n_codes); + for (int i = 0; i < n_codes / n_codes_per_embd; i++) { + for (int j = 0; j < n_codes_per_embd; j++) { + int src_idx = i * n_codes_per_embd + j; + int dst_idx = j * (n_codes / n_codes_per_embd) + i; + codes_T[dst_idx] = codes[src_idx]; + } + } + + return codes_T; +} + int mimi_model::get_sample_rate() const { return mimi_config.sample_rate; } diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h index c26fd3bc08e9f..96945981513c0 100644 --- a/examples/tts/mimi-model.h +++ b/examples/tts/mimi-model.h @@ -22,6 +22,11 @@ struct mimi_model { int get_sample_rate() const; + // transpose layout: + // - from: (1 semantic code followed by 31 acoustic codes) repeated N times + // - to: N semantic codes followed by (N*31) acoustic codes + std::vector transpose_input(const std::vector & codes); + // layout of codes: N semantic codes followed by (N*31) acoustic codes std::vector decode(const std::vector & codes); From 43bf237e3975a80fe0a52204ad08c7d43999c594 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 10:52:18 +0200 Subject: [PATCH 09/31] fix build --- examples/tts/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index 39e0a92c5acb4..371c3bbf7434d 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -8,4 +8,5 @@ set(TARGET llama-mimi) add_executable(${TARGET} mimi.cpp mimi-model.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +# for using C++ designated initializers, TODO: can be changed back to C++17 in the future +target_compile_features(${TARGET} PRIVATE cxx_std_20)
From e618405d4b9040f9536e2acc6761eac004146969 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 11:07:18 +0200 Subject: [PATCH 10/31] fix build (2) --- examples/tts/mimi-model.cpp | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index ded56ff317d63..141dd1043923b 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -15,6 +15,7 @@ #include #include #include +#include /** * Implementation of Kyutai's Mimi model using GGML. @@ -344,10 +345,10 @@ struct mimi_encoder_decoder { bool is_elu = false; bool is_resnet = false; bool is_transposed_conv = false; - ggml_tensor * conv_0_w; - ggml_tensor * conv_0_b; - ggml_tensor * conv_1_w; - ggml_tensor * conv_1_b; + ggml_tensor * conv_0_w = nullptr; + ggml_tensor * conv_0_b = nullptr; + ggml_tensor * conv_1_w = nullptr; + ggml_tensor * conv_1_b = nullptr; int stride = 1; }; std::vector layers; @@ -415,20 +416,20 @@ struct mimi_encoder_decoder { struct mimi_transformer { struct layer { - ggml_tensor * inp_norm_w; - ggml_tensor * inp_norm_b; - - ggml_tensor * attn_q; - ggml_tensor * attn_k; - ggml_tensor * attn_v; - ggml_tensor * attn_o; - ggml_tensor * attn_post_norm_w; - ggml_tensor * attn_post_norm_b; - ggml_tensor * attn_layer_scale; - - ggml_tensor * ffn_up; - ggml_tensor * ffn_down; - ggml_tensor * mlp_layer_scale; + ggml_tensor * inp_norm_w = nullptr; + ggml_tensor * inp_norm_b = nullptr; + + ggml_tensor * attn_q = nullptr; + ggml_tensor * attn_k = nullptr; + ggml_tensor * attn_v = nullptr; + ggml_tensor * attn_o = nullptr; + ggml_tensor * attn_post_norm_w = nullptr; + ggml_tensor * attn_post_norm_b = nullptr; + ggml_tensor * attn_layer_scale = nullptr; + + ggml_tensor * ffn_up = nullptr; + ggml_tensor * ffn_down = nullptr; + ggml_tensor * mlp_layer_scale = nullptr; }; std::vector layers; From e185e0ac7fb10b2b933caae2823a04f44d703967 Mon Sep 17 00:00:00 2001 From: 
Xuan Son Nguyen Date: Sun, 30 Mar 2025 11:44:34 +0200 Subject: [PATCH 11/31] fix build (3) --- examples/tts/mimi-model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 141dd1043923b..0b1fabe86088e 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -16,6 +16,7 @@ #include #include #include +#include /** * Implementation of Kyutai's Mimi model using GGML. @@ -367,10 +368,10 @@ struct mimi_encoder_decoder { .is_elu = true, // layer (i_start) }); layers.push_back({ + .is_transposed_conv = true, .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), .stride = mimi_config.upsampling_ratio[i], - .is_transposed_conv = true, }); // residual layers layers.push_back({ From ce83041ec3205b2586fca7d52ac9cef5c0ddc446 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 11:45:36 +0200 Subject: [PATCH 12/31] fix strcmp --- examples/tts/mimi.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 421c9e418ecc6..502e0150634b7 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -3,6 +3,7 @@ #include #include +#include // strcmp /** From 61d8ad6aef03879ca7193a302a0f549a40d761cb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 12:04:33 +0200 Subject: [PATCH 13/31] fix compilation on linux --- examples/tts/convert_mimi_to_gguf.py | 4 ++-- examples/tts/mimi-model.cpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/tts/convert_mimi_to_gguf.py b/examples/tts/convert_mimi_to_gguf.py index 5b44ef62103ba..5dce72a398a91 100644 --- a/examples/tts/convert_mimi_to_gguf.py +++ b/examples/tts/convert_mimi_to_gguf.py @@ -5,13 +5,13 @@ from typing import Union from pathlib import Path from torch import Tensor -from transformers import MimiModel +from transformers import 
MimiModel, PreTrainedModel logger = logging.getLogger("mimi") class MimiModelConverter: - mimi_model: MimiModel + mimi_model: PreTrainedModel gguf_writer: gguf.GGUFWriter fname_out: Path ftype: gguf.LlamaFileType diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 0b1fabe86088e..427aeff8658bf 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -17,6 +17,8 @@ #include #include #include +#include +#include /** * Implementation of Kyutai's Mimi model using GGML. From 40120540afccd23bcf31c95d720f3292097815e0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 13:02:07 +0200 Subject: [PATCH 14/31] clean up --- examples/tts/README-csm.md | 31 +++++++ examples/tts/convert_csm_to_gguf.py | 2 +- examples/tts/tts-csm.cpp | 134 ++++++++++++++++++---------- 3 files changed, 120 insertions(+), 47 deletions(-) create mode 100644 examples/tts/README-csm.md diff --git a/examples/tts/README-csm.md b/examples/tts/README-csm.md new file mode 100644 index 0000000000000..f660d8965ecbe --- /dev/null +++ b/examples/tts/README-csm.md @@ -0,0 +1,31 @@ +# Sesame CSM + +To get the GGUF: + +```sh +python examples/tts/convert_csm_to_gguf.py + +# default output files: +# sesame-csm-backbone.gguf +# sesame-csm-decoder.gguf + +# optionally, quantize it +# (lowest scheme is q8_0, it does not make sense to quantize further, quality degrades too much) +python examples/tts/convert_csm_to_gguf.py --outtype q8_0 +``` + +Compile the example: + +```sh +cmake --build build -j --target llama-tts-csm +``` + +Run the example: + +```sh +./build/bin/llama-tts-csm -m sesame-csm-backbone.gguf -p "[0]Hello world." 
+# sesame-csm-decoder.gguf will automatically be loaded +# make sure to place these 2 GGUF files in the same directory + +# output file: output.wav +``` diff --git a/examples/tts/convert_csm_to_gguf.py b/examples/tts/convert_csm_to_gguf.py index 183ea98b7076d..09b7748c2a63d 100644 --- a/examples/tts/convert_csm_to_gguf.py +++ b/examples/tts/convert_csm_to_gguf.py @@ -95,7 +95,7 @@ def __init__(self, fname_out: Path, ftype: gguf.LlamaFileType, is_big_endian: bool,): - + if "" not in fname_out.name: raise ValueError("Output file name must contain '' placeholder, for example: 'sesame-csm-.gguf'") diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 5b0a23b2141ad..b4b01331d2d22 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -6,6 +6,10 @@ #include #include #include +#include // memcpy and strcmp +#include + +// For more details on how this works, see: https://github.com/ggml-org/llama.cpp/pull/12648 static void print_usage(int, char ** argv) { LOG("\nexample usage:\n"); @@ -30,6 +34,8 @@ static llama_token sample_greedy(const float * logits, int n_vocab) { static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { std::vector * embd = (std::vector *) user_data; + // output_csm_proj is the embeddings output from backbone + // output_audio_embd is the embeddings output from decoder if (t && (strcmp(t->name, "output_csm_proj") == 0 || strcmp(t->name, "output_audio_embd") == 0)) { if (ask) return true; @@ -45,13 +51,10 @@ static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { int main(int argc, char ** argv) { common_params params; - params.model = "sesame-csm-backbone.gguf"; - params.out_file = "output.wav"; - params.prompt = "[0]Hello from Sesame."; - - params.n_predict = 4096; - params.n_batch = 8192; - params.n_ctx = 8192; + params.model = "sesame-csm-backbone.gguf"; + params.out_file = "output.wav"; + params.prompt = "[0]Hello from Sesame."; + params.n_predict = 2048; // CSM's
max trained seq length if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { return 1; @@ -66,6 +69,7 @@ int main(int argc, char ** argv) { params.warmup = false; common_params params_decoder(params); // duplicate the params + params_decoder.n_ctx = 64; // we never use more than this string_replace_all(params_decoder.model, "-backbone", "-decoder"); common_init_result llama_backbone = common_init_from_params(params); @@ -96,77 +100,114 @@ int main(int argc, char ** argv) { printf("\n"); llama_pos n_past_bb = 0; - llama_batch batch = llama_batch_init(params.n_batch, 0, 1); - common_batch_clear(batch); + llama_batch batch_prompt = llama_batch_init(params.n_batch, 0, 1); + common_batch_clear(batch_prompt); for (size_t i = 0; i < prompt_tokens.size(); ++i) { - common_batch_add(batch, prompt_tokens[i], n_past_bb++, { 0 }, false); + common_batch_add(batch_prompt, prompt_tokens[i], n_past_bb++, { 0 }, false); } - batch.logits[batch.n_tokens - 1] = true; + batch_prompt.logits[batch_prompt.n_tokens - 1] = true; + // inp_past_embd is the "squashed" embeddings from the decoder std::vector inp_past_embd(2048, 0.0f); llama_batch batch_past_embd = llama_batch_init(1, inp_past_embd.size(), 1); - for (int k = 0; k < 32; ++k) { - if (llama_decode(ctx_bb, k == 0 ? 
batch : batch_past_embd) != 0) { - LOG_ERR("%s: llama_decode() failed\n", __func__); + int64_t t_gb_start = ggml_time_ms(); // global start time + int64_t t_bb = 0; // backbone time + int64_t n_bb_gen = 0; // backbone generation count + int64_t t_dc = 0; // decoder time + int64_t n_dc_gen = 0; // decoder generation count + + bool is_stop = false; + + // backbone generation loop + for (int k = 0; k < params.n_predict; ++k) { + bool is_prompt_processing = k == 0; + + if (!is_prompt_processing) { + // generate the next RVQ semantic token + batch_past_embd.n_tokens = 1; + batch_past_embd.pos[0] = n_past_bb++; + batch_past_embd.seq_id[0][0] = 0; + batch_past_embd.n_seq_id[0] = 1; + batch_past_embd.logits[0] = true; + std::memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float)); + } + + int64_t t_bb_start = ggml_time_ms(); + if (llama_decode(ctx_bb, is_prompt_processing ? batch_prompt : batch_past_embd) != 0) { + LOG_ERR("%s: backbone llama_decode() failed\n", __func__); return 1; } + n_bb_gen++; + t_bb += ggml_time_ms() - t_bb_start; auto vocab_dc = llama_model_get_vocab(model_dc); - auto logits = llama_get_logits_ith(ctx_bb, k == 0 ? (batch.n_tokens - 1) : 0); + auto logits = llama_get_logits_ith(ctx_bb, is_prompt_processing ? 
(batch_prompt.n_tokens - 1) : 0); // for (size_t i = 0; i < 10; ++i) { // printf("%4.2f, ", logits[i]); // } // printf("\n"); - llama_token latent_token = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); - // printf("latent_token: %d\n", latent_token); - printf("%d,", latent_token); + llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); + printf("%d,", semantic_tok); // for (size_t i = 0; i < 10; ++i) { // printf("%4.2f, ", embd[i]); // } // printf("\n"); - - // decode - prompt_tokens.clear(); - prompt_tokens.push_back(latent_token); + // decoder generation loop inp_past_embd = std::vector(inp_past_embd.size(), 0.0f); { llama_kv_self_clear(ctx_dc); llama_batch batch_embd = llama_batch_init(1, embd.size(), 1); llama_batch batch_token = llama_batch_init(1, 0, 1); + + // first "token" is the latent embeddings from backbone { batch_embd.n_tokens = 1; batch_embd.pos[0] = 0; batch_embd.seq_id[0][0] = 0; batch_embd.n_seq_id[0] = 1; batch_embd.logits[0] = false; - memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float)); + std::memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float)); + } + if (llama_decode(ctx_dc, batch_embd) != 0) { + LOG_ERR("%s: decoder llama_decode(embd) failed\n", __func__); + return 1; } - llama_decode(ctx_dc, batch_embd); - - llama_token audio_token = latent_token; + + // then, decode the semantic_tok to generate acoustic tokens + llama_token tok = semantic_tok; int n_codes = 32; - int sum_codes = 0; + int sum_codes = 0; // to check if all codes are 0 for (int i = 0; i < n_codes; ++i) { common_batch_clear(batch_token); // encoder vocab is further divided into 32 codebooks, each with 2051 entries - llama_token inp_tok = audio_token + 2051*i; + llama_token inp_tok = tok + 2051*i; common_batch_add(batch_token, inp_tok, i+1, { 0 }, true); - llama_decode(ctx_dc, batch_token); + + int64_t t_bb_start = ggml_time_ms(); + if (llama_decode(ctx_dc, batch_token) != 0) { + LOG_ERR("%s: decoder 
llama_decode(token) failed\n", __func__); + return 1; + } + n_dc_gen++; + t_dc += ggml_time_ms() - t_bb_start; + + // sample the acoustic token auto logits = llama_get_logits_ith(ctx_dc, 0); - audio_token = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); + llama_token acoustic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); - // discard last code + // discard last code (only for embeddings) if (i < n_codes - 1) { - printf("%d,", audio_token); - prompt_tokens.push_back(audio_token); - sum_codes += audio_token; + printf("%d,", acoustic_tok); + tok = acoustic_tok; // next input token + sum_codes += acoustic_tok; } + // do progressive hsum of embeddings GGML_ASSERT(inp_past_embd.size() == embd.size()); for (size_t i = 0; i < inp_past_embd.size(); ++i) { inp_past_embd[i] += embd[i]; @@ -177,9 +218,8 @@ int main(int argc, char ** argv) { llama_batch_free(batch_embd); llama_batch_free(batch_token); - if (sum_codes == 0) { - return 0; // done - } + // if all codes are 0, then we are done + is_stop = sum_codes == 0; } // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb); @@ -192,17 +232,19 @@ int main(int argc, char ** argv) { // } // printf("\n"); - // prepare for the next iteration - { - batch_past_embd.n_tokens = 1; - batch_past_embd.pos[0] = n_past_bb; - batch_past_embd.seq_id[0][0] = 0; - batch_past_embd.n_seq_id[0] = 1; - batch_past_embd.logits[0] = true; - memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float)); + if (is_stop) { + break; } - n_past_bb++; } + // print timing info + printf("\ntimings:\n"); + printf(" backbone: %" PRId64 " ms, %" PRId64 " generated token (%.2f tok/s)\n", t_bb, n_bb_gen, (float)n_bb_gen*1000/(float)t_bb); + printf(" decoder: %" PRId64 " ms, %" PRId64 " generated token (%.2f tok/s)\n", t_dc, n_dc_gen, (float)n_dc_gen*1000/(float)t_dc); + printf(" total: %" PRId64 " ms\n\n", ggml_time_ms() - t_gb_start); + + llama_batch_free(batch_prompt); + llama_batch_free(batch_past_embd); + return 
0; } From 7ecce7645576854d095726810ff9787df5380f65 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 14:21:55 +0200 Subject: [PATCH 15/31] working now --- examples/tts/CMakeLists.txt | 20 ++++++++------ examples/tts/convert_mimi_to_gguf.py | 2 +- examples/tts/mimi-model.cpp | 2 +- examples/tts/tts-csm.cpp | 41 ++++++++++++++++++++++++---- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index 58a8599148bab..ab184a85ba17b 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -4,15 +4,19 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) -set(TARGET llama-tts-csm) -add_executable(${TARGET} tts-csm.cpp) +add_library(mimi-model mimi-model.h mimi-model.cpp) +target_link_libraries(mimi-model PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +# for using C++ designated initializers, TODO: can be changed back to C++17 in the future +target_compile_features(mimi-model PRIVATE cxx_std_20) + +set(TARGET llama-mimi) +add_executable(${TARGET} mimi.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) -set(TARGET llama-mimi) -add_executable(${TARGET} mimi.cpp mimi-model.cpp) +set(TARGET llama-tts-csm) +add_executable(${TARGET} tts-csm.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) -# for using C++ designated initializers, TODO: can be changed back to C++17 in the future -target_compile_features(${TARGET} PRIVATE cxx_std_20) +target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE 
cxx_std_17) diff --git a/examples/tts/convert_mimi_to_gguf.py b/examples/tts/convert_mimi_to_gguf.py index 5dce72a398a91..81cb8f48cc25e 100644 --- a/examples/tts/convert_mimi_to_gguf.py +++ b/examples/tts/convert_mimi_to_gguf.py @@ -27,7 +27,7 @@ def __init__(self, endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.gguf_writer = gguf.GGUFWriter( path=None, - arch="if you see this, you are using the wrong file", + arch="this model cannot be used as LLM, use it via --model-vocoder in TTS examples", endianess=endianess) assert self.mimi_model.config.architectures[0] == "MimiModel" diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 427aeff8658bf..3663201dc5971 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -716,7 +716,7 @@ std::vector mimi_model::transpose_input(const std::vector & codes) { int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd"); - std::vector codes_T(n_codes_per_embd * n_codes); + std::vector codes_T(n_codes); for (int i = 0; i < n_codes / n_codes_per_embd; i++) { for (int j = 0; j < n_codes_per_embd; j++) { int src_idx = i * n_codes_per_embd + j; diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index b4b01331d2d22..843d7f6b79196 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "log.h" #include "arg.h" +#include "mimi-model.h" #include #include @@ -13,7 +14,13 @@ static void print_usage(int, char ** argv) { LOG("\nexample usage:\n"); - LOG("\n %s TODO ", argv[0]); + LOG("\n By default, model will be downloaded from https://huggingface.co/ggml-org/sesame-csm-1b-GGUF"); + LOG("\n %s -p \"[0]I have a dream that one day every valley shall be exalted\" -o output.wav", argv[0]); + LOG("\n"); + LOG("\n To use a local model, specify the path to the 
model file:"); + LOG("\n %s -p ... -m sesame-csm-backbone.gguf -mv kyutai-mimi.gguf -o output.wav", argv[0]); + LOG("\n"); + LOG("\n Note: the model need 2 files to run, one ends with '-backbone-.gguf' and the other ends with '-decoder.gguf'"); LOG("\n"); } @@ -51,10 +58,15 @@ static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { int main(int argc, char ** argv) { common_params params; - params.model = "sesame-csm-backbone.gguf"; - params.out_file = "output.wav"; - params.prompt = "[0]Hello from Sesame."; - params.n_predict = 2048; // CSM's max trained seq length + params.model = "sesame-csm-backbone.gguf"; + params.vocoder.model = "kyutai-mimi.gguf"; + params.out_file = "output.wav"; + params.prompt = "[0]Hello from Sesame."; + params.n_predict = 2048; // CSM's max trained seq length + + // HF model + params.model_url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf"; + params.vocoder.model_url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf"; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { return 1; @@ -71,6 +83,9 @@ int main(int argc, char ** argv) { common_params params_decoder(params); // duplicate the params params_decoder.n_ctx = 64; // we never use more than this string_replace_all(params_decoder.model, "-backbone", "-decoder"); + if (!params_decoder.model_url.empty()) { + string_replace_all(params_decoder.model_url, "-backbone", "-decoder"); + } common_init_result llama_backbone = common_init_from_params(params); llama_model * model_bb = llama_backbone.model.get(); @@ -88,6 +103,8 @@ int main(int argc, char ** argv) { return ENOENT; } + mimi_model mimi(params.vocoder.model.c_str(), true); + const llama_vocab * vocab = llama_model_get_vocab(model_bb); llama_tokens prompt_tokens = common_tokenize(vocab, params.prompt, false, true); prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); @@ -118,6 +135,7 @@ int 
main(int argc, char ** argv) { int64_t n_dc_gen = 0; // decoder generation count bool is_stop = false; + std::vector generated_codes; // backbone generation loop for (int k = 0; k < params.n_predict; ++k) { @@ -150,6 +168,7 @@ int main(int argc, char ** argv) { llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); printf("%d,", semantic_tok); + generated_codes.push_back(semantic_tok); // for (size_t i = 0; i < 10; ++i) { // printf("%4.2f, ", embd[i]); @@ -205,6 +224,7 @@ int main(int argc, char ** argv) { printf("%d,", acoustic_tok); tok = acoustic_tok; // next input token sum_codes += acoustic_tok; + generated_codes.push_back(acoustic_tok); } // do progressive hsum of embeddings @@ -246,5 +266,16 @@ int main(int argc, char ** argv) { llama_batch_free(batch_prompt); llama_batch_free(batch_past_embd); + printf("decode %zu RVQ tokens into wav...\n", generated_codes.size()); + generated_codes = mimi.transpose_input(generated_codes); + std::vector wav_data = mimi.decode(generated_codes); + + if (!save_wav16(params.out_file.c_str(), wav_data, mimi.get_sample_rate())) { + LOG_ERR("Failed to save wav file\n"); + return 1; + } + + printf("\n"); + return 0; } From 6976682fbc2226675610fedfdfd2ea9b8ce231a4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 14:28:13 +0200 Subject: [PATCH 16/31] update readme --- examples/tts/README-csm.md | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/examples/tts/README-csm.md b/examples/tts/README-csm.md index f660d8965ecbe..676b9889e157d 100644 --- a/examples/tts/README-csm.md +++ b/examples/tts/README-csm.md @@ -1,5 +1,27 @@ # Sesame CSM +This demo shows running inference of [Sesame CSM](https://github.com/SesameAILabs/csm) using llama.cpp / GGML + +It contains 3 components (each has its own GGUF file): +1. Backbone LLM +2. Decoder LLM +3. 
Mimi decoder + +## Quick start + +By default, all GGUF files are downloaded from [ggml-org Hugging Face's account](https://huggingface.co/ggml-org/sesame-csm-1b-GGUF) + +```sh +# build (make sure to have LLAMA_CURL enabled) +cmake -B build -DLLAMA_CURL=ON +cmake --build build -j --target llama-tts-csm + +# run it +./build/bin/llama-tts-csm -p "[0]Hi, my name is Xuan Son. I am software engineer at Hugging Face." +``` + +## Convert the model yourself + To get the GGUF: ```sh @@ -14,16 +36,10 @@ python examples/tts/convert_csm_to_gguf.py python examples/tts/convert_csm_to_gguf.py --outtype q8_0 ``` -Compile the example: - -```sh -cmake --build build -j --target llama-tts-csm -``` - -Run the example: +Run the example using local file: ```sh -./build/bin/llama-tts-csm -m sesame-csm-backbone.gguf -p "[0]Hello world." +./build/bin/llama-tts-csm -m sesame-csm-backbone.gguf -mv kyutai-mimi.gguf -p "[0]Hello world." # sesame-csm-backbone.gguf will automatically be loaded # make sure the place these 2 GGUF files in the same directory From 1e9afd9d816881d7a244c7639eda2eea85da3ad3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 14:45:04 +0200 Subject: [PATCH 17/31] nits --- examples/tts/mimi.cpp | 1 + examples/tts/tts-csm.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 502e0150634b7..17047c9b4f1ce 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -79,6 +79,7 @@ int main(int argc, const char ** argv) { while (std::getline(fin, line)) { // Skip empty lines if (line.empty()) continue; + // TODO: support both comma (with spaces) and new line try { int code = std::stoi(line); codes.push_back(code); diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 843d7f6b79196..4d77e5f6d3169 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -270,6 +270,8 @@ int main(int argc, char ** argv) { generated_codes = mimi.transpose_input(generated_codes); std::vector 
wav_data = mimi.decode(generated_codes); + printf("output wav file: %s\n", params.out_file.c_str()); + if (!save_wav16(params.out_file.c_str(), wav_data, mimi.get_sample_rate())) { LOG_ERR("Failed to save wav file\n"); return 1; From 40ab1ab30755d78f59c1e8ce16cc249a4e0c5954 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 14:51:29 +0200 Subject: [PATCH 18/31] fix mul_mat_id read out-of-bound --- examples/tts/convert_csm_to_gguf.py | 6 +++--- src/llama-model.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/tts/convert_csm_to_gguf.py b/examples/tts/convert_csm_to_gguf.py index 09b7748c2a63d..53f586f19962d 100644 --- a/examples/tts/convert_csm_to_gguf.py +++ b/examples/tts/convert_csm_to_gguf.py @@ -179,10 +179,10 @@ def rename_transformer(name: str) -> str: is_decoder = True name = "audio_head.weight" if component == "decoder": - # add padding at the beginning so that build_lora_mm_id can be used + # add padding at the beginning and the end so that build_lora_mm_id can be used zero_tensor = torch.zeros(1, 1024, 2051) - data_torch = torch.cat([zero_tensor, data_torch], dim=0) - assert data_torch.shape == (32, 1024, 2051) + data_torch = torch.cat([zero_tensor, data_torch, zero_tensor], dim=0) + assert data_torch.shape == (33, 1024, 2051) # then, transpose it data_torch = data_torch.transpose(1, 2) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 11c34b4deaa52..64c6978e98b31 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1662,7 +1662,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { // TODO: maybe store these in gguf metadata int64_t csm_audio_cbook_size = 2051; // audio codebook size - int64_t csm_acoustic_tokens = 32; // equal to number of acoutic tokens for Mimi + int64_t csm_audio_tokens = 32; // equal to number of audio tokens for Mimi //int64_t csm_n_audio_vocab = csm_audio_cbook_size*csm_acoustic_tokens; csm_output_cbook = 
create_tensor(tn(LLM_TENSOR_CSM_CBOOK_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size}, TENSOR_NOT_REQUIRED); @@ -1670,7 +1670,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { bool is_backbone = csm_output_cbook != nullptr; csm_output_audio = is_backbone ? nullptr - : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size, csm_acoustic_tokens}, 0); + : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size, csm_audio_tokens+1}, 0); tok_embd = is_backbone ? create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0) From eaba2bfbcf7af45a140c6dad3803084df29cb922 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 15:05:32 +0200 Subject: [PATCH 19/31] will this fix windows build? --- examples/tts/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index ab184a85ba17b..e66c298db461a 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -4,7 +4,7 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) -add_library(mimi-model mimi-model.h mimi-model.cpp) +add_library(mimi-model STATIC mimi-model.h mimi-model.cpp) target_link_libraries(mimi-model PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) # for using C++ designated initializers, TODO: can be changed back to C++17 in the future target_compile_features(mimi-model PRIVATE cxx_std_20) From 5fe27efcebc0f4d01cc0a52c763db5365ec0634c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 23:49:49 +0200 Subject: [PATCH 20/31] (try) fixing problem with long text --- examples/tts/tts-csm.cpp | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 4d77e5f6d3169..fb5146a3bbcc9 100644 --- 
a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -13,7 +13,7 @@ // For more details on how this works, see: https://github.com/ggml-org/llama.cpp/pull/12648 static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); + LOG("\nExample usage:\n"); LOG("\n By default, model will be downloaded from https://huggingface.co/ggml-org/sesame-csm-1b-GGUF"); LOG("\n %s -p \"[0]I have a dream that one day every valley shall be exalted\" -o output.wav", argv[0]); LOG("\n"); @@ -22,6 +22,11 @@ static void print_usage(int, char ** argv) { LOG("\n"); LOG("\n Note: the model need 2 files to run, one ends with '-backbone-.gguf' and the other ends with '-decoder.gguf'"); LOG("\n"); + LOG("\nPrompt format:"); + LOG("\n Each line must start with speaker ID in square brackets, followed by the text. A full stop is recommended at the end of each turn"); + LOG("\n Example: [0]Hello world."); + LOG("\n If you want to enter long text, use -f file.txt to read from file"); + LOG("\n"); } // greedy sampling with custom n_vocab @@ -61,7 +66,7 @@ int main(int argc, char ** argv) { params.model = "sesame-csm-backbone.gguf"; params.vocoder.model = "kyutai-mimi.gguf"; params.out_file = "output.wav"; - params.prompt = "[0]Hello from Sesame."; + params.prompt = ""; params.n_predict = 2048; // CSM's max trained seq length // HF model @@ -75,6 +80,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); + if (params.prompt.empty()) { + LOG_ERR("prompt is empty\n"); + return 1; + } + std::vector embd; params.cb_eval = ggml_callback; params.cb_eval_user_data = &embd; @@ -167,7 +177,7 @@ int main(int argc, char ** argv) { // printf("\n"); llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); - printf("%d,", semantic_tok); + printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok); generated_codes.push_back(semantic_tok); // for (size_t i = 0; i < 10; ++i) { @@ -200,7 +210,7 @@ int main(int argc, 
char ** argv) { // then, decode the semantic_tok to generate acoustic tokens llama_token tok = semantic_tok; int n_codes = 32; - int sum_codes = 0; // to check if all codes are 0 + int sum_codes = semantic_tok; // to check if all codes are 0 for (int i = 0; i < n_codes; ++i) { common_batch_clear(batch_token); // encoder vocab is further divided into 32 codebooks, each with 2051 entries @@ -228,9 +238,12 @@ int main(int argc, char ** argv) { } // do progressive hsum of embeddings - GGML_ASSERT(inp_past_embd.size() == embd.size()); - for (size_t i = 0; i < inp_past_embd.size(); ++i) { - inp_past_embd[i] += embd[i]; + // skip first semantic code + if (i > 0) { + GGML_ASSERT(inp_past_embd.size() == embd.size()); + for (size_t i = 0; i < inp_past_embd.size(); ++i) { + inp_past_embd[i] += embd[i]; + } } } printf("\n"); @@ -253,6 +266,8 @@ int main(int argc, char ** argv) { // printf("\n"); if (is_stop) { + // remove last 32 codes since they will be all zeros + generated_codes.resize(generated_codes.size() - 32); break; } } From c796ee0f6620ea049e9e8fe63310e1078b3d8cae Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 31 Mar 2025 00:02:25 +0200 Subject: [PATCH 21/31] mimi: fix frame splitting --- examples/tts/mimi-model.cpp | 3 ++- examples/tts/mimi-model.h | 4 ++-- examples/tts/mimi.cpp | 2 ++ examples/tts/tts-csm.cpp | 1 - 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 3663201dc5971..fee88c679e1f3 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -665,7 +665,8 @@ std::vector mimi_model::decode_frame(const std::vector & codes, int ctx->set_tensor_data("pos_dec", pos_data.data()); // code data - ctx->set_tensor_data("inp_dec", codes.data()); + auto codes_T = mimi_model::transpose_input(codes); + ctx->set_tensor_data("inp_dec", codes_T.data()); ctx->compute(); diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h index 96945981513c0..1ded07e875e7d 
100644 --- a/examples/tts/mimi-model.h +++ b/examples/tts/mimi-model.h @@ -25,9 +25,9 @@ struct mimi_model { // transpose layout: // - from: (1 semantic code followed by 31 acoustic codes) repeast N times // - to: N semantic codes followed by (N*31) acoustic codes - std::vector transpose_input(const std::vector & codes); + static std::vector transpose_input(const std::vector & codes); - // layout of codes: N semantic codes followed by (N*31) acoustic codes + // layout of codes: (1 semantic code followed by 31 acoustic codes) repeast N times std::vector decode(const std::vector & codes); // TODO: implement encoding pass diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 17047c9b4f1ce..293f2fb775c3b 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -69,6 +69,8 @@ int main(int argc, const char ** argv) { 1740 ,1154 ,1839 ,912 ,731 ,602 ,1064 ,1508 ,834 ,1387 ,252 ,745 ,1034 ,1102 ,965 ,696 , 1971 ,1729 ,666 ,282 ,1993 ,1551 ,1703 ,1124 ,1628 ,1725 ,107 ,808 ,1096 ,1753 ,500 ,677 , }; + // this particular example is pre-transposed, we need to undo that + codes = mimi_model::transpose_input(codes); } else { std::ifstream fin(codes_path); if (!fin) { diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index fb5146a3bbcc9..2dda42198637d 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -282,7 +282,6 @@ int main(int argc, char ** argv) { llama_batch_free(batch_past_embd); printf("decode %zu RVQ tokens into wav...\n", generated_codes.size()); - generated_codes = mimi.transpose_input(generated_codes); std::vector wav_data = mimi.decode(generated_codes); printf("output wav file: %s\n", params.out_file.c_str()); From e31a75c209af16376fb53ea829def34ca4fbad9c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 31 Mar 2025 11:53:24 +0200 Subject: [PATCH 22/31] fix mimi example dummy1 --- examples/tts/mimi-model.h | 11 ++++--- examples/tts/mimi.cpp | 66 +++++++++++++++++++-------------------- 2 files changed, 
38 insertions(+), 39 deletions(-) diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h index 1ded07e875e7d..eb5eb46c22807 100644 --- a/examples/tts/mimi-model.h +++ b/examples/tts/mimi-model.h @@ -22,11 +22,6 @@ struct mimi_model { int get_sample_rate() const; - // transpose layout: - // - from: (1 semantic code followed by 31 acoustic codes) repeast N times - // - to: N semantic codes followed by (N*31) acoustic codes - static std::vector transpose_input(const std::vector & codes); - // layout of codes: (1 semantic code followed by 31 acoustic codes) repeast N times std::vector decode(const std::vector & codes); @@ -35,4 +30,10 @@ struct mimi_model { private: std::vector decode_frame(const std::vector & codes, int & n_past); + + // transpose layout (from streaming layout to non-streaming): + // - from: (1 semantic code followed by 31 acoustic codes) repeast N times + // - to: N semantic codes followed by (N*31) acoustic codes + // streaming layout is 1-31, 1-31, 1-31, ..., used for real-time processing + static std::vector transpose_input(const std::vector & codes); }; diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 293f2fb775c3b..a50bd44f599a9 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -36,41 +36,39 @@ int main(int argc, const char ** argv) { } else if (strcmp(codes_path, "dummy1") == 0) { printf("Using dummy1 codes\n"); codes = { - 1049 ,1415 ,1962 ,914 ,1372 ,704 ,1922 ,2036 ,288 ,968 ,193 ,1139 ,897 ,897 ,1243 ,1511 , - 1597 ,175 ,1280 ,1202 ,1911 ,85 ,47 ,692 ,632 ,251 ,1553 ,1735 ,1577 ,132 ,471 ,433 , - 1325 ,1539 ,1943 ,1601 ,141 ,257 ,564 ,1435 ,876 ,1096 ,636 ,61 ,1497 ,1010 ,485 ,284 , - 839 ,776 ,878 ,1719 ,1069 ,1302 ,893 ,2005 ,875 ,908 ,586 ,2001 ,186 ,1932 ,1765 ,721 , - 592 ,1046 ,1588 ,1670 ,1485 ,1141 ,34 ,1465 ,1156 ,1938 ,435 ,753 ,1418 ,277 ,391 ,1741 , - 1440 ,117 ,723 ,412 ,642 ,1717 ,131 ,37 ,345 ,112 ,1979 ,2034 ,1822 ,1536 ,1281 ,56 , - 1341 ,803 ,568 ,568 ,1370 ,1995 ,1063 ,892 ,273 
,895 ,1226 ,354 ,1726 ,1541 ,1607 ,615 , - 985 ,1499 ,1736 ,1838 ,702 ,1345 ,1657 ,511 ,1774 ,1787 ,945 ,1927 ,947 ,952 ,1418 ,916 , - 1239 ,1457 ,1021 ,341 ,284 ,882 ,474 ,1559 ,1923 ,273 ,1330 ,1406 ,1782 ,19 ,116 ,887 , - 1146 ,1307 ,983 ,1237 ,1407 ,1350 ,1960 ,1255 ,878 ,1979 ,1500 ,1939 ,1415 ,88 ,1702 ,1253 , - 1778 ,2 ,10 ,1279 ,999 ,1549 ,1049 ,373 ,1355 ,1200 ,1466 ,1009 ,75 ,2042 ,1725 ,916 , - 1636 ,1135 ,833 ,830 ,1758 ,2015 ,1275 ,1675 ,287 ,744 ,89 ,430 ,1724 ,1232 ,1692 ,535 , - 1485 ,1287 ,973 ,1815 ,314 ,2020 ,424 ,1085 ,982 ,1994 ,1563 ,1269 ,1769 ,1681 ,1082 ,1666 , - 1622 ,1039 ,1209 ,32 ,679 ,732 ,976 ,1462 ,805 ,402 ,1150 ,170 ,1529 ,2013 ,350 ,1175 , - 757 ,1124 ,1091 ,1369 ,1061 ,415 ,1217 ,1135 ,1360 ,1578 ,1205 ,1785 ,1835 ,1241 ,14 ,716 , - 480 ,716 ,681 ,1686 ,1624 ,335 ,865 ,1356 ,1688 ,307 ,366 ,541 ,1262 ,1167 ,59 ,269 , - 1899 ,1798 ,1606 ,1307 ,1549 ,1814 ,114 ,483 ,958 ,1919 ,1179 ,898 ,834 ,1526 ,386 ,447 , - 1481 ,201 ,779 ,419 ,430 ,1451 ,1000 ,156 ,1062 ,615 ,1353 ,414 ,1214 ,1487 ,882 ,32 , - 840 ,1517 ,334 ,1143 ,823 ,454 ,725 ,1298 ,1325 ,649 ,1737 ,913 ,685 ,761 ,2010 ,63 , - 1397 ,1299 ,765 ,1158 ,1809 ,1299 ,1585 ,1776 ,625 ,1539 ,830 ,1563 ,461 ,308 ,1438 ,321 , - 82 ,886 ,1836 ,325 ,1976 ,761 ,359 ,1136 ,1720 ,2036 ,904 ,719 ,526 ,1567 ,145 ,1860 , - 1565 ,1786 ,1400 ,1696 ,232 ,1736 ,512 ,518 ,1895 ,1854 ,1584 ,1393 ,1869 ,1702 ,789 ,1986 , - 116 ,521 ,150 ,1597 ,727 ,1916 ,815 ,1826 ,1382 ,653 ,1596 ,286 ,1373 ,177 ,1397 ,1009 , - 1449 ,353 ,877 ,93 ,266 ,1853 ,1255 ,872 ,1974 ,556 ,1885 ,857 ,992 ,5 ,1921 ,1849 , - 1038 ,1912 ,464 ,795 ,747 ,56 ,124 ,431 ,1868 ,609 ,855 ,1522 ,912 ,1709 ,1507 ,1062 , - 1015 ,1357 ,1487 ,4 ,253 ,1871 ,933 ,215 ,1228 ,633 ,1306 ,2024 ,1453 ,900 ,457 ,471 , - 436 ,1311 ,870 ,1032 ,134 ,984 ,1983 ,1103 ,1627 ,1627 ,414 ,1845 ,583 ,1699 ,1458 ,2018 , - 150 ,450 ,1114 ,369 ,267 ,1273 ,1136 ,1578 ,1063 ,1820 ,120 ,779 ,652 ,1266 ,1929 ,1213 , - 159 ,297 ,1703 ,819 ,93 ,247 ,1366 ,144 ,1617 
,1428 ,812 ,121 ,1637 ,1620 ,289 ,1557 , - 1414 ,971 ,476 ,1685 ,428 ,1802 ,653 ,1290 ,614 ,1663 ,1528 ,1344 ,798 ,1027 ,1305 ,990 , - 1740 ,1154 ,1839 ,912 ,731 ,602 ,1064 ,1508 ,834 ,1387 ,252 ,745 ,1034 ,1102 ,965 ,696 , - 1971 ,1729 ,666 ,282 ,1993 ,1551 ,1703 ,1124 ,1628 ,1725 ,107 ,808 ,1096 ,1753 ,500 ,677 , + 1049 ,1597 ,1325 ,839 ,592 ,1440 ,1341 ,985 ,1239 ,1146 ,1778 ,1636 ,1485 ,1622 ,757 ,480 , + 1899 ,1481 ,840 ,1397 ,82 ,1565 ,116 ,1449 ,1038 ,1015 ,436 ,150 ,159 ,1414 ,1740 ,1971 , + 1415 ,175 ,1539 ,776 ,1046 ,117 ,803 ,1499 ,1457 ,1307 ,2 ,1135 ,1287 ,1039 ,1124 ,716 , + 1798 ,201 ,1517 ,1299 ,886 ,1786 ,521 ,353 ,1912 ,1357 ,1311 ,450 ,297 ,971 ,1154 ,1729 , + 1962 ,1280 ,1943 ,878 ,1588 ,723 ,568 ,1736 ,1021 ,983 ,10 ,833 ,973 ,1209 ,1091 ,681 , + 1606 ,779 ,334 ,765 ,1836 ,1400 ,150 ,877 ,464 ,1487 ,870 ,1114 ,1703 ,476 ,1839 ,666 , + 914 ,1202 ,1601 ,1719 ,1670 ,412 ,568 ,1838 ,341 ,1237 ,1279 ,830 ,1815 ,32 ,1369 ,1686 , + 1307 ,419 ,1143 ,1158 ,325 ,1696 ,1597 ,93 ,795 ,4 ,1032 ,369 ,819 ,1685 ,912 ,282 , + 1372 ,1911 ,141 ,1069 ,1485 ,642 ,1370 ,702 ,284 ,1407 ,999 ,1758 ,314 ,679 ,1061 ,1624 , + 1549 ,430 ,823 ,1809 ,1976 ,232 ,727 ,266 ,747 ,253 ,134 ,267 ,93 ,428 ,731 ,1993 , + 704 ,85 ,257 ,1302 ,1141 ,1717 ,1995 ,1345 ,882 ,1350 ,1549 ,2015 ,2020 ,732 ,415 ,335 , + 1814 ,1451 ,454 ,1299 ,761 ,1736 ,1916 ,1853 ,56 ,1871 ,984 ,1273 ,247 ,1802 ,602 ,1551 , + 1922 ,47 ,564 ,893 ,34 ,131 ,1063 ,1657 ,474 ,1960 ,1049 ,1275 ,424 ,976 ,1217 ,865 , + 114 ,1000 ,725 ,1585 ,359 ,512 ,815 ,1255 ,124 ,933 ,1983 ,1136 ,1366 ,653 ,1064 ,1703 , + 2036 ,692 ,1435 ,2005 ,1465 ,37 ,892 ,511 ,1559 ,1255 ,373 ,1675 ,1085 ,1462 ,1135 ,1356 , + 483 ,156 ,1298 ,1776 ,1136 ,518 ,1826 ,872 ,431 ,215 ,1103 ,1578 ,144 ,1290 ,1508 ,1124 , + 288 ,632 ,876 ,875 ,1156 ,345 ,273 ,1774 ,1923 ,878 ,1355 ,287 ,982 ,805 ,1360 ,1688 , + 958 ,1062 ,1325 ,625 ,1720 ,1895 ,1382 ,1974 ,1868 ,1228 ,1627 ,1063 ,1617 ,614 ,834 ,1628 , + 968 ,251 ,1096 ,908 ,1938 ,112 ,895 ,1787 
,273 ,1979 ,1200 ,744 ,1994 ,402 ,1578 ,307 , + 1919 ,615 ,649 ,1539 ,2036 ,1854 ,653 ,556 ,609 ,633 ,1627 ,1820 ,1428 ,1663 ,1387 ,1725 , + 193 ,1553 ,636 ,586 ,435 ,1979 ,1226 ,945 ,1330 ,1500 ,1466 ,89 ,1563 ,1150 ,1205 ,366 , + 1179 ,1353 ,1737 ,830 ,904 ,1584 ,1596 ,1885 ,855 ,1306 ,414 ,120 ,812 ,1528 ,252 ,107 , + 1139 ,1735 ,61 ,2001 ,753 ,2034 ,354 ,1927 ,1406 ,1939 ,1009 ,430 ,1269 ,170 ,1785 ,541 , + 898 ,414 ,913 ,1563 ,719 ,1393 ,286 ,857 ,1522 ,2024 ,1845 ,779 ,121 ,1344 ,745 ,808 , + 897 ,1577 ,1497 ,186 ,1418 ,1822 ,1726 ,947 ,1782 ,1415 ,75 ,1724 ,1769 ,1529 ,1835 ,1262 , + 834 ,1214 ,685 ,461 ,526 ,1869 ,1373 ,992 ,912 ,1453 ,583 ,652 ,1637 ,798 ,1034 ,1096 , + 897 ,132 ,1010 ,1932 ,277 ,1536 ,1541 ,952 ,19 ,88 ,2042 ,1232 ,1681 ,2013 ,1241 ,1167 , + 1526 ,1487 ,761 ,308 ,1567 ,1702 ,177 ,5 ,1709 ,900 ,1699 ,1266 ,1620 ,1027 ,1102 ,1753 , + 1243 ,471 ,485 ,1765 ,391 ,1281 ,1607 ,1418 ,116 ,1702 ,1725 ,1692 ,1082 ,350 ,14 ,59 , + 386 ,882 ,2010 ,1438 ,145 ,789 ,1397 ,1921 ,1507 ,457 ,1458 ,1929 ,289 ,1305 ,965 ,500 , + 1511 ,433 ,284 ,721 ,1741 ,56 ,615 ,916 ,887 ,1253 ,916 ,535 ,1666 ,1175 ,716 ,269 , + 447 ,32 ,63 ,321 ,1860 ,1986 ,1009 ,1849 ,1062 ,471 ,2018 ,1213 ,1557 ,990 ,696 ,677 , }; - // this particular example is pre-transposed, we need to undo that - codes = mimi_model::transpose_input(codes); } else { std::ifstream fin(codes_path); if (!fin) { From 5be8e7d64a5ff2c47b503a96d9e4730718b84b5b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 31 Mar 2025 12:41:12 +0200 Subject: [PATCH 23/31] add top-k and temp sampling --- examples/tts/tts-csm.cpp | 54 +++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 2dda42198637d..19c6d46a834a2 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -29,17 +29,27 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -// greedy sampling with custom n_vocab -static 
llama_token sample_greedy(const float * logits, int n_vocab) { - llama_token max_idx = -1; - float max_val = -FLT_MAX; - for (int i = 0; i < n_vocab; ++i) { - if (logits[i] > max_val) { - max_val = logits[i]; - max_idx = i; - } +// sampling with custom n_vocab +// modified version of llama_sampler_sample() +static llama_token sample_token(struct llama_sampler * smpl, const float * logits, int n_vocab) { + std::vector cur; + cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); } - return max_idx; + + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(smpl, &cur_p); + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + auto token = cur_p.data[cur_p.selected].id; + llama_sampler_accept(smpl, token); + return token; } // hook to retrieve the embeddings @@ -63,11 +73,13 @@ static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { int main(int argc, char ** argv) { common_params params; - params.model = "sesame-csm-backbone.gguf"; - params.vocoder.model = "kyutai-mimi.gguf"; - params.out_file = "output.wav"; - params.prompt = ""; - params.n_predict = 2048; // CSM's max trained seq length + params.model = "sesame-csm-backbone.gguf"; + params.vocoder.model = "kyutai-mimi.gguf"; + params.out_file = "output.wav"; + params.prompt = ""; + params.n_predict = 2048; // CSM's max trained seq length + params.sampling.top_k = 50; // default param from CSM python code + params.sampling.temp = 0.9; // default param from CSM python code // HF model params.model_url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf"; @@ -115,11 +127,19 @@ int main(int argc, char ** argv) { mimi_model mimi(params.vocoder.model.c_str(), true); + // tokenize the prompt const llama_vocab * vocab = 
llama_model_get_vocab(model_bb); llama_tokens prompt_tokens = common_tokenize(vocab, params.prompt, false, true); prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab)); + // init sampler + // the python implementation only has top-k and temperature sampling, so we'll use just that + llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_top_k(params.sampling.top_k)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(params.sampling.temp)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(params.sampling.seed)); + printf("prompt tokens: \n"); for (size_t i = 0; i < prompt_tokens.size(); ++i) { printf("%d, ", prompt_tokens[i]); @@ -176,7 +196,7 @@ int main(int argc, char ** argv) { // } // printf("\n"); - llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); + llama_token semantic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok); generated_codes.push_back(semantic_tok); @@ -227,7 +247,7 @@ int main(int argc, char ** argv) { // sample the acoustic token auto logits = llama_get_logits_ith(ctx_dc, 0); - llama_token acoustic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc)); + llama_token acoustic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); // discard last code (only for embeddings) if (i < n_codes - 1) { From 90231cc2514907ae2779ad85cdb88af3fe21b49a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 1 Apr 2025 09:51:27 +0200 Subject: [PATCH 24/31] much better on long generation --- examples/tts/tts-csm.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 19c6d46a834a2..915653c518a73 100644 --- 
a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -258,12 +258,9 @@ int main(int argc, char ** argv) { } // do progressive hsum of embeddings - // skip first semantic code - if (i > 0) { - GGML_ASSERT(inp_past_embd.size() == embd.size()); - for (size_t i = 0; i < inp_past_embd.size(); ++i) { - inp_past_embd[i] += embd[i]; - } + GGML_ASSERT(inp_past_embd.size() == embd.size()); + for (size_t i = 0; i < inp_past_embd.size(); ++i) { + inp_past_embd[i] += embd[i]; } } printf("\n"); From e9dc47687c8a92135f8d15db8f7e2057598995bb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 2 Apr 2025 16:41:28 +0200 Subject: [PATCH 25/31] fix tts-csm --- examples/tts/tts-csm.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 915653c518a73..3cb844615681e 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -73,17 +73,17 @@ static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { int main(int argc, char ** argv) { common_params params; - params.model = "sesame-csm-backbone.gguf"; - params.vocoder.model = "kyutai-mimi.gguf"; - params.out_file = "output.wav"; - params.prompt = ""; - params.n_predict = 2048; // CSM's max trained seq length - params.sampling.top_k = 50; // default param from CSM python code - params.sampling.temp = 0.9; // default param from CSM python code + params.model.path = "sesame-csm-backbone.gguf"; + params.vocoder.model.path = "kyutai-mimi.gguf"; + params.out_file = "output.wav"; + params.prompt = ""; + params.n_predict = 2048; // CSM's max trained seq length + params.sampling.top_k = 50; // default param from CSM python code + params.sampling.temp = 0.9; // default param from CSM python code // HF model - params.model_url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf"; - params.vocoder.model_url = 
"https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf"; + params.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf"; + params.vocoder.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf"; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { return 1; @@ -104,10 +104,8 @@ int main(int argc, char ** argv) { common_params params_decoder(params); // duplicate the params params_decoder.n_ctx = 64; // we never use more than this - string_replace_all(params_decoder.model, "-backbone", "-decoder"); - if (!params_decoder.model_url.empty()) { - string_replace_all(params_decoder.model_url, "-backbone", "-decoder"); - } + string_replace_all(params_decoder.model.path, "-backbone", "-decoder"); + string_replace_all(params_decoder.model.url, "-backbone", "-decoder"); common_init_result llama_backbone = common_init_from_params(params); llama_model * model_bb = llama_backbone.model.get(); @@ -125,7 +123,7 @@ int main(int argc, char ** argv) { return ENOENT; } - mimi_model mimi(params.vocoder.model.c_str(), true); + mimi_model mimi(params.vocoder.model.path.c_str(), true); // tokenize the prompt const llama_vocab * vocab = llama_model_get_vocab(model_bb); From c681257e58d171c8a33fa867f555539a1db8ff0a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 2 Apr 2025 17:31:29 +0200 Subject: [PATCH 26/31] ability to do multi-turns --- examples/tts/csm-demo.txt | 5 + examples/tts/tts-csm.cpp | 293 +++++++++++++++++++++----------------- 2 files changed, 171 insertions(+), 127 deletions(-) create mode 100644 examples/tts/csm-demo.txt diff --git a/examples/tts/csm-demo.txt b/examples/tts/csm-demo.txt new file mode 100644 index 0000000000000..1c913388bfb3d --- /dev/null +++ b/examples/tts/csm-demo.txt @@ -0,0 +1,5 @@ +[0]Hey how are you doing. +[1]Pretty good, pretty good. +[0]I'm great, so happy to be speaking to you. +What about you? 
+[1]Me too, this is some cool stuff huh? diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 3cb844615681e..a8a9bd22d955b 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -5,6 +5,7 @@ #include "mimi-model.h" #include +#include #include #include #include // memcpy and strcmp @@ -23,12 +24,39 @@ static void print_usage(int, char ** argv) { LOG("\n Note: the model need 2 files to run, one ends with '-backbone-.gguf' and the other ends with '-decoder.gguf'"); LOG("\n"); LOG("\nPrompt format:"); - LOG("\n Each line must start with speaker ID in square brackets, followed by the text. A full stop is recommended at the end of each turn"); - LOG("\n Example: [0]Hello world."); + LOG("\n Each line must start with speaker ID in square brackets, followed by the text. One turn per line. A full stop is recommended at the end of each turn"); + LOG("\n Example:"); + LOG("\n [0]Hey how are you doing."); + LOG("\n [1]Pretty good, pretty good."); LOG("\n If you want to enter long text, use -f file.txt to read from file"); LOG("\n"); } +// split text containing "[N]..." 
into speaker turns +static std::vector get_speaker_turns(const std::string & input) { + if (input.empty()) { + LOG_ERR("Empty input\n"); + return {}; + } + if (input[0] != '[') { + LOG_ERR("Invalid input format: missing speaker ID\n"); + return {}; + } + std::regex re(R"((\[\d+\][\s\S]*?)(?=\[\d+\]|$))"); + std::smatch match; + std::vector turns; + std::string::const_iterator searchStart(input.cbegin()); + while (std::regex_search(searchStart, input.cend(), match, re)) { + std::string turn = match[1].str(); + if (turn.empty()) { + continue; + } + turns.push_back(turn); + searchStart = match.suffix().first; + } + return turns; +} + // sampling with custom n_vocab // modified version of llama_sampler_sample() static llama_token sample_token(struct llama_sampler * smpl, const float * logits, int n_vocab) { @@ -81,9 +109,11 @@ int main(int argc, char ** argv) { params.sampling.top_k = 50; // default param from CSM python code params.sampling.temp = 0.9; // default param from CSM python code - // HF model - params.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf"; - params.vocoder.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf"; + // HF model (hack: we temporary reuse speculative.model as the decoder model, only to get it downloaded) + params.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf"; + params.speculative.model.path = "sesame-csm-decoder.gguf"; + params.speculative.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-decoder.gguf"; + params.vocoder.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf"; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { return 1; @@ -125,12 +155,6 @@ int main(int argc, char ** argv) { mimi_model mimi(params.vocoder.model.path.c_str(), true); - // tokenize the prompt - const 
llama_vocab * vocab = llama_model_get_vocab(model_bb); - llama_tokens prompt_tokens = common_tokenize(vocab, params.prompt, false, true); - prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); - prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab)); - // init sampler // the python implementation only has top-k and temperature sampling, so we'll use just that llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); @@ -138,19 +162,8 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(params.sampling.temp)); llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(params.sampling.seed)); - printf("prompt tokens: \n"); - for (size_t i = 0; i < prompt_tokens.size(); ++i) { - printf("%d, ", prompt_tokens[i]); - } - printf("\n"); - - llama_pos n_past_bb = 0; llama_batch batch_prompt = llama_batch_init(params.n_batch, 0, 1); - common_batch_clear(batch_prompt); - for (size_t i = 0; i < prompt_tokens.size(); ++i) { - common_batch_add(batch_prompt, prompt_tokens[i], n_past_bb++, { 0 }, false); - } - batch_prompt.logits[batch_prompt.n_tokens - 1] = true; + llama_pos n_past_bb = 0; // inp_past_embd is the "squashed" embeddings from the decoder std::vector inp_past_embd(2048, 0.0f); @@ -162,128 +175,154 @@ int main(int argc, char ** argv) { int64_t t_dc = 0; // decoder time int64_t n_dc_gen = 0; // decoder generation count - bool is_stop = false; std::vector generated_codes; - // backbone generation loop - for (int k = 0; k < params.n_predict; ++k) { - bool is_prompt_processing = k == 0; - - if (!is_prompt_processing) { - // generate the next RVQ semantic token - batch_past_embd.n_tokens = 1; - batch_past_embd.pos[0] = n_past_bb++; - batch_past_embd.seq_id[0][0] = 0; - batch_past_embd.n_seq_id[0] = 1; - batch_past_embd.logits[0] = true; - std::memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float)); - } + auto turns = 
get_speaker_turns(params.prompt); + + for (const std::string & turn : turns) { + // tokenize the turn + llama_tokens prompt_tokens; + { + printf("\n---\nturn: %s\n\n", turn.c_str()); + const llama_vocab * vocab = llama_model_get_vocab(model_bb); + prompt_tokens = common_tokenize(vocab, turn, false, true); + prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); + prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab)); + + printf("prompt (%zu tokens): \n", prompt_tokens.size()); + for (size_t i = 0; i < prompt_tokens.size(); ++i) { + printf("%d, ", prompt_tokens[i]); + } + printf("\n"); - int64_t t_bb_start = ggml_time_ms(); - if (llama_decode(ctx_bb, is_prompt_processing ? batch_prompt : batch_past_embd) != 0) { - LOG_ERR("%s: backbone llama_decode() failed\n", __func__); - return 1; + common_batch_clear(batch_prompt); + for (size_t i = 0; i < prompt_tokens.size(); ++i) { + common_batch_add(batch_prompt, prompt_tokens[i], n_past_bb++, { 0 }, false); + } + batch_prompt.logits[batch_prompt.n_tokens - 1] = true; } - n_bb_gen++; - t_bb += ggml_time_ms() - t_bb_start; - auto vocab_dc = llama_model_get_vocab(model_dc); - auto logits = llama_get_logits_ith(ctx_bb, is_prompt_processing ? 
(batch_prompt.n_tokens - 1) : 0); - // for (size_t i = 0; i < 10; ++i) { - // printf("%4.2f, ", logits[i]); - // } - // printf("\n"); + // backbone generation loop + bool is_end_of_turn = false; + for (int k = 0; k < params.n_predict; ++k) { + bool is_prompt_processing = k == 0; + + if (!is_prompt_processing) { + // generate the next RVQ semantic token + batch_past_embd.n_tokens = 1; + batch_past_embd.pos[0] = n_past_bb++; + batch_past_embd.seq_id[0][0] = 0; + batch_past_embd.n_seq_id[0] = 1; + batch_past_embd.logits[0] = true; + std::memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float)); + } - llama_token semantic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); - printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok); - generated_codes.push_back(semantic_tok); + int64_t t_bb_start = ggml_time_ms(); + if (llama_decode(ctx_bb, is_prompt_processing ? batch_prompt : batch_past_embd) != 0) { + LOG_ERR("%s: backbone llama_decode() failed\n", __func__); + return 1; + } + n_bb_gen++; + t_bb += ggml_time_ms() - t_bb_start; - // for (size_t i = 0; i < 10; ++i) { - // printf("%4.2f, ", embd[i]); - // } - // printf("\n"); + auto vocab_dc = llama_model_get_vocab(model_dc); + auto logits = llama_get_logits_ith(ctx_bb, is_prompt_processing ? 
(batch_prompt.n_tokens - 1) : 0); + // for (size_t i = 0; i < 10; ++i) { + // printf("%4.2f, ", logits[i]); + // } + // printf("\n"); + llama_token semantic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); + printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok); + generated_codes.push_back(semantic_tok); - // decoder generation loop - inp_past_embd = std::vector(inp_past_embd.size(), 0.0f); - { - llama_kv_self_clear(ctx_dc); - llama_batch batch_embd = llama_batch_init(1, embd.size(), 1); - llama_batch batch_token = llama_batch_init(1, 0, 1); + // for (size_t i = 0; i < 10; ++i) { + // printf("%4.2f, ", embd[i]); + // } + // printf("\n"); - // first "token" is the latent embeddings from backbone - { - batch_embd.n_tokens = 1; - batch_embd.pos[0] = 0; - batch_embd.seq_id[0][0] = 0; - batch_embd.n_seq_id[0] = 1; - batch_embd.logits[0] = false; - std::memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float)); - } - if (llama_decode(ctx_dc, batch_embd) != 0) { - LOG_ERR("%s: decoder llama_decode(embd) failed\n", __func__); - return 1; - } - // then, decode the semantic_tok to generate acoustic tokens - llama_token tok = semantic_tok; - int n_codes = 32; - int sum_codes = semantic_tok; // to check if all codes are 0 - for (int i = 0; i < n_codes; ++i) { - common_batch_clear(batch_token); - // encoder vocab is further divided into 32 codebooks, each with 2051 entries - llama_token inp_tok = tok + 2051*i; - common_batch_add(batch_token, inp_tok, i+1, { 0 }, true); - - int64_t t_bb_start = ggml_time_ms(); - if (llama_decode(ctx_dc, batch_token) != 0) { - LOG_ERR("%s: decoder llama_decode(token) failed\n", __func__); - return 1; + // decoder generation loop + inp_past_embd = std::vector(inp_past_embd.size(), 0.0f); + { + llama_kv_self_clear(ctx_dc); + llama_batch batch_embd = llama_batch_init(1, embd.size(), 1); + llama_batch batch_token = llama_batch_init(1, 0, 1); + + // first "token" is the latent embeddings from 
backbone + { + batch_embd.n_tokens = 1; + batch_embd.pos[0] = 0; + batch_embd.seq_id[0][0] = 0; + batch_embd.n_seq_id[0] = 1; + batch_embd.logits[0] = false; + std::memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float)); } - n_dc_gen++; - t_dc += ggml_time_ms() - t_bb_start; - - // sample the acoustic token - auto logits = llama_get_logits_ith(ctx_dc, 0); - llama_token acoustic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); - - // discard last code (only for embeddings) - if (i < n_codes - 1) { - printf("%d,", acoustic_tok); - tok = acoustic_tok; // next input token - sum_codes += acoustic_tok; - generated_codes.push_back(acoustic_tok); + if (llama_decode(ctx_dc, batch_embd) != 0) { + LOG_ERR("%s: decoder llama_decode(embd) failed\n", __func__); + return 1; } - // do progressive hsum of embeddings - GGML_ASSERT(inp_past_embd.size() == embd.size()); - for (size_t i = 0; i < inp_past_embd.size(); ++i) { - inp_past_embd[i] += embd[i]; + // then, decode the semantic_tok to generate acoustic tokens + llama_token tok = semantic_tok; + int n_codes = 32; + int sum_codes = semantic_tok; // to check if all codes are 0 + for (int i = 0; i < n_codes; ++i) { + common_batch_clear(batch_token); + // encoder vocab is further divided into 32 codebooks, each with 2051 entries + llama_token inp_tok = tok + 2051*i; + common_batch_add(batch_token, inp_tok, i+1, { 0 }, true); + + int64_t t_bb_start = ggml_time_ms(); + if (llama_decode(ctx_dc, batch_token) != 0) { + LOG_ERR("%s: decoder llama_decode(token) failed\n", __func__); + return 1; + } + n_dc_gen++; + t_dc += ggml_time_ms() - t_bb_start; + + // sample the acoustic token + auto logits = llama_get_logits_ith(ctx_dc, 0); + llama_token acoustic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc)); + + // discard last code (only for embeddings) + if (i < n_codes - 1) { + printf("%d,", acoustic_tok); + tok = acoustic_tok; // next input token + sum_codes += acoustic_tok; + 
generated_codes.push_back(acoustic_tok); + } + + // do progressive hsum of embeddings + GGML_ASSERT(inp_past_embd.size() == embd.size()); + for (size_t i = 0; i < inp_past_embd.size(); ++i) { + inp_past_embd[i] += embd[i]; + } } - } - printf("\n"); + printf("\n"); - llama_batch_free(batch_embd); - llama_batch_free(batch_token); + llama_batch_free(batch_embd); + llama_batch_free(batch_token); - // if all codes are 0, then we are done - is_stop = sum_codes == 0; - } + // if all codes are 0, then we are done + is_end_of_turn = sum_codes == 0; + } - // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb); - // for (size_t i = 0; i < inp_past_embd.size(); ++i) { - // printf("%4.4f, ", inp_past_embd[i]); - // if (i == 2) { - // printf("... "); - // i = inp_past_embd.size() - 4; - // } - // } - // printf("\n"); - - if (is_stop) { - // remove last 32 codes since they will be all zeros - generated_codes.resize(generated_codes.size() - 32); - break; + // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb); + // for (size_t i = 0; i < inp_past_embd.size(); ++i) { + // printf("%4.4f, ", inp_past_embd[i]); + // if (i == 2) { + // printf("... 
"); + // i = inp_past_embd.size() - 4; + // } + // } + // printf("\n"); + + if (is_end_of_turn) { + // remove last 32 codes since they will be all zeros + generated_codes.resize(generated_codes.size() - 32); + break; + } } } From d17809999de22b03abb451bd4530a7ccb4ae98ba Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 3 Apr 2025 14:34:39 +0200 Subject: [PATCH 27/31] add audio EOS token --- examples/tts/tts-csm.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index a8a9bd22d955b..937948425cdc4 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -225,6 +225,11 @@ int main(int argc, char ** argv) { n_bb_gen++; t_bb += ggml_time_ms() - t_bb_start; + if (is_end_of_turn) { + // done decoding audio's EOS token + break; + } + auto vocab_dc = llama_model_get_vocab(model_dc); auto logits = llama_get_logits_ith(ctx_bb, is_prompt_processing ? (batch_prompt.n_tokens - 1) : 0); // for (size_t i = 0; i < 10; ++i) { @@ -304,8 +309,13 @@ int main(int argc, char ** argv) { llama_batch_free(batch_embd); llama_batch_free(batch_token); - // if all codes are 0, then we are done + // if all codes are 0, then we are done (got audio EOS token) + // note: we still need to run backbone decode one more time to decode the audio's EOS token is_end_of_turn = sum_codes == 0; + if (is_end_of_turn) { + // remove last 32 codes since they will be all zeros + generated_codes.resize(generated_codes.size() - 32); + } } // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb); @@ -317,12 +327,6 @@ int main(int argc, char ** argv) { // } // } // printf("\n"); - - if (is_end_of_turn) { - // remove last 32 codes since they will be all zeros - generated_codes.resize(generated_codes.size() - 32); - break; - } } } From d1de6cc5ee7ea9c7db3856930c495e1e2ed4418d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 9 Apr 2025 17:32:05 +0200 Subject: [PATCH 28/31] add speaker reference --- 
examples/tts/csm_generate_speaker.py | 79 ++ examples/tts/tts-csm-data.h | 1513 ++++++++++++++++++++++++++ examples/tts/tts-csm.cpp | 182 +++- src/llama-model.cpp | 8 + 4 files changed, 1753 insertions(+), 29 deletions(-) create mode 100644 examples/tts/csm_generate_speaker.py create mode 100644 examples/tts/tts-csm-data.h diff --git a/examples/tts/csm_generate_speaker.py b/examples/tts/csm_generate_speaker.py new file mode 100644 index 0000000000000..0dc6929a23d4c --- /dev/null +++ b/examples/tts/csm_generate_speaker.py @@ -0,0 +1,79 @@ +import argparse +from pathlib import Path +from transformers import MimiModel, AutoFeatureExtractor +from transformers.models.mimi.modeling_mimi import MimiEncoderOutput + +from scipy.io.wavfile import read +from scipy.signal import resample +import numpy as np + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate speaker reference file, used by llama-tts-csm example",) + parser.add_argument( + "--model-path", type=Path, + help="custom Mimi model path (safetensors model). 
If not specified, will use the default model from Hugging Face hub", + ) + parser.add_argument( + "infile", type=Path, + help="the wav input file to use for generating the speaker reference file", + nargs="?", + ) + # parser.add_argument( + # "outfile", type=Path, + # help="the output file, defaults to the input file with .codes suffix", + # nargs="?", + # ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + if args.infile is None: + raise ValueError("Input file is required") + + if not args.infile.exists(): + raise FileNotFoundError(f"Input file {args.infile} not found") + + # if args.outfile is None: + # args.outfile = args.infile.with_suffix(".codes") + + model = MimiModel.from_pretrained(args.model_path or "kyutai/mimi") + feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_path or "kyutai/mimi") + + inp_audio = read(args.infile) + original_sample_rate = inp_audio[0] + audio_data = inp_audio[1] + + # If stereo, get only the first channel + if len(audio_data.shape) > 1 and audio_data.shape[1] >= 2: + audio_data = audio_data[:, 0] + + # resample + target_sample_rate = 24000 + number_of_samples = round(len(audio_data) * float(target_sample_rate) / original_sample_rate) + resampled_audio = resample(audio_data, number_of_samples) + resampled_audio = resampled_audio / max(np.max(np.abs(resampled_audio)), 1e-10) + + # pre-process the inputs + audio_sample = np.array(resampled_audio, dtype=float) + inputs = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt") + print('inputs', inputs["input_values"], inputs["input_values"].shape) + + # encode + encoder_outputs = model.encode(inputs["input_values"]) + assert isinstance(encoder_outputs, MimiEncoderOutput), "encoder_outputs should be of type MimiEncoderOutput" + + # output + flattened_audio_codes = encoder_outputs.audio_codes.transpose(-1, -2).flatten() + for i in range(0, len(flattened_audio_codes), 16): + for code in 
flattened_audio_codes[i:i+16].tolist(): + print(f"{code:<5}", end=",") + print() + + +if __name__ == '__main__': + main() diff --git a/examples/tts/tts-csm-data.h b/examples/tts/tts-csm-data.h new file mode 100644 index 0000000000000..c3c47ca7ac3a2 --- /dev/null +++ b/examples/tts/tts-csm-data.h @@ -0,0 +1,1513 @@ +#pragma once + +#include + +// https://huggingface.co/spaces/sesame/csm-1b/blob/main/prompts/conversational_a.wav +const char * default_speaker_a_text = "[0]like revising for an exam I'd have to try and like keep up the momentum because I'd start really early I'd be like okay I'm gonna start revising now and then like you're revising for ages and then I just like start losing steam I didn't do that for the exam we had recently to be fair that was a more of a last minute scenario but like yeah I'm trying to like yeah I noticed this yesterday that like Mondays I sort of start the day with this not like a panic but like a"; +std::initializer_list default_speaker_a_codes = {}; + +// https://huggingface.co/spaces/sesame/csm-1b/blob/main/prompts/conversational_b.wav +const char * default_speaker_b_text = "[1]like a super Mario level. Like it's very like high detail. And like, once you get into the park, it just like, everything looks like a computer game and they have all these, like, you know, if, if there's like a, you know, like in a Mario game, they will have like a question block. And if you like, you know, punch it, a coin will come out. 
So like everyone, when they come into the park, they get like this little bracelet and then you can go punching question blocks around."; +std::initializer_list default_speaker_b_codes = {}; diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index 937948425cdc4..d47b0e598dd28 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -1,9 +1,12 @@ +#include "ggml.h" #include "llama.h" #include "common.h" #include "log.h" #include "arg.h" #include "mimi-model.h" +#include "tts-csm-data.h" +#include #include #include #include @@ -32,8 +35,14 @@ static void print_usage(int, char ** argv) { LOG("\n"); } +struct speaker_turn { + std::string text; + std::vector audio_embd; // only used for system prompt (speaker reference) processing + size_t n_embd_tokens = 0; +}; + // split text containing "[N]..." into speaker turns -static std::vector get_speaker_turns(const std::string & input) { +static std::vector get_speaker_turns(const std::string & input) { if (input.empty()) { LOG_ERR("Empty input\n"); return {}; @@ -44,19 +53,60 @@ static std::vector get_speaker_turns(const std::string & input) { } std::regex re(R"((\[\d+\][\s\S]*?)(?=\[\d+\]|$))"); std::smatch match; - std::vector turns; + std::vector turns; std::string::const_iterator searchStart(input.cbegin()); while (std::regex_search(searchStart, input.cend(), match, re)) { - std::string turn = match[1].str(); - if (turn.empty()) { + std::string turn_text = match[1].str(); + if (turn_text.empty()) { continue; } + // clean up newline, the model is quite sensitive to this + string_replace_all(turn_text, "\n", " "); + turn_text = string_strip(turn_text); + // add turn + speaker_turn turn; + turn.text = turn_text; turns.push_back(turn); searchStart = match.suffix().first; } return turns; } +static speaker_turn get_ref_speaker_turn(const char * text, std::initializer_list & codes, std::vector & codebook) { + const size_t n_embd = 2048; + const size_t n_codes_per_codebook = 2051; + const size_t 
n_codebooks = 32; + GGML_ASSERT(codebook.size() == n_embd * n_codes_per_codebook * n_codebooks); + GGML_ASSERT(codes.size() % 32 == 0); + + // 1 frame = 32 codes + size_t n_frames = codes.size() / n_codebooks; + speaker_turn turn; + turn.text = text; + turn.audio_embd.reserve((n_frames+1) * n_embd); + turn.n_embd_tokens = n_frames+1; // +1 for EOS frame + + for (size_t i_fr = 0; i_fr <= n_frames; i_fr++) { + std::vector frame_embd_sum(n_embd, 0.0f); + + for (size_t i_cb = 0; i_cb < n_codebooks; i_cb++) { + const size_t code = i_fr == n_frames + ? 0 // insert audio EOS for last pseudo-frame + : codes.begin()[i_fr*n_codebooks + i_cb]; + printf("code %zu: %zu, codebook entry %zu\n", i_cb, code, i_cb*n_codes_per_codebook + code); + float * entry = codebook.data() + i_cb*n_codes_per_codebook*n_embd + code*n_embd; + for (size_t i_embd = 0; i_embd < n_embd; i_embd++) { + frame_embd_sum[i_embd] += entry[i_embd]; + } + } + + turn.audio_embd.insert(turn.audio_embd.end(), frame_embd_sum.begin(), frame_embd_sum.end()); + } + + GGML_ASSERT(turn.audio_embd.size() == (n_frames+1) * n_embd); + return turn; +} + // sampling with custom n_vocab // modified version of llama_sampler_sample() static llama_token sample_token(struct llama_sampler * smpl, const float * logits, int n_vocab) { @@ -80,24 +130,75 @@ static llama_token sample_token(struct llama_sampler * smpl, const float * logit return token; } +struct hook_data { + std::vector embd; + std::vector codebook; +}; + // hook to retrieve the embeddings static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { - std::vector * embd = (std::vector *) user_data; + hook_data * data = (hook_data *) user_data; // output_csm_proj is the embeddings output from backbone // output_audio_embd is the embeddings output from decoder if (t && (strcmp(t->name, "output_csm_proj") == 0 || strcmp(t->name, "output_audio_embd") == 0)) { if (ask) return true; - embd->resize(ggml_nelements(t)); - ggml_backend_tensor_get(t, 
embd->data(), 0, ggml_nbytes(t)); + GGML_ASSERT(t->type == GGML_TYPE_F32); + data->embd.resize(ggml_nelements(t)); + ggml_backend_tensor_get(t, data->embd.data(), 0, ggml_nbytes(t)); // printf("%s tensor size: %lld, %lld\n", t->name, t->ne[0], t->ne[1]); return true; } + if (t && strncmp(t->name, "audio_embd.weight", 18) == 0) { + if (ask) return true; + + printf("%s tensor size: %lld, %lld\n", t->name, t->ne[0], t->ne[1]); + GGML_ASSERT(t->type == GGML_TYPE_F32); + GGML_ASSERT(t->ne[0] == 2048); // backbone embd size + data->codebook.resize(ggml_nelements(t)); + ggml_backend_tensor_get(t, data->codebook.data(), 0, ggml_nbytes(t)); + return true; + } + return false; } +// convenience wrapper around llama_batch to handle memory allocation +struct decode_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + int main(int argc, char ** argv) { common_params params; @@ -127,10 +228,9 @@ int main(int argc, char ** argv) { return 1; } - std::vector embd; + hook_data cb_data; params.cb_eval = ggml_callback; - params.cb_eval_user_data = &embd; - params.warmup = false; + params.cb_eval_user_data = &cb_data; common_params params_decoder(params); // duplicate the params params_decoder.n_ctx = 64; // we never use more than this @@ 
-177,15 +277,22 @@ int main(int argc, char ** argv) { std::vector generated_codes; - auto turns = get_speaker_turns(params.prompt); + std::vector turns; + // speaker reference + turns.push_back(get_ref_speaker_turn(default_speaker_a_text, default_speaker_a_codes, cb_data.codebook)); + turns.push_back(get_ref_speaker_turn(default_speaker_b_text, default_speaker_b_codes, cb_data.codebook)); + + // user input + auto custom_turns = get_speaker_turns(params.prompt); + turns.insert(turns.end(), custom_turns.begin(), custom_turns.end()); - for (const std::string & turn : turns) { + for (speaker_turn & turn : turns) { // tokenize the turn llama_tokens prompt_tokens; { - printf("\n---\nturn: %s\n\n", turn.c_str()); + printf("\n---\n\nturn: %s\n\n", turn.text.c_str()); const llama_vocab * vocab = llama_model_get_vocab(model_bb); - prompt_tokens = common_tokenize(vocab, turn, false, true); + prompt_tokens = common_tokenize(vocab, turn.text, false, true); prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab)); prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab)); @@ -193,21 +300,38 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < prompt_tokens.size(); ++i) { printf("%d, ", prompt_tokens[i]); } - printf("\n"); + printf("\n\n"); common_batch_clear(batch_prompt); for (size_t i = 0; i < prompt_tokens.size(); ++i) { common_batch_add(batch_prompt, prompt_tokens[i], n_past_bb++, { 0 }, false); } batch_prompt.logits[batch_prompt.n_tokens - 1] = true; + + if (llama_decode(ctx_bb, batch_prompt) != 0) { + LOG_ERR("%s: backbone llama_decode(text) failed\n", __func__); + return 1; + } + } + + // optionally process the system prompt (speaker reference) + if (turn.n_embd_tokens) { + decode_embd_batch batch_embd(turn.audio_embd.data(), turn.n_embd_tokens, n_past_bb, 0); + if (llama_decode(ctx_bb, batch_embd.batch) != 0) { + LOG_ERR("%s: backbone llama_decode(embeddings) failed\n", __func__); + return 1; + } + LOG_INF("%s: backbone done decoding %zu audio 
codes\n\n", __func__, turn.n_embd_tokens); + n_past_bb += turn.n_embd_tokens; + continue; // no need to generate the audio } // backbone generation loop bool is_end_of_turn = false; for (int k = 0; k < params.n_predict; ++k) { - bool is_prompt_processing = k == 0; + bool is_first_tok = k == 0; - if (!is_prompt_processing) { + if (!is_first_tok) { // generate the next RVQ semantic token batch_past_embd.n_tokens = 1; batch_past_embd.pos[0] = n_past_bb++; @@ -215,15 +339,15 @@ int main(int argc, char ** argv) { batch_past_embd.n_seq_id[0] = 1; batch_past_embd.logits[0] = true; std::memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float)); - } - int64_t t_bb_start = ggml_time_ms(); - if (llama_decode(ctx_bb, is_prompt_processing ? batch_prompt : batch_past_embd) != 0) { - LOG_ERR("%s: backbone llama_decode() failed\n", __func__); - return 1; + int64_t t_bb_start = ggml_time_ms(); + if (llama_decode(ctx_bb, batch_past_embd) != 0) { + LOG_ERR("%s: backbone llama_decode() failed\n", __func__); + return 1; + } + n_bb_gen++; + t_bb += ggml_time_ms() - t_bb_start; } - n_bb_gen++; - t_bb += ggml_time_ms() - t_bb_start; if (is_end_of_turn) { // done decoding audio's EOS token @@ -231,7 +355,7 @@ int main(int argc, char ** argv) { } auto vocab_dc = llama_model_get_vocab(model_dc); - auto logits = llama_get_logits_ith(ctx_bb, is_prompt_processing ? (batch_prompt.n_tokens - 1) : 0); + auto logits = llama_get_logits_ith(ctx_bb, is_first_tok ? 
(batch_prompt.n_tokens - 1) : 0); // for (size_t i = 0; i < 10; ++i) { // printf("%4.2f, ", logits[i]); // } @@ -251,7 +375,7 @@ int main(int argc, char ** argv) { inp_past_embd = std::vector(inp_past_embd.size(), 0.0f); { llama_kv_self_clear(ctx_dc); - llama_batch batch_embd = llama_batch_init(1, embd.size(), 1); + llama_batch batch_embd = llama_batch_init(1, cb_data.embd.size(), 1); llama_batch batch_token = llama_batch_init(1, 0, 1); // first "token" is the latent embeddings from backbone @@ -261,7 +385,7 @@ int main(int argc, char ** argv) { batch_embd.seq_id[0][0] = 0; batch_embd.n_seq_id[0] = 1; batch_embd.logits[0] = false; - std::memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float)); + std::memcpy(batch_embd.embd, cb_data.embd.data(), cb_data.embd.size() * sizeof(float)); } if (llama_decode(ctx_dc, batch_embd) != 0) { LOG_ERR("%s: decoder llama_decode(embd) failed\n", __func__); @@ -299,9 +423,9 @@ int main(int argc, char ** argv) { } // do progressive hsum of embeddings - GGML_ASSERT(inp_past_embd.size() == embd.size()); + GGML_ASSERT(inp_past_embd.size() == cb_data.embd.size()); for (size_t i = 0; i < inp_past_embd.size(); ++i) { - inp_past_embd[i] += embd[i]; + inp_past_embd[i] += cb_data.embd[i]; } } printf("\n"); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 81d460d55e75c..54aa80ce3dbd2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4639,6 +4639,14 @@ struct llm_build_llama_csm : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); + // hacky way to get the audio embedding from user code (used in prompt processing) + // this will be triggered during warmup + if (is_decoder && n_tokens == 2) { + ggml_tensor * tmp = ggml_cast(ctx0, model.tok_embd, GGML_TYPE_F32); + cb(tmp, "audio_embd.weight", -1); + ggml_build_forward_expand(gf, tmp); + } + ggml_tensor * input_embd = inpL; // inp_pos - contains the positions From 9533fb752cb6e7a5d57436d015be41134ba8ead7 Mon Sep 17 00:00:00 2001 From: Xuan Son 
Nguyen Date: Wed, 23 Apr 2025 14:30:16 +0200 Subject: [PATCH 29/31] fix build_attn --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5c4be5b8c3729..cd549e986c2a9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4783,7 +4783,7 @@ struct llm_build_llama_csm : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1) { From e5bb5606976dd305aea072adc721451d8ab7d00a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 23 Apr 2025 14:44:45 +0200 Subject: [PATCH 30/31] rm print --- examples/tts/tts-csm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tts/tts-csm.cpp b/examples/tts/tts-csm.cpp index d47b0e598dd28..d9a5ef1102d89 100644 --- a/examples/tts/tts-csm.cpp +++ b/examples/tts/tts-csm.cpp @@ -154,7 +154,7 @@ static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) { if (t && strncmp(t->name, "audio_embd.weight", 18) == 0) { if (ask) return true; - printf("%s tensor size: %lld, %lld\n", t->name, t->ne[0], t->ne[1]); + // printf("%s tensor size: %lld, %lld\n", t->name, t->ne[0], t->ne[1]); GGML_ASSERT(t->type == GGML_TYPE_F32); GGML_ASSERT(t->ne[0] == 2048); // backbone embd size data->codebook.resize(ggml_nelements(t)); From c1cd710f592e6f825d8d9e199479ece0c53e2259 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 23 Apr 2025 14:56:17 +0200 Subject: [PATCH 31/31] fix pyright --- examples/tts/csm_generate_speaker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tts/csm_generate_speaker.py b/examples/tts/csm_generate_speaker.py index 0dc6929a23d4c..a06dee6846eac 100644 --- a/examples/tts/csm_generate_speaker.py +++ b/examples/tts/csm_generate_speaker.py @@ -3,6 +3,7 @@ from transformers import MimiModel, AutoFeatureExtractor from 
transformers.models.mimi.modeling_mimi import MimiEncoderOutput +# pyright: reportMissingImports=false from scipy.io.wavfile import read from scipy.signal import resample import numpy as np