From 84ae217bd8c6080ba03902a3a9e8a87624a4e758 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 25 Aug 2025 15:01:34 -0700
Subject: [PATCH] Add Voxtral runner

[ghstack-poisoned]
---
 examples/models/voxtral/CMakeLists.txt     | 102 ++++++++++++
 examples/models/voxtral/multimodal.cpp     | 172 +++++++++++++++++++++
 extension/llm/runner/llm_runner_helper.cpp |  25 +++
 3 files changed, 299 insertions(+)
 create mode 100644 examples/models/voxtral/CMakeLists.txt
 create mode 100644 examples/models/voxtral/multimodal.cpp

diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
new file mode 100644
index 00000000000..60290d29449
--- /dev/null
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -0,0 +1,102 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Simple CMake build system for the voxtral runner.
+#
+cmake_minimum_required(VERSION 3.24)
+project(voxtral)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
+  set(CMAKE_TOOLCHAIN_IOS ON)
+else()
+  set(CMAKE_TOOLCHAIN_IOS OFF)
+endif()
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# Point find_package() at the gflags built in the ExecuTorch build tree
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# Find `executorch` libraries, same as for gflags
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(LINK_LIBS executorch gflags)
+set(link_libraries ${LINK_LIBS})
+set(_srcs multimodal.cpp)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  portable_kernels
+  optimized_kernels
+  quantized_kernels
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+  )
+endif()
+
+# Needed for cpuinfo, which uses the Android-specific log library
+if(ANDROID)
+  list(APPEND link_libraries log)
+endif()
+
+# Add the required ExecuTorch extensions for the multimodal LLM runner
+list(
+  APPEND
+  link_libraries
+  extension_llm_runner
+  extension_module
+  extension_data_loader
+  extension_tensor
+  extension_flat_tensor
+)
+
+# Add tokenizers
+list(APPEND link_libraries tokenizers::tokenizers)
+
+add_executable(voxtral_runner ${_srcs})
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(voxtral_runner)
+  if(NOT APPLE)
+    target_link_options(voxtral_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
+target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
+target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
new file mode 100644
index 00000000000..c46c9e30662
--- /dev/null
+++ b/examples/models/voxtral/multimodal.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+
+#include <gflags/gflags.h>
+
+#include <executorch/extension/llm/runner/audio.h>
+#include <executorch/extension/llm/runner/image.h>
+#include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/multimodal_input.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/log.h>
+
+#if defined(ET_USE_THREADPOOL)
+#include <executorch/extension/threadpool/cpuinfo_utils.h>
+#include <executorch/extension/threadpool/threadpool.h>
+#endif
+
+DEFINE_string(
+    model_path,
+    "multimodal.pte",
+    "Model serialized in flatbuffer format.");
+
+DEFINE_string(tokenizer_path, "tekken.json", "Path to the tokenizer file.");
+
+DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
+
+DEFINE_string(audio_path, "", "Path to input audio file.");
+
+DEFINE_double(
+    temperature,
+    0.8f,
+    "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
+
+DEFINE_int32(
+    cpu_threads,
+    -1,
+    "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
+
+DEFINE_bool(warmup, false, "Whether to run a warmup run.");
+
+namespace {
+
+using ::executorch::extension::llm::Image;
+using ::executorch::extension::llm::make_image_input;
+using ::executorch::extension::llm::make_text_input;
+using ::executorch::extension::llm::MultimodalInput;
+
+} // namespace
+
+int32_t main(int32_t argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  const char* model_path = FLAGS_model_path.c_str();
+
+  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
+  const char* prompt = FLAGS_prompt.c_str();
+  const char* audio_path = FLAGS_audio_path.c_str();
+  float temperature = FLAGS_temperature;
+  int32_t cpu_threads = FLAGS_cpu_threads;
+  bool warmup = FLAGS_warmup;
+
+#if defined(ET_USE_THREADPOOL)
+  uint32_t num_performant_cores = cpu_threads == -1
+      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
+      : static_cast<uint32_t>(cpu_threads);
+  ET_LOG(
+      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
+  if (num_performant_cores > 0) {
+    ::executorch::extension::threadpool::get_threadpool()
+        ->_unsafe_reset_threadpool(num_performant_cores);
+  }
+#endif
+
+  // Load tokenizer
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
+  if (tokenizer == nullptr) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
+    return 1;
+  }
+
+  // Create multimodal runner
+  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
+      ::executorch::extension::llm::create_multimodal_runner(
+          model_path, std::move(tokenizer));
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create multimodal runner");
+    return 1;
+  }
+
+  // Load runner
+  auto load_error = runner->load();
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to load multimodal runner");
+    return 1;
+  }
+
+  // Prepare inputs
+  std::vector<MultimodalInput> inputs;
+
+  // 1. Add the BOS-related text input and the audio-modality start token.
+  inputs.emplace_back(make_text_input("[INST][BEGIN_AUDIO]"));
+
+  // 2. Add audio input
+  // Uses preprocessed audio features, saved from Python with:
+  //   with open("tensor.bin", "wb") as f:
+  //     f.write(t.numpy().tobytes())
+  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+  int32_t n_bins = 128;
+  int32_t n_frames = 3000;
+  std::size_t n_floats =
+      f.tellg() / sizeof(float); // Number of floats in the audio file.
+  f.seekg(0, std::ios::beg);
+  int32_t batch_size = static_cast<int32_t>(std::ceil(
+      static_cast<double>(n_floats) /
+      (n_bins * n_frames))); // Batch in increments of n_frames, rounding up.
+  std::vector<float> audio_data(batch_size * n_bins * n_frames);
+  f.read(
+      reinterpret_cast<char*>(audio_data.data()),
+      audio_data.size() * sizeof(float));
+
+  ET_LOG(Info, "audio_data len = %zu", audio_data.size());
+
+  auto audio = std::make_unique<::executorch::extension::llm::Audio>();
+  audio->batch_size = batch_size;
+  audio->n_bins = n_bins;
+  audio->n_frames = n_frames;
+  audio->data.resize(audio_data.size() * sizeof(float));
+  std::memcpy(
+      audio->data.data(), audio_data.data(), audio_data.size() * sizeof(float));
+  inputs.emplace_back(
+      ::executorch::extension::llm::make_audio_input(std::move(*audio)));
+
+  // 3. Add text input
+  inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));
+
+  ::executorch::extension::llm::GenerationConfig config;
+  config.max_new_tokens = 100;
+  config.temperature = temperature;
+
+  // Run warmup if requested
+  if (warmup) {
+    ET_LOG(Info, "Running warmup...");
+    auto warmup_error = runner->generate(inputs, config);
+    if (warmup_error != ::executorch::runtime::Error::Ok) {
+      ET_LOG(Error, "Failed to run warmup");
+      return 1;
+    }
+    runner->reset();
+  }
+
+  // Generate
+  ET_LOG(Info, "Starting generation...");
+  auto error = runner->generate(inputs, config);
+  if (error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to generate with multimodal runner");
+    return 1;
+  }
+
+  printf("\n");
+  return 0;
+}
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index 2e17e518c4a..d8cfd4c84f6 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -21,6 +21,7 @@
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
 #include <pytorch/tokenizers/sentencepiece.h>
+#include <pytorch/tokenizers/tekken.h>
 #include <pytorch/tokenizers/tiktoken.h>
 
 namespace executorch::extension::llm {
@@ -35,6 +36,18 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
     size_t bos_token_index,
     size_t eos_token_index) {
   runtime::runtime_init();
+  auto tekken_tokenizer = std::make_unique<tokenizers::Tekken>();
+  // Only try the tekken tokenizer for files named tekken.json, so it does not
+  // accidentally load a HuggingFace tokenizer, which is also a .json file.
+  const std::string tekken_name = "tekken.json";
+  if (tokenizer_path.size() >= tekken_name.size() &&
+      tokenizer_path.rfind(tekken_name) ==
+          tokenizer_path.size() - tekken_name.size()) {
+    if (tekken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
+      ET_LOG(Info, "Loaded tekken tokenizer");
+      return tekken_tokenizer;
+    }
+  }
   auto json_tokenizer = std::make_unique<tokenizers::HFTokenizer>();
   if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
     ET_LOG(Info, "Loaded json tokenizer");
@@ -109,6 +122,18 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
     }
     ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
   }
+
+  // If the kMaxContextLen method is not found but kMaxSeqLen is
+  // available, set kMaxContextLen to the value of kMaxSeqLen.
+  if (!method_names.count(llm::kMaxContextLen) &&
+      method_names.count(llm::kMaxSeqLen)) {
+    metadata[llm::kMaxContextLen] = metadata[llm::kMaxSeqLen];
+    ET_LOG(
+        Info,
+        "Setting kMaxContextLen to kMaxSeqLen value: %" PRId64,
+        metadata[llm::kMaxContextLen]);
+  }
+
   // Set tokenizer-related metadata
   metadata[llm::kBosId] = tokenizer->bos_tok();
   metadata[llm::kVocabSize] = tokenizer->vocab_size();
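
--
Usage note: voxtral_runner does not decode audio itself; --audio_path must point
at a raw float32 dump of preprocessed log-mel features (n_bins = 128, n_frames =
3000 per batch element), matching the comment in multimodal.cpp above. A minimal
sketch of producing such a file, assuming the features have already been computed
as a float32 torch.Tensor of shape [batch, 128, 3000] (the torch.randn call below
is only a stand-in for a real feature-extraction pipeline):

    import torch

    # Stand-in for real preprocessing: any pipeline that yields float32
    # log-mel features of shape [batch, n_bins=128, n_frames=3000] works.
    features = torch.randn(1, 128, 3000, dtype=torch.float32)

    # Dump the raw buffer exactly as voxtral_runner reads it back:
    # batch_size * 128 * 3000 native-endian floats, row-major.
    with open("tensor.bin", "wb") as f:
        f.write(features.numpy().tobytes())

The runner infers batch_size from the file size, rounding up to a whole number of
128 x 3000 blocks, and passes the buffer to the model as an Audio input.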