Commit a53b396

Add Voxtral runner

ghstack-source-id: 0fbf7c4
Pull Request resolved: #13663

1 parent c6883b1 commit a53b396

3 files changed: +296 −0 lines changed

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# Simple CMake build system for the voxtral runner.
#
cmake_minimum_required(VERSION 3.24)
project(voxtral)

set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
  set(CMAKE_TOOLCHAIN_IOS ON)
else()
  set(CMAKE_TOOLCHAIN_IOS OFF)
endif()

# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# Point find_package at the gflags config in the ExecuTorch build tree.
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

# Find the `executorch` libraries, same as for gflags.
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)

set(LINK_LIBS executorch gflags)
set(link_libraries ${LINK_LIBS})
set(_srcs multimodal.cpp)

list(
  APPEND
  link_libraries
  optimized_native_cpu_ops_lib
  quantized_ops_lib
  custom_ops
  cpublas
  eigen_blas
)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
executorch_target_link_options_shared_lib(quantized_ops_lib)
executorch_target_link_options_shared_lib(custom_ops)

# XNNPACK
if(TARGET xnnpack_backend)
  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
  if(TARGET kleidiai)
    list(APPEND xnnpack_backend_libs kleidiai)
  endif()
  list(APPEND link_libraries ${xnnpack_backend_libs})
  executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# The LLM runner extension must already be built into ExecuTorch.
if(NOT TARGET extension_llm_runner)
  message(
    FATAL_ERROR
      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
  )
endif()

# Needed for cpuinfo, which uses the Android-specific log library.
if(ANDROID)
  list(APPEND link_libraries log)
endif()

# Add the required ExecuTorch extensions for the multimodal LLM runner.
list(
  APPEND
  link_libraries
  extension_llm_runner
  extension_module
  extension_data_loader
  extension_tensor
  extension_flat_tensor
)

# Add tokenizers.
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(voxtral_runner ${_srcs})
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_link_options_gc_sections(voxtral_runner)
  if(NOT APPLE)
    target_link_options(voxtral_runner PRIVATE "LINKER:-s")
  endif()
endif()

target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})
Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <vector>

#include <gflags/gflags.h>

#include <executorch/extension/llm/runner/audio.h>
#include <executorch/extension/llm/runner/image.h>
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_input.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/log.h>

#if defined(ET_USE_THREADPOOL)
#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>
#endif

DEFINE_string(
    model_path,
    "multimodal.pte",
    "Model serialized in flatbuffer format.");

DEFINE_string(tokenizer_path, "tekken.json", "Path to the tokenizer file.");

DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");

DEFINE_string(audio_path, "", "Path to input audio file.");

DEFINE_double(
    temperature,
    0.8f,
    "Temperature; default is 0.8f. 0 = greedy argmax sampling (deterministic); lower temperature = more deterministic.");

DEFINE_int32(
    cpu_threads,
    -1,
    "Number of CPU threads for inference. Defaults to -1, which uses a heuristic to pick the number of performant cores for the device.");

DEFINE_bool(warmup, false, "Whether to run a warmup run.");

namespace {

using ::executorch::extension::llm::Image;
using ::executorch::extension::llm::make_image_input;
using ::executorch::extension::llm::make_text_input;
using ::executorch::extension::llm::MultimodalInput;

} // namespace

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  const char* model_path = FLAGS_model_path.c_str();
  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
  const char* prompt = FLAGS_prompt.c_str();
  const char* audio_path = FLAGS_audio_path.c_str();
  float temperature = FLAGS_temperature;
  int32_t cpu_threads = FLAGS_cpu_threads;
  bool warmup = FLAGS_warmup;

#if defined(ET_USE_THREADPOOL)
  uint32_t num_performant_cores = cpu_threads == -1
      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
      : static_cast<uint32_t>(cpu_threads);
  ET_LOG(
      Info, "Resetting threadpool with num threads = %u", num_performant_cores);
  if (num_performant_cores > 0) {
    ::executorch::extension::threadpool::get_threadpool()
        ->_unsafe_reset_threadpool(num_performant_cores);
  }
#endif

  // Load tokenizer.
  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
  if (tokenizer == nullptr) {
    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
    return 1;
  }

  // Create multimodal runner.
  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
      ::executorch::extension::llm::create_multimodal_runner(
          model_path, std::move(tokenizer));
  if (runner == nullptr) {
    ET_LOG(Error, "Failed to create multimodal runner");
    return 1;
  }

  // Load runner.
  auto load_error = runner->load();
  if (load_error != ::executorch::runtime::Error::Ok) {
    ET_LOG(Error, "Failed to load multimodal runner");
    return 1;
  }

  // Prepare inputs.
  std::vector<MultimodalInput> inputs;

  // 1. Add the BOS-related text input and the modality start token.
  inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));

  // 2. Add audio input. The input is preprocessed audio features, saved
  // from Python with:
  //   with open("tensor.bin", "wb") as f:
  //       f.write(t.numpy().tobytes())
  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
  if (!f) {
    ET_LOG(Error, "Failed to open audio file: %s", audio_path);
    return 1;
  }
  const int32_t n_bins = 128;
  const int32_t n_frames = 3000;
  std::size_t n_floats =
      f.tellg() / sizeof(float); // Number of floats in the audio file.
  f.seekg(0, std::ios::beg);
  // Batch in increments of n_frames, rounding up. Cast to double before
  // dividing: plain integer division would truncate before ceil() runs.
  int32_t batch_size = static_cast<int32_t>(std::ceil(
      static_cast<double>(n_floats) /
      (static_cast<double>(n_bins) * n_frames)));
  std::vector<float> audio_data(
      static_cast<std::size_t>(batch_size) * n_bins * n_frames);
  f.read(
      reinterpret_cast<char*>(audio_data.data()),
      audio_data.size() * sizeof(float));

  ET_LOG(Info, "audio_data len = %zu", audio_data.size());

  auto audio = std::make_unique<::executorch::extension::llm::Audio>();
  audio->batch_size = batch_size;
  audio->n_bins = n_bins;
  audio->n_frames = n_frames;
  audio->data.resize(audio_data.size() * sizeof(float));
  std::memcpy(
      audio->data.data(), audio_data.data(), audio_data.size() * sizeof(float));
  inputs.emplace_back(
      ::executorch::extension::llm::make_audio_input(std::move(*audio)));

  // 3. Add the text prompt and close the instruction.
  inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));

  ::executorch::extension::llm::GenerationConfig config;
  config.max_new_tokens = 100;
  config.temperature = temperature;

  // Run warmup if requested.
  if (warmup) {
    ET_LOG(Info, "Running warmup...");
    auto warmup_error = runner->generate(inputs, config);
    if (warmup_error != ::executorch::runtime::Error::Ok) {
      ET_LOG(Error, "Failed to run warmup");
      return 1;
    }
    runner->reset();
  }

  // Generate.
  ET_LOG(Info, "Starting generation...");
  auto error = runner->generate(inputs, config);
  if (error != ::executorch::runtime::Error::Ok) {
    ET_LOG(Error, "Failed to generate with multimodal runner");
    return 1;
  }

  printf("\n");
  return 0;
}
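
For quick smoke-testing of the audio path, here is a minimal sketch that writes a synthetic feature file in the raw float32 layout the runner reads above. The all-zero contents and the single 128×3000 chunk are illustrative assumptions, not a real preprocessor:

// Hypothetical helper: emit one all-zero chunk in the layout multimodal.cpp
// expects (raw float32, n_bins x n_frames per batch). Illustration only.
#include <cstddef>
#include <fstream>
#include <vector>

int main() {
  const std::size_t n_bins = 128; // mel bins the runner assumes
  const std::size_t n_frames = 3000; // frames per chunk the runner assumes
  std::vector<float> features(n_bins * n_frames, 0.0f); // one zeroed chunk
  std::ofstream out("tensor.bin", std::ios::binary);
  out.write(
      reinterpret_cast<const char*>(features.data()),
      features.size() * sizeof(float));
  return 0;
}

Passing the resulting file via --audio_path should exercise the batching and Audio-packing code without a real mel-spectrogram pipeline.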

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 25 additions & 0 deletions
@@ -21,6 +21,7 @@
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
 #include <pytorch/tokenizers/sentencepiece.h>
+#include <pytorch/tokenizers/tekken.h>
 #include <pytorch/tokenizers/tiktoken.h>

 namespace executorch::extension::llm {
@@ -35,6 +36,18 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
     size_t bos_token_index,
     size_t eos_token_index) {
   runtime::runtime_init();
+  auto tekken_tokenizer = std::make_unique<tokenizers::Tekken>();
+  // Only try the Tekken loader for paths ending in "tekken.json"; it could
+  // otherwise accidentally load a HuggingFace tokenizer, which is also .json.
+  const std::string tekken_name = "tekken.json";
+  if (tokenizer_path.size() >= tekken_name.size() &&
+      tokenizer_path.rfind(tekken_name) ==
+          tokenizer_path.size() - tekken_name.size()) {
+    if (tekken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
+      ET_LOG(Info, "Loaded tekken tokenizer");
+      return tekken_tokenizer;
+    }
+  }
   auto json_tokenizer = std::make_unique<tokenizers::HFTokenizer>();
   if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
     ET_LOG(Info, "Loaded json tokenizer");
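
The suffix test above is an ends-with check spelled with rfind: the last occurrence of "tekken.json" must sit exactly at the end of the path. A standalone sketch of the same logic (the ends_with helper and the example paths are illustrative, not part of the change):

#include <cassert>
#include <string>

// Returns true when s ends with suffix; mirrors the rfind-based check above.
static bool ends_with(const std::string& s, const std::string& suffix) {
  return s.size() >= suffix.size() &&
      s.rfind(suffix) == s.size() - suffix.size();
}

int main() {
  assert(ends_with("/models/tekken.json", "tekken.json"));
  assert(!ends_with("/models/tokenizer.json", "tekken.json")); // HF-style json
  return 0;
}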
@@ -109,6 +122,18 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
     }
     ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
   }
+
+  // If the kMaxContextLen method is not found but kMaxSeqLen is
+  // available, set kMaxContextLen to the value of kMaxSeqLen.
+  if (!method_names.count(llm::kMaxContextLen) &&
+      method_names.count(llm::kMaxSeqLen)) {
+    metadata[llm::kMaxContextLen] = metadata[llm::kMaxSeqLen];
+    ET_LOG(
+        Info,
+        "Setting kMaxContextLen to kMaxSeqLen value: %" PRId64,
+        metadata[llm::kMaxContextLen]);
+  }
+
   // Set tokenizer-related metadata
   metadata[llm::kBosId] = tokenizer->bos_tok();
   metadata[llm::kVocabSize] = tokenizer->vocab_size();
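
This second hunk is a plain derive-a-missing-key fallback on the metadata map. A tiny self-contained sketch of the same pattern (the string keys and the 2048 value are illustrative stand-ins for kMaxContextLen/kMaxSeqLen):

#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

int main() {
  // Suppose the model exported max_seq_len but not max_context_len.
  std::unordered_map<std::string, int64_t> metadata{{"max_seq_len", 2048}};
  if (!metadata.count("max_context_len") && metadata.count("max_seq_len")) {
    metadata["max_context_len"] = metadata["max_seq_len"]; // reuse the value
  }
  std::printf(
      "max_context_len = %lld\n",
      static_cast<long long>(metadata["max_context_len"]));
  return 0;
}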
