Skip to content

Commit 844132d

Browse files
committed
Include audio preprocessing for raw audio tensor
ghstack-source-id: 1ce35e1 Pull Request resolved: #13752
1 parent 70560f5 commit 844132d

File tree

1 file changed

+239
-27
lines changed

1 file changed

+239
-27
lines changed

examples/models/voxtral/multimodal.cpp

Lines changed: 239 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212

1313
#include <gflags/gflags.h>
1414

15+
#include <executorch/extension/module/module.h>
16+
#include <executorch/extension/tensor/tensor_ptr_maker.h>
17+
#include <executorch/runtime/core/evalue.h>
18+
1519
#include <executorch/extension/llm/runner/audio.h>
1620
#include <executorch/extension/llm/runner/image.h>
1721
#include <executorch/extension/llm/runner/llm_runner_helper.h>
@@ -36,6 +40,11 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
3640

3741
DEFINE_string(audio_path, "", "Path to input audio file.");
3842

43+
DEFINE_string(
44+
processor_path,
45+
"",
46+
"Path to processor .pte file for raw audio processing.");
47+
3948
DEFINE_double(
4049
temperature,
4150
0.8f,
@@ -50,16 +59,48 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
5059

5160
namespace {
5261

62+
using ::executorch::extension::from_blob;
63+
using ::executorch::extension::Module;
5364
using ::executorch::extension::llm::Image;
5465
using ::executorch::extension::llm::make_image_input;
5566
using ::executorch::extension::llm::make_text_input;
5667
using ::executorch::extension::llm::MultimodalInput;
68+
using ::executorch::runtime::EValue;
5769

5870
/**
 * @brief Reports whether a string terminates with the given suffix.
 *
 * @param str String to inspect
 * @param suffix Candidate trailing text; an empty suffix always matches
 * @return true if the last suffix.size() characters of str equal suffix
 */
bool ends_with(const std::string& str, const std::string& suffix) {
  const std::size_t str_len = str.size();
  const std::size_t suffix_len = suffix.size();

  // A suffix longer than the string can never match; bailing out here also
  // keeps the offset below from underflowing.
  if (suffix_len > str_len) {
    return false;
  }

  const std::size_t offset = str_len - suffix_len;
  return str.compare(offset, suffix_len, suffix) == 0;
}
6274

75+
/**
76+
* @brief Loads float data from a binary file
77+
*
78+
* @param audio_path Path to the binary audio file (.bin)
79+
* @return Vector of float data loaded from the file
80+
* @throws std::runtime_error if file loading fails
81+
*/
82+
std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
83+
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
84+
if (!f.is_open()) {
85+
ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
86+
throw std::runtime_error("Failed to open audio file");
87+
}
88+
89+
std::size_t n_floats =
90+
f.tellg() / sizeof(float); // Number of floats in the audio file
91+
f.seekg(0, std::ios::beg);
92+
93+
std::vector<float> audio_data(n_floats);
94+
f.read(
95+
reinterpret_cast<char*>(audio_data.data()),
96+
audio_data.size() * sizeof(float));
97+
f.close();
98+
99+
ET_LOG(
100+
Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
101+
return audio_data;
102+
}
103+
63104
/**
64105
* @brief Loads preprocessed audio data from a binary file
65106
*
@@ -70,22 +111,83 @@ bool ends_with(const std::string& str, const std::string& suffix) {
70111
* f.write(t.numpy().tobytes())
71112
*
72113
* @param audio_path Path to the binary audio file (.bin)
114+
* @param processor_path Path to the processor .pte file to get metadata
73115
* @return MultimodalInput containing the loaded audio data
74116
*/
75-
MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
76-
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
77-
int32_t n_bins = 128;
78-
int32_t n_frames = 3000;
79-
std::size_t n_floats =
80-
f.tellg() / sizeof(float); // Number of floats in the audio file.
81-
f.seekg(0, std::ios::beg);
117+
MultimodalInput loadPreprocessedAudio(
118+
const std::string& audio_path,
119+
const std::string& processor_path = "") {
120+
std::vector<float> audio_data = loadBinaryFloatData(audio_path);
121+
122+
int32_t n_bins, n_frames;
123+
124+
if (!processor_path.empty()) {
125+
// Load processor module to get metadata
126+
std::unique_ptr<Module> processor_module;
127+
try {
128+
processor_module =
129+
std::make_unique<Module>(processor_path, Module::LoadMode::File);
130+
auto load_error = processor_module->load();
131+
if (load_error != ::executorch::runtime::Error::Ok) {
132+
ET_LOG(
133+
Error,
134+
"Failed to load processor module from: %s",
135+
processor_path.c_str());
136+
throw std::runtime_error("Failed to load processor module");
137+
}
138+
} catch (const std::exception& e) {
139+
ET_LOG(Error, "Exception while loading processor module: %s", e.what());
140+
throw std::runtime_error("Exception while loading processor module");
141+
}
142+
143+
// Get n_bins by running "feature_size" method
144+
auto feature_size_result = processor_module->execute("feature_size");
145+
if (!feature_size_result.ok()) {
146+
ET_LOG(
147+
Error, "Failed to execute 'feature_size' method on processor module");
148+
throw std::runtime_error(
149+
"Failed to execute 'feature_size' method on processor module");
150+
}
151+
auto feature_size_outputs = feature_size_result.get();
152+
if (feature_size_outputs.empty()) {
153+
ET_LOG(Error, "'feature_size' method returned no outputs");
154+
throw std::runtime_error("'feature_size' method returned no outputs");
155+
}
156+
n_bins = static_cast<int32_t>(feature_size_outputs[0].toInt());
157+
158+
// Get n_frames by running "nb_max_frames" method
159+
auto nb_max_frames_result = processor_module->execute("nb_max_frames");
160+
if (!nb_max_frames_result.ok()) {
161+
ET_LOG(
162+
Error,
163+
"Failed to execute 'nb_max_frames' method on processor module");
164+
throw std::runtime_error(
165+
"Failed to execute 'nb_max_frames' method on processor module");
166+
}
167+
auto nb_max_frames_outputs = nb_max_frames_result.get();
168+
if (nb_max_frames_outputs.empty()) {
169+
ET_LOG(Error, "'nb_max_frames' method returned no outputs");
170+
throw std::runtime_error("'nb_max_frames' method returned no outputs");
171+
}
172+
n_frames = static_cast<int32_t>(nb_max_frames_outputs[0].toInt());
173+
174+
ET_LOG(
175+
Info,
176+
"Got values from processor methods: n_bins=%d, n_frames=%d",
177+
n_bins,
178+
n_frames);
179+
} else {
180+
ET_LOG(
181+
Error,
182+
"Processor path is required to get feature_size and nb_max_frames");
183+
throw std::runtime_error(
184+
"Processor path is required to get feature_size and nb_max_frames");
185+
}
186+
187+
std::size_t n_floats = audio_data.size();
82188
int32_t batch_size = ceil(
83189
n_floats /
84190
(n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
85-
std::vector<float> audio_data(batch_size * n_bins * n_frames);
86-
f.read(
87-
reinterpret_cast<char*>(audio_data.data()),
88-
audio_data.size() * sizeof(float));
89191

90192
ET_LOG(Info, "audio_data len = %d", audio_data.size());
91193

@@ -100,29 +202,138 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
100202
}
101203

102204
/**
103-
* @brief Processes audio files for multimodal input
205+
* @brief Loads a .bin file into a tensor and processes it using a .pte
206+
* processor
104207
*
105-
* Dispatches audio file processing based on file extension:
106-
* - .bin files: Loads preprocessed mel spectrogram features directly
107-
* - .wav/.mp3 files: Currently unsupported, throws runtime_error
208+
* This function loads raw audio data from a .bin file (similar to
209+
* loadPreprocessedAudio), creates a tensor from it, and then passes it through
210+
* a processor module loaded from a .pte file to generate processed audio
211+
* features.
212+
*
213+
* @param audio_path Path to the .bin audio file
214+
* @param processor_path Path to the .pte processor file
215+
* @return MultimodalInput containing the processed audio data
216+
* @throws std::runtime_error if file loading or processing fails
217+
*/
218+
MultimodalInput processRawAudioFile(
219+
const std::string& audio_path,
220+
const std::string& processor_path) {
221+
if (processor_path.empty()) {
222+
ET_LOG(Error, "Processor path is required for raw audio processing");
223+
throw std::runtime_error(
224+
"Processor path is required for raw audio processing");
225+
}
226+
227+
// Load the audio processor .pte.
228+
std::unique_ptr<Module> processor_module;
229+
try {
230+
processor_module =
231+
std::make_unique<Module>(processor_path, Module::LoadMode::File);
232+
auto load_error = processor_module->load();
233+
if (load_error != ::executorch::runtime::Error::Ok) {
234+
ET_LOG(
235+
Error,
236+
"Failed to load processor module from: %s",
237+
processor_path.c_str());
238+
throw std::runtime_error("Failed to load processor module");
239+
}
240+
} catch (const std::exception& e) {
241+
ET_LOG(Error, "Exception while loading processor module: %s", e.what());
242+
throw std::runtime_error("Exception while loading processor module");
243+
}
244+
245+
// Load the audio data from file.
246+
std::vector<float> audio_data = loadBinaryFloatData(audio_path);
247+
248+
// Execute the processor
249+
std::vector<executorch::aten::SizesType> tensor_shape = {
250+
static_cast<executorch::aten::SizesType>(audio_data.size())};
251+
auto input_tensor = from_blob(
252+
audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);
253+
254+
ET_LOG(Info, "Processing audio through processor module...");
255+
auto result = processor_module->execute("forward", input_tensor);
256+
if (!result.ok()) {
257+
ET_LOG(Error, "Failed to execute processor's forward method");
258+
throw std::runtime_error("Failed to execute processor forward method");
259+
}
260+
261+
auto outputs = result.get();
262+
if (outputs.empty()) {
263+
ET_LOG(Error, "Processor returned no outputs");
264+
throw std::runtime_error("Processor returned no outputs");
265+
}
266+
267+
// Extract processed audio features
268+
const auto& processed_tensor = outputs[0].toTensor();
269+
const float* processed_data = processed_tensor.const_data_ptr<float>();
270+
const auto& sizes = processed_tensor.sizes();
271+
272+
ET_LOG(
273+
Info,
274+
"Processed audio tensor shape: [%d, %d, %d]",
275+
static_cast<int>(sizes[0]),
276+
static_cast<int>(sizes[1]),
277+
static_cast<int>(sizes[2]));
278+
279+
// Create Audio multimodal input from processed features
280+
auto processed_audio =
281+
std::make_unique<::executorch::extension::llm::Audio>();
282+
processed_audio->batch_size = static_cast<int32_t>(
283+
sizes[0]); // Note: batching for s > 30 doesn't work yet.
284+
processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
285+
processed_audio->n_frames = static_cast<int32_t>(sizes[2]);
286+
287+
size_t total_elements = processed_audio->batch_size *
288+
processed_audio->n_bins * processed_audio->n_frames;
289+
processed_audio->data.resize(total_elements * sizeof(float));
290+
std::memcpy(
291+
processed_audio->data.data(),
292+
processed_data,
293+
total_elements * sizeof(float));
294+
295+
ET_LOG(
296+
Info,
297+
"Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
298+
processed_audio->batch_size,
299+
processed_audio->n_bins,
300+
processed_audio->n_frames);
301+
302+
return ::executorch::extension::llm::make_audio_input(
303+
std::move(*processed_audio));
304+
}
305+
306+
/**
307+
* @brief Processes audio files for multimodal input
108308
*
109-
* This function provides a interface for different audio input formats
110-
* and can be extended to support raw audio processing in the future.
309+
* Dispatches audio file processing based on file extension and processor
310+
* availability:
311+
* - .bin files with processor: Loads raw audio from .bin and processes through
312+
* processor
313+
* - .bin files without processor: Loads preprocessed mel spectrogram features
314+
* directly
111315
*
112-
* @param audio_path Path to the audio file
316+
* @param audio_path Path to the audio file (.bin)
317+
* @param processor_path Path to the processor .pte file (optional)
113318
* @return MultimodalInput containing the processed audio data
114319
* @throws std::runtime_error if file format is unsupported or processing fails
115320
*/
116-
MultimodalInput processAudioFile(const std::string& audio_path) {
321+
MultimodalInput processAudioFile(
322+
const std::string& audio_path,
323+
const std::string& processor_path = "") {
117324
if (ends_with(audio_path, ".bin")) {
118-
// Current behavior - load preprocessed audio stored as a binary file.
119-
return loadPreprocessedAudio(audio_path);
120-
} else if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
121-
// New: Process raw audio files - unsupported for now
122-
ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
123-
throw std::runtime_error("Raw audio file processing not supported");
325+
if (!processor_path.empty()) {
326+
// Process raw audio from .bin file through the processor
327+
return processRawAudioFile(audio_path, processor_path);
328+
} else {
329+
// Load preprocessed audio stored as a binary file (existing behavior)
330+
return loadPreprocessedAudio(audio_path, processor_path);
331+
}
124332
} else {
125-
ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
333+
ET_LOG(
334+
Error,
335+
"Unsupported audio file format: %s (only .bin files are supported)",
336+
audio_path.c_str());
126337
throw std::runtime_error("Unsupported audio file format");
127338
}
128339
}
@@ -137,6 +348,7 @@ int32_t main(int32_t argc, char** argv) {
137348
const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
138349
const char* prompt = FLAGS_prompt.c_str();
139350
const char* audio_path = FLAGS_audio_path.c_str();
351+
const char* processor_path = FLAGS_processor_path.c_str();
140352
float temperature = FLAGS_temperature;
141353
int32_t cpu_threads = FLAGS_cpu_threads;
142354
bool warmup = FLAGS_warmup;
@@ -184,7 +396,7 @@ int32_t main(int32_t argc, char** argv) {
184396
inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));
185397

186398
// 2. Add audio input
187-
inputs.emplace_back(processAudioFile(audio_path));
399+
inputs.emplace_back(processAudioFile(audio_path, processor_path));
188400

189401
// 3. Add text input (the actual user-submitted prompt)
190402
inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));

0 commit comments

Comments
 (0)