1212
1313#include < gflags/gflags.h>
1414
15+ #include < executorch/extension/module/module.h>
16+ #include < executorch/extension/tensor/tensor_ptr_maker.h>
17+ #include < executorch/runtime/core/evalue.h>
18+
1519#include < executorch/extension/llm/runner/audio.h>
1620#include < executorch/extension/llm/runner/image.h>
1721#include < executorch/extension/llm/runner/llm_runner_helper.h>
@@ -36,6 +40,11 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
3640
3741DEFINE_string (audio_path, " " , " Path to input audio file." );
3842
43+ DEFINE_string (
44+ processor_path,
45+ " " ,
46+ " Path to processor .pte file for raw audio processing." );
47+
3948DEFINE_double (
4049 temperature,
4150 0 .8f ,
@@ -50,16 +59,48 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
5059
5160namespace {
5261
62+ using ::executorch::extension::from_blob;
63+ using ::executorch::extension::Module;
5364using ::executorch::extension::llm::Image;
5465using ::executorch::extension::llm::make_image_input;
5566using ::executorch::extension::llm::make_text_input;
5667using ::executorch::extension::llm::MultimodalInput;
68+ using ::executorch::runtime::EValue;
5769
/**
 * @brief Tests whether a string terminates with a given suffix.
 *
 * An empty suffix matches every string, including the empty string.
 *
 * @param str String to inspect
 * @param suffix Candidate trailing substring
 * @return true if the last suffix.size() characters of str equal suffix
 */
bool ends_with(const std::string& str, const std::string& suffix) {
  const std::size_t suffix_len = suffix.size();
  if (str.size() < suffix_len) {
    // A suffix longer than the string can never match.
    return false;
  }
  const std::size_t tail_start = str.size() - suffix_len;
  return str.compare(tail_start, suffix_len, suffix) == 0;
}
6274
75+ /* *
76+ * @brief Loads float data from a binary file
77+ *
78+ * @param audio_path Path to the binary audio file (.bin)
79+ * @return Vector of float data loaded from the file
80+ * @throws std::runtime_error if file loading fails
81+ */
82+ std::vector<float > loadBinaryFloatData (const std::string& audio_path) {
83+ std::ifstream f (audio_path, std::ios::binary | std::ios::ate);
84+ if (!f.is_open ()) {
85+ ET_LOG (Error, " Failed to open audio file: %s" , audio_path.c_str ());
86+ throw std::runtime_error (" Failed to open audio file" );
87+ }
88+
89+ std::size_t n_floats =
90+ f.tellg () / sizeof (float ); // Number of floats in the audio file
91+ f.seekg (0 , std::ios::beg);
92+
93+ std::vector<float > audio_data (n_floats);
94+ f.read (
95+ reinterpret_cast <char *>(audio_data.data ()),
96+ audio_data.size () * sizeof (float ));
97+ f.close ();
98+
99+ ET_LOG (
100+ Info, " Loaded .bin file: %s, %zu floats" , audio_path.c_str (), n_floats);
101+ return audio_data;
102+ }
103+
63104/* *
64105 * @brief Loads preprocessed audio data from a binary file
65106 *
@@ -70,22 +111,83 @@ bool ends_with(const std::string& str, const std::string& suffix) {
70111 * f.write(t.numpy().tobytes())
71112 *
72113 * @param audio_path Path to the binary audio file (.bin)
114+ * @param processor_path Path to the processor .pte file to get metadata
73115 * @return MultimodalInput containing the loaded audio data
74116 */
75- MultimodalInput loadPreprocessedAudio (const std::string& audio_path) {
76- std::ifstream f (audio_path, std::ios::binary | std::ios::ate);
77- int32_t n_bins = 128 ;
78- int32_t n_frames = 3000 ;
79- std::size_t n_floats =
80- f.tellg () / sizeof (float ); // Number of floats in the audio file.
81- f.seekg (0 , std::ios::beg);
117+ MultimodalInput loadPreprocessedAudio (
118+ const std::string& audio_path,
119+ const std::string& processor_path = " " ) {
120+ std::vector<float > audio_data = loadBinaryFloatData (audio_path);
121+
122+ int32_t n_bins, n_frames;
123+
124+ if (!processor_path.empty ()) {
125+ // Load processor module to get metadata
126+ std::unique_ptr<Module> processor_module;
127+ try {
128+ processor_module =
129+ std::make_unique<Module>(processor_path, Module::LoadMode::File);
130+ auto load_error = processor_module->load ();
131+ if (load_error != ::executorch::runtime::Error::Ok) {
132+ ET_LOG (
133+ Error,
134+ " Failed to load processor module from: %s" ,
135+ processor_path.c_str ());
136+ throw std::runtime_error (" Failed to load processor module" );
137+ }
138+ } catch (const std::exception& e) {
139+ ET_LOG (Error, " Exception while loading processor module: %s" , e.what ());
140+ throw std::runtime_error (" Exception while loading processor module" );
141+ }
142+
143+ // Get n_bins by running "feature_size" method
144+ auto feature_size_result = processor_module->execute (" feature_size" );
145+ if (!feature_size_result.ok ()) {
146+ ET_LOG (
147+ Error, " Failed to execute 'feature_size' method on processor module" );
148+ throw std::runtime_error (
149+ " Failed to execute 'feature_size' method on processor module" );
150+ }
151+ auto feature_size_outputs = feature_size_result.get ();
152+ if (feature_size_outputs.empty ()) {
153+ ET_LOG (Error, " 'feature_size' method returned no outputs" );
154+ throw std::runtime_error (" 'feature_size' method returned no outputs" );
155+ }
156+ n_bins = static_cast <int32_t >(feature_size_outputs[0 ].toInt ());
157+
158+ // Get n_frames by running "nb_max_frames" method
159+ auto nb_max_frames_result = processor_module->execute (" nb_max_frames" );
160+ if (!nb_max_frames_result.ok ()) {
161+ ET_LOG (
162+ Error,
163+ " Failed to execute 'nb_max_frames' method on processor module" );
164+ throw std::runtime_error (
165+ " Failed to execute 'nb_max_frames' method on processor module" );
166+ }
167+ auto nb_max_frames_outputs = nb_max_frames_result.get ();
168+ if (nb_max_frames_outputs.empty ()) {
169+ ET_LOG (Error, " 'nb_max_frames' method returned no outputs" );
170+ throw std::runtime_error (" 'nb_max_frames' method returned no outputs" );
171+ }
172+ n_frames = static_cast <int32_t >(nb_max_frames_outputs[0 ].toInt ());
173+
174+ ET_LOG (
175+ Info,
176+ " Got values from processor methods: n_bins=%d, n_frames=%d" ,
177+ n_bins,
178+ n_frames);
179+ } else {
180+ ET_LOG (
181+ Error,
182+ " Processor path is required to get feature_size and nb_max_frames" );
183+ throw std::runtime_error (
184+ " Processor path is required to get feature_size and nb_max_frames" );
185+ }
186+
187+ std::size_t n_floats = audio_data.size ();
82188 int32_t batch_size = ceil (
83189 n_floats /
84190 (n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
85- std::vector<float > audio_data (batch_size * n_bins * n_frames);
86- f.read (
87- reinterpret_cast <char *>(audio_data.data ()),
88- audio_data.size () * sizeof (float ));
89191
90192 ET_LOG (Info, " audio_data len = %d" , audio_data.size ());
91193
@@ -100,29 +202,138 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
100202}
101203
102204/* *
103- * @brief Processes audio files for multimodal input
205+ * @brief Loads a .bin file into a tensor and processes it using a .pte
206+ * processor
104207 *
105- * Dispatches audio file processing based on file extension:
106- * - .bin files: Loads preprocessed mel spectrogram features directly
107- * - .wav/.mp3 files: Currently unsupported, throws runtime_error
208+ * This function loads raw audio data from a .bin file (similar to
209+ * loadPreprocessedAudio), creates a tensor from it, and then passes it through
210+ * a processor module loaded from a .pte file to generate processed audio
211+ * features.
212+ *
213+ * @param audio_path Path to the .bin audio file
214+ * @param processor_path Path to the .pte processor file
215+ * @return MultimodalInput containing the processed audio data
216+ * @throws std::runtime_error if file loading or processing fails
217+ */
218+ MultimodalInput processRawAudioFile (
219+ const std::string& audio_path,
220+ const std::string& processor_path) {
221+ if (processor_path.empty ()) {
222+ ET_LOG (Error, " Processor path is required for raw audio processing" );
223+ throw std::runtime_error (
224+ " Processor path is required for raw audio processing" );
225+ }
226+
227+ // Load the audio processor .pte.
228+ std::unique_ptr<Module> processor_module;
229+ try {
230+ processor_module =
231+ std::make_unique<Module>(processor_path, Module::LoadMode::File);
232+ auto load_error = processor_module->load ();
233+ if (load_error != ::executorch::runtime::Error::Ok) {
234+ ET_LOG (
235+ Error,
236+ " Failed to load processor module from: %s" ,
237+ processor_path.c_str ());
238+ throw std::runtime_error (" Failed to load processor module" );
239+ }
240+ } catch (const std::exception& e) {
241+ ET_LOG (Error, " Exception while loading processor module: %s" , e.what ());
242+ throw std::runtime_error (" Exception while loading processor module" );
243+ }
244+
245+ // Load the audio data from file.
246+ std::vector<float > audio_data = loadBinaryFloatData (audio_path);
247+
248+ // Execute the processor
249+ std::vector<executorch::aten::SizesType> tensor_shape = {
250+ static_cast <executorch::aten::SizesType>(audio_data.size ())};
251+ auto input_tensor = from_blob (
252+ audio_data.data (), tensor_shape, ::executorch::aten::ScalarType::Float);
253+
254+ ET_LOG (Info, " Processing audio through processor module..." );
255+ auto result = processor_module->execute (" forward" , input_tensor);
256+ if (!result.ok ()) {
257+ ET_LOG (Error, " Failed to execute processor's forward method" );
258+ throw std::runtime_error (" Failed to execute processor forward method" );
259+ }
260+
261+ auto outputs = result.get ();
262+ if (outputs.empty ()) {
263+ ET_LOG (Error, " Processor returned no outputs" );
264+ throw std::runtime_error (" Processor returned no outputs" );
265+ }
266+
267+ // Extract processed audio features
268+ const auto & processed_tensor = outputs[0 ].toTensor ();
269+ const float * processed_data = processed_tensor.const_data_ptr <float >();
270+ const auto & sizes = processed_tensor.sizes ();
271+
272+ ET_LOG (
273+ Info,
274+ " Processed audio tensor shape: [%d, %d, %d]" ,
275+ static_cast <int >(sizes[0 ]),
276+ static_cast <int >(sizes[1 ]),
277+ static_cast <int >(sizes[2 ]));
278+
279+ // Create Audio multimodal input from processed features
280+ auto processed_audio =
281+ std::make_unique<::executorch::extension::llm::Audio>();
282+ processed_audio->batch_size = static_cast <int32_t >(
283+ sizes[0 ]); // Note: batching for s > 30 doesn't work yet.
284+ processed_audio->n_bins = static_cast <int32_t >(sizes[1 ]);
285+ processed_audio->n_frames = static_cast <int32_t >(sizes[2 ]);
286+
287+ size_t total_elements = processed_audio->batch_size *
288+ processed_audio->n_bins * processed_audio->n_frames ;
289+ processed_audio->data .resize (total_elements * sizeof (float ));
290+ std::memcpy (
291+ processed_audio->data .data (),
292+ processed_data,
293+ total_elements * sizeof (float ));
294+
295+ ET_LOG (
296+ Info,
297+ " Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d" ,
298+ processed_audio->batch_size ,
299+ processed_audio->n_bins ,
300+ processed_audio->n_frames );
301+
302+ return ::executorch::extension::llm::make_audio_input (
303+ std::move (*processed_audio));
304+ }
305+
306+ /* *
307+ * @brief Processes audio files for multimodal input
108308 *
109- * This function provides a interface for different audio input formats
110- * and can be extended to support raw audio processing in the future.
309+ * Dispatches audio file processing based on file extension and processor
310+ * availability:
311+ * - .bin files with processor: Loads raw audio from .bin and processes through
312+ * processor
313+ * - .bin files without processor: Loads preprocessed mel spectrogram features
314+ * directly
111315 *
112- * @param audio_path Path to the audio file
316+ * @param audio_path Path to the audio file (.bin)
317+ * @param processor_path Path to the processor .pte file (optional)
113318 * @return MultimodalInput containing the processed audio data
114319 * @throws std::runtime_error if file format is unsupported or processing fails
115320 */
116- MultimodalInput processAudioFile (const std::string& audio_path) {
321+ MultimodalInput processAudioFile (
322+ const std::string& audio_path,
323+ const std::string& processor_path = " " ) {
117324 if (ends_with (audio_path, " .bin" )) {
118- // Current behavior - load preprocessed audio stored as a binary file.
119- return loadPreprocessedAudio (audio_path);
120- } else if (ends_with (audio_path, " .wav" ) || ends_with (audio_path, " .mp3" )) {
121- // New: Process raw audio files - unsupported for now
122- ET_LOG (Error, " Raw audio file processing (.wav/.mp3) is not yet supported" );
123- throw std::runtime_error (" Raw audio file processing not supported" );
325+ if (!processor_path.empty ()) {
326+ // Process raw audio from .bin file through the processor
327+ return processRawAudioFile (audio_path, processor_path);
328+ } else {
329+ // Load preprocessed audio stored as a binary file (existing behavior)
330+ return loadPreprocessedAudio (audio_path, processor_path);
331+ }
124332 } else {
125- ET_LOG (Error, " Unsupported audio file format: %s" , audio_path.c_str ());
333+ ET_LOG (
334+ Error,
335+ " Unsupported audio file format: %s (only .bin files are supported)" ,
336+ audio_path.c_str ());
126337 throw std::runtime_error (" Unsupported audio file format" );
127338 }
128339}
@@ -137,6 +348,7 @@ int32_t main(int32_t argc, char** argv) {
137348 const char * tokenizer_path = FLAGS_tokenizer_path.c_str ();
138349 const char * prompt = FLAGS_prompt.c_str ();
139350 const char * audio_path = FLAGS_audio_path.c_str ();
351+ const char * processor_path = FLAGS_processor_path.c_str ();
140352 float temperature = FLAGS_temperature;
141353 int32_t cpu_threads = FLAGS_cpu_threads;
142354 bool warmup = FLAGS_warmup;
@@ -184,7 +396,7 @@ int32_t main(int32_t argc, char** argv) {
184396 inputs.emplace_back (make_text_input (" <s>[INST][BEGIN_AUDIO]" ));
185397
186398 // 2. Add audio input
187- inputs.emplace_back (processAudioFile (audio_path));
399+ inputs.emplace_back (processAudioFile (audio_path, processor_path ));
188400
189401 // 3. Add text input (the actual user-submitted prompt)
190402 inputs.emplace_back (make_text_input (std::string (prompt) + " [/INST]" ));
0 commit comments