Skip to content

Commit f9c1771

Browse files
committed
Update on "Add Voxtral runner"
### Summary Utilize `multimodal_runner.h` to run [Voxtral exported from Optimum Executorch](huggingface/optimum-executorch#126). The runner takes in a `.pt` file of a preprocessed audio recording and feeds it to a C++ multimodal runner. Example output: ``` This audio is a casual and somewhat silly conversation between two speakers who seem to be discussing their tattoos. The speakers are engaging in a game where they ask each other what their tattoos say, but both repeatedly say "sweet" instead of the actual words. The speakers are aware of their mistake and try to correct it by asking the other what their tattoo says, but they still end up saying "sweet" again. The conversation ends with a speaker telling the other that their tattoo says " PyTorchObserver {"prompt_tokens":1138,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756159197436,"inference_end_ms":1756159222710,"prompt_eval_end_ms":1756159209605,"first_token_ms":1756159209605,"aggregate_sampling_time_ms":96,"SCALING_FACTOR_UNITS_PER_SECOND":1000} I 00:00:33.116291 executorch:stats.h:104] Prompt Tokens: 1138 Generated Tokens: 99 I 00:00:33.116304 executorch:stats.h:110] Model Load Time: 0.000000 (seconds) I 00:00:33.116312 executorch:stats.h:117] Total inference time: 25.274000 (seconds) Rate: 3.917069 (tokens/second) I 00:00:33.116320 executorch:stats.h:127] Prompt evaluation: 12.169000 (seconds) Rate: 93.516312 (tokens/second) I 00:00:33.116327 executorch:stats.h:136] Generated 99 tokens: 13.105000 (seconds) Rate: 7.554369 (tokens/second) I 00:00:33.116338 executorch:stats.h:147] Time to first generated token: 12.169000 (seconds) I 00:00:33.116344 executorch:stats.h:153] Sampling time over 1237 tokens: 0.096000 (seconds) ``` ### Test plan Build and run: ``` # Build and install ExecuTorch cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -DEXECUTORCH_ENABLE_LOGGING=ON && cmake --build cmake-out -j16 --target install --config Release # 
Build and install Voxtral runner cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -Bcmake-out/examples/models/voxtral examples/models/voxtral && cmake --build cmake-out/examples/models/voxtral -j16 --config Release # Run Voxtral runner ./cmake-out/examples/models/voxtral/voxtral_runner --model_path ~/models/voxtral/voxtral_q8da4w_edm_qe4w_d_split_metadata_unsqueeze.pte --tokenizer_path ~/hf/models--mistralai--Voxtral-Mini-3B-2507/snapshots/3060fe34b35ba5d44202ce9ff3c097642914f8f3/tekken.json --prompt "What can you tell me about this audio?" --audio_path ~/models/voxtral/input_features.bin ``` [ghstack-poisoned]
2 parents 720b2c0 + fa1d327 commit f9c1771

File tree

5 files changed

+14
-14
lines changed

5 files changed

+14
-14
lines changed

examples/models/llava/export_llava.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,11 @@ def export_all(llava_model: LlavaModel):
226226
{
227227
"image_encoder": image_encoder_ep,
228228
"token_embedding": token_embedding_ep,
229-
"text_model": text_model_ep,
229+
"text_decoder": text_model_ep,
230230
},
231231
partitioner={
232232
"image_encoder": [XnnpackPartitioner()],
233-
"text_model": [
233+
"text_decoder": [
234234
# First partition the DQLinear nodes, then partition the rest of the nodes,
235235
# to avoid multiple DQLinear nodes in the same partition,
236236
# to avoid holding multiple unpacked and packed weight buffers in memory,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
254254
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
255255
sym_shape_eval_pass={
256256
"image_encoder": ConstraintBasedSymShapeEvalPass(),
257-
"text_model": ConstraintBasedSymShapeEvalPass(),
257+
"text_decoder": ConstraintBasedSymShapeEvalPass(),
258258
"token_embedding": HintBasedSymShapeEvalPass(),
259259
},
260260
)

examples/models/llava/runner/llava_text_decoder_runner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner
8989
}
9090

9191
inline static const std::string kTokenEmbeddingMethod = "token_embedding";
92-
inline static const std::string kTextModelMethod = "text_model";
92+
inline static const std::string kTextModelMethod = "text_decoder";
9393
};
9494

9595
} // namespace example

examples/models/llava/test/test_llava.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_llava_export(self):
9696
"token_embedding", (prompt_before_image,)
9797
)[0]
9898
llava_module.run_method(
99-
"text_model",
99+
"text_decoder",
100100
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
101101
)
102102

@@ -107,7 +107,7 @@ def test_llava_export(self):
107107
# pte prefill image
108108
pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
109109
llava_module.run_method(
110-
"text_model",
110+
"text_decoder",
111111
(
112112
torch.tensor([start_pos], dtype=torch.int64),
113113
pte_embeds_img,
@@ -122,7 +122,7 @@ def test_llava_export(self):
122122
"token_embedding", (prompt_after_image,)
123123
)[0]
124124
pte_prefill_after_img = llava_module.run_method(
125-
"text_model",
125+
"text_decoder",
126126
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
127127
)[0]
128128

@@ -139,7 +139,7 @@ def test_llava_export(self):
139139
"token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
140140
)[0]
141141
logits = llava_module.run_method(
142-
"text_model",
142+
"text_decoder",
143143
(torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
144144
)[0]
145145
new_tokens.append(torch.argmax(logits).item())

examples/models/llava/test/test_pte.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def main():
4747
"token_embedding", (prompt_before_image,)
4848
)[0]
4949
pte_prefill_before_img = llava_module.run_method(
50-
"text_model",
50+
"text_decoder",
5151
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
5252
)[0]
5353
print(pte_prefill_before_img)
@@ -60,7 +60,7 @@ def main():
6060
logging.warning("Image encoder finished")
6161
logging.warning("Image token prefill started")
6262
pte_prefill_img = llava_module.run_method(
63-
"text_model",
63+
"text_decoder",
6464
(
6565
torch.tensor([start_pos], dtype=torch.int64),
6666
pte_embeds_img,
@@ -77,7 +77,7 @@ def main():
7777
"token_embedding", (prompt_after_image,)
7878
)[0]
7979
pte_prefill_after_img = llava_module.run_method(
80-
"text_model",
80+
"text_decoder",
8181
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
8282
)[0]
8383
logging.warning("Text token prefill finished")
@@ -91,7 +91,7 @@ def main():
9191
"token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
9292
)[0]
9393
logits = llava_module.run_method(
94-
"text_model",
94+
"text_decoder",
9595
(torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
9696
)[0]
9797
new_tokens.append(torch.argmax(logits[..., -1, :]).item())

extension/llm/runner/constants.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
2222
// Multimodal method name conventions
2323
inline constexpr auto kImageEncoderMethod = "image_encoder";
2424
inline constexpr auto kAudioEncoderMethod = "audio_encoder";
25-
inline constexpr auto kTokenEmbeddingMethod = "token_embeddings";
26-
inline constexpr auto kTextModelMethod = "decoder";
25+
inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
26+
inline constexpr auto kTextModelMethod = "text_decoder";
2727

2828
} // namespace executorch::extension::llm

0 commit comments

Comments
 (0)