Skip to content

Commit f9c1771

Browse files
committed
Update on "Add Voxtral runner"
### Summary Utilize `multimodal_runner.h` to run [Voxtral exported from Optimum Executorch](huggingface/optimum-executorch#126). The runner takes in a `.pt` file of a preprocessed audio recording and feeds it to a C++ multimodal runner. Example output: ``` This audio is a casual and somewhat silly conversation between two speakers who seem to be discussing their tattoos. The speakers are engaging in a game where they ask each other what their tattoos say, but both repeatedly say "sweet" instead of the actual words. The speakers are aware of their mistake and try to correct it by asking the other what their tattoo says, but they still end up saying "sweet" again. The conversation ends with a speaker telling the other that their tattoo says " PyTorchObserver {"prompt_tokens":1138,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756159197436,"inference_end_ms":1756159222710,"prompt_eval_end_ms":1756159209605,"first_token_ms":1756159209605,"aggregate_sampling_time_ms":96,"SCALING_FACTOR_UNITS_PER_SECOND":1000} I 00:00:33.116291 executorch:stats.h:104] Prompt Tokens: 1138 Generated Tokens: 99 I 00:00:33.116304 executorch:stats.h:110] Model Load Time: 0.000000 (seconds) I 00:00:33.116312 executorch:stats.h:117] Total inference time: 25.274000 (seconds) Rate: 3.917069 (tokens/second) I 00:00:33.116320 executorch:stats.h:127] Prompt evaluation: 12.169000 (seconds) Rate: 93.516312 (tokens/second) I 00:00:33.116327 executorch:stats.h:136] Generated 99 tokens: 13.105000 (seconds) Rate: 7.554369 (tokens/second) I 00:00:33.116338 executorch:stats.h:147] Time to first generated token: 12.169000 (seconds) I 00:00:33.116344 executorch:stats.h:153] Sampling time over 1237 tokens: 0.096000 (seconds) ``` ### Test plan Build and run: ``` # Build and install ExecuTorch cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -DEXECUTORCH_ENABLE_LOGGING=ON && cmake --build cmake-out -j16 --target install --config Release # 
Build and install Voxtral runner cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -Bcmake-out/examples/models/voxtral examples/models/voxtral && cmake --build cmake-out/examples/models/voxtral -j16 --config Release # Run Voxtral runner ./cmake-out/examples/models/voxtral/voxtral_runner --model_path ~/models/voxtral/voxtral_q8da4w_edm_qe4w_d_split_metadata_unsqueeze.pte --tokenizer_path ~/hf/models--mistralai--Voxtral-Mini-3B-2507/snapshots/3060fe34b35ba5d44202ce9ff3c097642914f8f3/tekken.json --prompt "What can you tell me about this audio?" --audio_path ~/models/voxtral/input_features.bin ``` [ghstack-poisoned]
2 parents 720b2c0 + fa1d327 commit f9c1771

File tree

5 files changed

+14
-14
lines changed

5 files changed

+14
-14
lines changed

examples/models/llava/export_llava.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,11 @@ def export_all(llava_model: LlavaModel):
226226
{
227227
"image_encoder": image_encoder_ep,
228228
"token_embedding": token_embedding_ep,
229-
"text_model": text_model_ep,
229+
"text_decoder": text_model_ep,
230230
},
231231
partitioner={
232232
"image_encoder": [XnnpackPartitioner()],
233-
"text_model": [
233+
"text_decoder": [
234234
# First partition the DQLinear nodes, then partition the rest of the nodes,
235235
# to avoid multiple DQLinear nodes in the same partition,
236236
# to avoid holding multiple unpacked and packed weight buffers in memory,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
254254
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
255255
sym_shape_eval_pass={
256256
"image_encoder": ConstraintBasedSymShapeEvalPass(),
257-
"text_model": ConstraintBasedSymShapeEvalPass(),
257+
"text_decoder": ConstraintBasedSymShapeEvalPass(),
258258
"token_embedding": HintBasedSymShapeEvalPass(),
259259
},
260260
)

examples/models/llava/runner/llava_text_decoder_runner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner
8989
}
9090

9191
inline static const std::string kTokenEmbeddingMethod = "token_embedding";
92-
inline static const std::string kTextModelMethod = "text_model";
92+
inline static const std::string kTextModelMethod = "text_decoder";
9393
};
9494

9595
} // namespace example

examples/models/llava/test/test_llava.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_llava_export(self):
9696
"token_embedding", (prompt_before_image,)
9797
)[0]
9898
llava_module.run_method(
99-
"text_model",
99+
"text_decoder",
100100
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
101101
)
102102

@@ -107,7 +107,7 @@ def test_llava_export(self):
107107
# pte prefill image
108108
pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
109109
llava_module.run_method(
110-
"text_model",
110+
"text_decoder",
111111
(
112112
torch.tensor([start_pos], dtype=torch.int64),
113113
pte_embeds_img,
@@ -122,7 +122,7 @@ def test_llava_export(self):
122122
"token_embedding", (prompt_after_image,)
123123
)[0]
124124
pte_prefill_after_img = llava_module.run_method(
125-
"text_model",
125+
"text_decoder",
126126
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
127127
)[0]
128128

@@ -139,7 +139,7 @@ def test_llava_export(self):
139139
"token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
140140
)[0]
141141
logits = llava_module.run_method(
142-
"text_model",
142+
"text_decoder",
143143
(torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
144144
)[0]
145145
new_tokens.append(torch.argmax(logits).item())

examples/models/llava/test/test_pte.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def main():
4747
"token_embedding", (prompt_before_image,)
4848
)[0]
4949
pte_prefill_before_img = llava_module.run_method(
50-
"text_model",
50+
"text_decoder",
5151
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
5252
)[0]
5353
print(pte_prefill_before_img)
@@ -60,7 +60,7 @@ def main():
6060
logging.warning("Image encoder finished")
6161
logging.warning("Image token prefill started")
6262
pte_prefill_img = llava_module.run_method(
63-
"text_model",
63+
"text_decoder",
6464
(
6565
torch.tensor([start_pos], dtype=torch.int64),
6666
pte_embeds_img,
@@ -77,7 +77,7 @@ def main():
7777
"token_embedding", (prompt_after_image,)
7878
)[0]
7979
pte_prefill_after_img = llava_module.run_method(
80-
"text_model",
80+
"text_decoder",
8181
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
8282
)[0]
8383
logging.warning("Text token prefill finished")
@@ -91,7 +91,7 @@ def main():
9191
"token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
9292
)[0]
9393
logits = llava_module.run_method(
94-
"text_model",
94+
"text_decoder",
9595
(torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
9696
)[0]
9797
new_tokens.append(torch.argmax(logits[..., -1, :]).item())

extension/llm/runner/constants.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
2222
// Multimodal method name conventions
2323
inline constexpr auto kImageEncoderMethod = "image_encoder";
2424
inline constexpr auto kAudioEncoderMethod = "audio_encoder";
25-
inline constexpr auto kTokenEmbeddingMethod = "token_embeddings";
26-
inline constexpr auto kTextModelMethod = "decoder";
25+
inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
26+
inline constexpr auto kTextModelMethod = "text_decoder";
2727

2828
} // namespace executorch::extension::llm

0 commit comments

Comments
 (0)