Skip to content

Commit ca19f68

Browse files
committed
Update on "Include audio preprocessing for raw audio tensor"
## Summary Runs audio preprocessing (mel spectrogram conversion) on raw audio tensor, using an exported `.pte` from https://github.com/pytorch/executorch/blob/main/extension/audio/mel_spectrogram.py Current limitations: no batching support in the mel spectrogram, so it can only support audio of <30 seconds. ``` The speaker in this audio seems to be talking about their concerns about a device called the model or maybe they're just talking about the model in general. They mention that the model was trained with the speaker for inference, which suggests that the model was trained based on the speaker's data or instructions. They also mention that the volume is quite small, which could imply that the speaker is trying to control the volume of the model's output, likely because they are concerned about how loud the model's responses might PyTorchObserver {"prompt_tokens":388,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756351346381,"inference_end_ms":1756351362602,"prompt_eval_end_ms":1756351351435,"first_token_ms":1756351351435,"aggregate_sampling_time_ms":99,"SCALING_FACTOR_UNITS_PER_SECOND":1000} I 00:00:24.036773 executorch:stats.h:104] Prompt Tokens: 388 Generated Tokens: 99 I 00:00:24.036800 executorch:stats.h:110] Model Load Time: 0.000000 (seconds) I 00:00:24.036805 executorch:stats.h:117] Total inference time: 16.221000 (seconds) Rate: 6.103200 (tokens/second) I 00:00:24.036815 executorch:stats.h:127] Prompt evaluation: 5.054000 (seconds) Rate: 76.770875 (tokens/second) I 00:00:24.036819 executorch:stats.h:136] Generated 99 tokens: 11.167000 (seconds) Rate: 8.865407 (tokens/second) I 00:00:24.036822 executorch:stats.h:147] Time to first generated token: 5.054000 (seconds) I 00:00:24.036828 executorch:stats.h:153] Sampling time over 487 tokens: 0.099000 (seconds) ``` [ghstack-poisoned]
2 parents 92e0c20 + a8e5a2f commit ca19f68

File tree

5 files changed

+14
-14
lines changed

5 files changed

+14
-14
lines changed

examples/models/llava/export_llava.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,11 @@ def export_all(llava_model: LlavaModel):
226226
{
227227
"image_encoder": image_encoder_ep,
228228
"token_embedding": token_embedding_ep,
229-
"text_model": text_model_ep,
229+
"text_decoder": text_model_ep,
230230
},
231231
partitioner={
232232
"image_encoder": [XnnpackPartitioner()],
233-
"text_model": [
233+
"text_decoder": [
234234
# First partition the DQLinear nodes, then partition the rest of the nodes,
235235
# to avoid multiple DQLinear nodes in the same partition,
236236
# to avoid holding multiple unpacked and packed weight buffers in memory,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
254254
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
255255
sym_shape_eval_pass={
256256
"image_encoder": ConstraintBasedSymShapeEvalPass(),
257-
"text_model": ConstraintBasedSymShapeEvalPass(),
257+
"text_decoder": ConstraintBasedSymShapeEvalPass(),
258258
"token_embedding": HintBasedSymShapeEvalPass(),
259259
},
260260
)

examples/models/llava/runner/llava_text_decoder_runner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner
8989
}
9090

9191
inline static const std::string kTokenEmbeddingMethod = "token_embedding";
92-
inline static const std::string kTextModelMethod = "text_model";
92+
inline static const std::string kTextModelMethod = "text_decoder";
9393
};
9494

9595
} // namespace example

examples/models/llava/test/test_llava.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_llava_export(self):
9696
"token_embedding", (prompt_before_image,)
9797
)[0]
9898
llava_module.run_method(
99-
"text_model",
99+
"text_decoder",
100100
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
101101
)
102102

@@ -107,7 +107,7 @@ def test_llava_export(self):
107107
# pte prefill image
108108
pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
109109
llava_module.run_method(
110-
"text_model",
110+
"text_decoder",
111111
(
112112
torch.tensor([start_pos], dtype=torch.int64),
113113
pte_embeds_img,
@@ -122,7 +122,7 @@ def test_llava_export(self):
122122
"token_embedding", (prompt_after_image,)
123123
)[0]
124124
pte_prefill_after_img = llava_module.run_method(
125-
"text_model",
125+
"text_decoder",
126126
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
127127
)[0]
128128

@@ -139,7 +139,7 @@ def test_llava_export(self):
139139
"token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
140140
)[0]
141141
logits = llava_module.run_method(
142-
"text_model",
142+
"text_decoder",
143143
(torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
144144
)[0]
145145
new_tokens.append(torch.argmax(logits).item())

examples/models/llava/test/test_pte.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def main():
4747
"token_embedding", (prompt_before_image,)
4848
)[0]
4949
pte_prefill_before_img = llava_module.run_method(
50-
"text_model",
50+
"text_decoder",
5151
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img),
5252
)[0]
5353
print(pte_prefill_before_img)
@@ -60,7 +60,7 @@ def main():
6060
logging.warning("Image encoder finished")
6161
logging.warning("Image token prefill started")
6262
pte_prefill_img = llava_module.run_method(
63-
"text_model",
63+
"text_decoder",
6464
(
6565
torch.tensor([start_pos], dtype=torch.int64),
6666
pte_embeds_img,
@@ -77,7 +77,7 @@ def main():
7777
"token_embedding", (prompt_after_image,)
7878
)[0]
7979
pte_prefill_after_img = llava_module.run_method(
80-
"text_model",
80+
"text_decoder",
8181
(torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img),
8282
)[0]
8383
logging.warning("Text token prefill finished")
@@ -91,7 +91,7 @@ def main():
9191
"token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),)
9292
)[0]
9393
logits = llava_module.run_method(
94-
"text_model",
94+
"text_decoder",
9595
(torch.tensor([start_pos + i], dtype=torch.int64), token_embeds),
9696
)[0]
9797
new_tokens.append(torch.argmax(logits[..., -1, :]).item())

extension/llm/runner/constants.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
2222
// Multimodal method name conventions
2323
inline constexpr auto kImageEncoderMethod = "image_encoder";
2424
inline constexpr auto kAudioEncoderMethod = "audio_encoder";
25-
inline constexpr auto kTokenEmbeddingMethod = "token_embeddings";
26-
inline constexpr auto kTextModelMethod = "decoder";
25+
inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
26+
inline constexpr auto kTextModelMethod = "text_decoder";
2727

2828
} // namespace executorch::extension::llm

0 commit comments

Comments
 (0)