Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions daras_ai_v2/asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ class AsrModels(Enum):
ghana_nlp_asr_v2 = "Ghana NLP ASR v2"
lelapa = "Vulavula (Lelapa AI)"
whisper_sunbird_large_v3 = "Sunbird Ugandan Whisper v3 (Sunbird AI)"
whisper_akera_large_v3 = "Akera Whisper v3 (akera)"
whisper_swahili_medium_v3 = "Jacaranda Health Swahili Whisper v3 (Jacaranda Health)"
mbaza_ctc_large = "Mbaza Conformer LG (MbazaNLP)"

Expand Down Expand Up @@ -336,6 +337,7 @@ def supports_input_prompt(self) -> bool:


asr_model_ids = {
AsrModels.whisper_akera_large_v3: "akera/whisper-large-v3-kik-full_v2",
AsrModels.gpt_4_o_audio: "gpt-4o-transcribe",
AsrModels.gpt_4_o_mini_audio: "gpt-4o-mini-transcribe",
AsrModels.whisper_large_v3: "vaibhavs10/incredibly-fast-whisper:3ab86df6c8f54c11309d4d1f930ac292bad43ace52d10c80d87eb258b3c9f79c",
Expand All @@ -362,6 +364,7 @@ def supports_input_prompt(self) -> bool:
AsrModels.vakyansh_bhojpuri: "bho",
AsrModels.nemo_english: "en",
AsrModels.nemo_hindi: "hi",
AsrModels.whisper_akera_large_v3: "kik",
}

asr_supported_languages = {
Expand All @@ -386,6 +389,7 @@ def supports_input_prompt(self) -> bool:
AsrModels.lelapa: LELAPA_ASR_SUPPORTED,
AsrModels.whisper_sunbird_large_v3: SUNBIRD_SUPPORTED_LANGUAGES,
AsrModels.whisper_swahili_medium_v3: {"sw", "en"},
AsrModels.whisper_akera_large_v3: {"kik"},
AsrModels.mbaza_ctc_large: {"sw", "rw", "lg"},
}

Expand Down Expand Up @@ -1286,13 +1290,17 @@ def run_asr(
)
# call one of the self-hosted models
else:
-kwargs = {}
+kwargs = {"task": "translate" if speech_translation_target else "transcribe"}
if "vakyansh" in selected_model.name:
# fixes https://github.com/huggingface/transformers/issues/15275#issuecomment-1624879632
kwargs["decoder_kwargs"] = dict(skip_special_tokens=True)
kwargs["chunk_length_s"] = 60
kwargs["stride_length_s"] = (6, 0)
kwargs["batch_size"] = 32
elif "akera" in selected_model.name:
# don't pass language or task
kwargs.pop("task")
kwargs["max_length"] = 448
elif "whisper" in selected_model.name:
Copy link

@coderabbitai coderabbitai bot Sep 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Akera branch drops task: translation requests will be silently ignored.

Because you pop("task"), any non-empty speech_translation_target is effectively ignored for Akera. If that’s intentional (Akera doesn’t support ST), either:

  • raise a clear error when ST is requested, or
  • do a client-side post-translation fallback.

Also, prefer safe pop and explicit enum match to avoid brittle substring checks.

Minimal guard + safer pop:

         kwargs = {"task": "translate" if speech_translation_target else "transcribe"}
         if "vakyansh" in selected_model.name:
             # fixes https://github.com/huggingface/transformers/issues/15275#issuecomment-1624879632
             kwargs["decoder_kwargs"] = dict(skip_special_tokens=True)
             kwargs["chunk_length_s"] = 60
             kwargs["stride_length_s"] = (6, 0)
             kwargs["batch_size"] = 32
-        elif "akera" in selected_model.name:
-            # don't pass language or task
-            kwargs.pop("task")
-            kwargs["max_length"] = 448
+        elif selected_model == AsrModels.whisper_akera_large_v3:
+            # Akera: don't pass language or task to worker
+            kwargs.pop("task", None)
+            if speech_translation_target:
+                raise UserError("Speech translation is not supported by Akera Whisper v3.")
+            kwargs["max_length"] = 448

If you prefer a fallback rather than an error, I can wire run_translate(...) post-ASR and document that SRT/VTT won’t be available for the translated text without time-aligned chunks.

I can add a small unit test that asserts: (a) no task/language is sent for Akera, (b) translation requests raise UserError. Want me to open a test PR?


🏁 Script executed:

#!/bin/bash
# Search for AsrModels enum class and its members, and any literal usage of "akera" models
rg -n "class\s\+AsrModels" -C3 --type py || true
rg -n "AsrModels\." -C1 --type py || true
rg -n "whisper_akera" -C3 --type py || true

Length of output: 10493


🏁 Script executed:

#!/bin/bash
# Extract context around the snippet for Akera branch in daras_ai_v2/asr.py
sed -n '1280,1320p' daras_ai_v2/asr.py
# Confirm the exact model check logic for "akera"
rg -nP --type py 'elif .*akera.*' -C2 daras_ai_v2/asr.py

Length of output: 2075


🏁 Script executed:

#!/bin/bash
# Preview the top of the ASR module to locate the AsrModels definition/import
sed -n '1,200p' daras_ai_v2/asr.py

Length of output: 15273


🏁 Script executed:

#!/bin/bash
# Locate call_celery_task in gpu_server.py
rg -n "def call_celery_task" -C5 daras_ai_v2/gpu_server.py

Length of output: 756


Guard Akera model translation support and use explicit enum match
Akera’s branch currently drops task without notifying when speech_translation_target is set, silently ignoring translation requests.

-        elif "akera" in selected_model.name:
-            # don't pass language or task
-            kwargs.pop("task")
-            kwargs["max_length"] = 448
+        elif selected_model == AsrModels.whisper_akera_large_v3:
+            # Akera Whisper v3 doesn’t support speech translation
+            kwargs.pop("task", None)
+            if speech_translation_target:
+                raise UserError("Speech translation is not supported by Akera Whisper v3.")
+            kwargs["max_length"] = 448
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-        kwargs = {"task": "translate" if speech_translation_target else "transcribe"}
-        if "vakyansh" in selected_model.name:
-            # fixes https://github.com/huggingface/transformers/issues/15275#issuecomment-1624879632
-            kwargs["decoder_kwargs"] = dict(skip_special_tokens=True)
-            kwargs["chunk_length_s"] = 60
-            kwargs["stride_length_s"] = (6, 0)
-            kwargs["batch_size"] = 32
-        elif "akera" in selected_model.name:
-            # don't pass language or task
-            kwargs.pop("task")
-            kwargs["max_length"] = 448
-        elif "whisper" in selected_model.name:
+        kwargs = {"task": "translate" if speech_translation_target else "transcribe"}
+        if "vakyansh" in selected_model.name:
+            # fixes https://github.com/huggingface/transformers/issues/15275#issuecomment-1624879632
+            kwargs["decoder_kwargs"] = dict(skip_special_tokens=True)
+            kwargs["chunk_length_s"] = 60
+            kwargs["stride_length_s"] = (6, 0)
+            kwargs["batch_size"] = 32
+        elif selected_model == AsrModels.whisper_akera_large_v3:
+            # Akera Whisper v3 doesn’t support speech translation
+            kwargs.pop("task", None)
+            if speech_translation_target:
+                raise UserError("Speech translation is not supported by Akera Whisper v3.")
+            kwargs["max_length"] = 448
+        elif "whisper" in selected_model.name:

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, i would prefer kwargs.pop("task", None) too

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@devxpy Perfect! Using kwargs.pop("task", None) is definitely the safer approach. It prevents potential KeyError exceptions if the "task" key somehow doesn't exist in the kwargs dictionary, making the code more robust through defensive programming.

This small change improves the reliability of the Akera model handling while maintaining the same functionality. Good call on preferring the defensive approach!

🐰 ✨

forced_lang = forced_asr_languages.get(selected_model)
if forced_lang:
Expand All @@ -1308,7 +1316,6 @@ def run_asr(
),
inputs=dict(
audio=audio_url,
-task="translate" if speech_translation_target else "transcribe",
return_timestamps=output_format != AsrOutputFormat.text,
**kwargs,
),
Expand Down