@@ -283,6 +283,7 @@ class AsrModels(Enum):
283283    ghana_nlp_asr_v2  =  "Ghana NLP ASR v2" 
284284    lelapa  =  "Vulavula (Lelapa AI)" 
285285    whisper_sunbird_large_v3  =  "Sunbird Ugandan Whisper v3 (Sunbird AI)" 
286+     whisper_akera_large_v3  =  "Akera Whisper v3 (akera)" 
286287    whisper_swahili_medium_v3  =  "Jacaranda Health Swahili Whisper v3 (Jacaranda Health)" 
287288    mbaza_ctc_large  =  "Mbaza Conformer LG (MbazaNLP)" 
288289
@@ -336,6 +337,7 @@ def supports_input_prompt(self) -> bool:
336337
337338
338339asr_model_ids  =  {
340+     AsrModels .whisper_akera_large_v3 : "akera/whisper-large-v3-kik-full_v2" ,
339341    AsrModels .gpt_4_o_audio : "gpt-4o-transcribe" ,
340342    AsrModels .gpt_4_o_mini_audio : "gpt-4o-mini-transcribe" ,
341343    AsrModels .whisper_large_v3 : "vaibhavs10/incredibly-fast-whisper:3ab86df6c8f54c11309d4d1f930ac292bad43ace52d10c80d87eb258b3c9f79c" ,
@@ -362,6 +364,7 @@ def supports_input_prompt(self) -> bool:
362364    AsrModels .vakyansh_bhojpuri : "bho" ,
363365    AsrModels .nemo_english : "en" ,
364366    AsrModels .nemo_hindi : "hi" ,
367+     AsrModels .whisper_akera_large_v3 : "kik" ,
365368}
366369
367370asr_supported_languages  =  {
@@ -386,6 +389,7 @@ def supports_input_prompt(self) -> bool:
386389    AsrModels .lelapa : LELAPA_ASR_SUPPORTED ,
387390    AsrModels .whisper_sunbird_large_v3 : SUNBIRD_SUPPORTED_LANGUAGES ,
388391    AsrModels .whisper_swahili_medium_v3 : {"sw" , "en" },
392+     AsrModels .whisper_akera_large_v3 : {"kik" },
389393    AsrModels .mbaza_ctc_large : {"sw" , "rw" , "lg" },
390394}
391395
@@ -1286,13 +1290,17 @@ def run_asr(
12861290        )
12871291    # call one of the self-hosted models 
12881292    else :
1289-         kwargs  =  {}
1293+         kwargs  =  {"task" :  "translate"   if   speech_translation_target   else   "transcribe" }
12901294        if  "vakyansh"  in  selected_model .name :
12911295            # fixes https://github.com/huggingface/transformers/issues/15275#issuecomment-1624879632 
12921296            kwargs ["decoder_kwargs" ] =  dict (skip_special_tokens = True )
12931297            kwargs ["chunk_length_s" ] =  60 
12941298            kwargs ["stride_length_s" ] =  (6 , 0 )
12951299            kwargs ["batch_size" ] =  32 
1300+         elif  "akera"  in  selected_model .name :
1301+             # don't pass language or task 
1302+             kwargs .pop ("task" )
1303+             kwargs ["max_length" ] =  448 
12961304        elif  "whisper"  in  selected_model .name :
12971305            forced_lang  =  forced_asr_languages .get (selected_model )
12981306            if  forced_lang :
@@ -1308,7 +1316,6 @@ def run_asr(
13081316            ),
13091317            inputs = dict (
13101318                audio = audio_url ,
1311-                 task = "translate"  if  speech_translation_target  else  "transcribe" ,
13121319                return_timestamps = output_format  !=  AsrOutputFormat .text ,
13131320                ** kwargs ,
13141321            ),
0 commit comments