@@ -188,7 +188,9 @@ def _get_prompt_replacements(
188
188
hf_processor_mm_kwargs : Mapping [str , object ],
189
189
out_mm_kwargs : MultiModalKwargs ,
190
190
) -> list [PromptReplacement ]:
191
- processor = self .info .get_hf_processor ()
191
+ processor = self .info .get_hf_processor (** hf_processor_mm_kwargs )
192
+ tokenizer = self .info .get_tokenizer ()
193
+ vocab = tokenizer .get_vocab ()
192
194
193
195
# Use getattr with default to be compatible with transformers<4.48
194
196
audio_token = getattr (processor , "audio_token" , "<|AUDIO|>" )
@@ -197,6 +199,10 @@ def _get_prompt_replacements(
197
199
audio_eos_token = getattr (processor , "audio_eos_token" ,
198
200
"<|audio_eos|>" )
199
201
202
+ audio_token_id = vocab [audio_token ]
203
+ audio_bos_id = vocab [audio_bos_token ]
204
+ audio_eos_id = vocab [audio_eos_token ]
205
+
200
206
feature_attention_mask = out_mm_kwargs .get ("feature_attention_mask" )
201
207
if feature_attention_mask is None :
202
208
audio_output_lengths = []
@@ -208,22 +214,18 @@ def _get_prompt_replacements(
208
214
audio_output_lengths = audio_output_lens .tolist ()
209
215
210
216
def get_replacement_qwen2_audio(item_idx: int):
    """Build the placeholder replacement for a single audio item.

    This is a closure: it reads ``audio_output_lengths``, ``mm_items``,
    ``audio_token_id``, ``audio_bos_id`` and ``audio_eos_id`` from the
    enclosing scope.

    Args:
        item_idx: Index used to look up the item's entry in
            ``audio_output_lengths`` and, on the error path, the audio
            item itself.

    Returns:
        A ``PromptReplacementDetails`` whose ``full`` sequence is the
        BOS token id, ``num_features`` copies of the audio token id, and
        the EOS token id; ``features`` holds only the repeated audio
        token ids.

    Raises:
        ValueError: If the item's entry in ``audio_output_lengths`` is 0.
    """
    num_features = audio_output_lengths[item_idx]
    if num_features == 0:
        # Only fetch the item on the failure path, so the error message
        # can report which audio could not be represented.
        audios = mm_items.get_items("audio", AudioProcessorItems)
        audio = audios.get(item_idx)
        raise ValueError(
            f"The audio {audio} (len={len(audio)}) is too short "
            "to be represented inside the model")

    # Token ids (not token strings) are used so the replacement does not
    # need to be re-tokenized.
    audio_tokens = [audio_token_id] * num_features

    return PromptReplacementDetails(
        full=[audio_bos_id] + audio_tokens + [audio_eos_id],
        features=audio_tokens,
    )
229
231
0 commit comments