@@ -92,13 +92,13 @@ def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
92
92
93
93
audio_token_index = ctx .model_config .hf_config .audio_token_index
94
94
95
- dummy_seqdata = SequenceData .from_token_counts (
95
+ dummy_seqdata = SequenceData .from_prompt_token_counts (
96
96
(audio_token_index , max_llm_audio_tokens ),
97
97
(0 , seq_len - max_llm_audio_tokens ),
98
98
)
99
99
dummy_audio = np .full ((max_llm_audio_tokens * 2 * 2 * 160 , ), 0. )
100
100
return dummy_seqdata , {
101
- "audio" : dummy_audio if num_audios == 1 else [ dummy_audio ] * num_audios
101
+ "audio" : [( dummy_audio , 16000 ) ] * num_audios
102
102
}
103
103
104
104
@@ -165,11 +165,12 @@ def input_processor_for_qwen2_audio(ctx: InputContext,
165
165
multi_modal_data = llm_inputs .get ("multi_modal_data" )
166
166
if multi_modal_data is None or "audio" not in multi_modal_data :
167
167
return llm_inputs
168
+ if len (multi_modal_data ["audio" ]) == 0 :
169
+ return llm_inputs
170
+ assert isinstance (multi_modal_data ['audio' ], list ) and isinstance (multi_modal_data ['audio' ][0 ], tuple )
168
171
169
- audios = multi_modal_data ['audio' ]
170
172
processor = cached_get_processor (ctx .model_config .model )
171
- if len (audios ) == 0 :
172
- return llm_inputs
173
+ audios = [_ [0 ] for _ in multi_modal_data ['audio' ]]
173
174
174
175
audio_inputs = processor .feature_extractor (audios ,
175
176
sampling_rate = 16000 ,
@@ -227,14 +228,15 @@ def input_mapper_for_qwen2_audio(
227
228
}
228
229
return batch_data
229
230
try :
230
- batch_data = audio_feature_extractor (multi_modal_data ,
231
+ audios = [_ [0 ] for _ in multi_modal_data ]
232
+ batch_data = audio_feature_extractor (audios ,
231
233
sampling_rate = 16000 ,
232
234
return_attention_mask = True ,
233
235
padding = "max_length" ,
234
236
return_tensors = "pt" ).data
235
237
batch_data ["feature_attention_mask" ] = batch_data .pop ("attention_mask" )
236
238
except Exception :
237
- logger .error ("Failed to process audio (%s)" , multi_modal_data )
239
+ logger .error ("Failed to process audio (%s)" , audios )
238
240
raise
239
241
240
242
return MultiModalInputs (batch_data )
0 commit comments