
Commit 5dff6ea

fix bug
1 parent 2eb71ac commit 5dff6ea

Showing 2 changed files with 12 additions and 9 deletions.


examples/offline_inference_audio_language.py

Lines changed: 3 additions & 2 deletions
@@ -51,10 +51,11 @@ def run_qwen2_audio(question, audio_count):
               max_num_seqs=5,
               limit_mm_per_prompt={"audio": audio_count})
 
+    audio_in_prompt = "".join([f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)])
+
     prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
               "<|im_start|>user\n"
-              "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
-              f"{question}<|im_end|>\n"
+              f"{audio_in_prompt}{question}<|im_end|>\n"
               "<|im_start|>assistant\n")
     stop_token_ids = None
     return llm, prompt, stop_token_ids
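As a quick illustration, here is a minimal standalone sketch of the prompt the updated example builds for two audios. Only the prompt-building lines come from the diff; audio_count and question are made-up values:

audio_count = 2
question = "What is recited in the audio?"  # illustrative question

audio_in_prompt = "".join([
    f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
    for idx in range(audio_count)
])

prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          "<|im_start|>user\n"
          f"{audio_in_prompt}{question}<|im_end|>\n"
          "<|im_start|>assistant\n")

print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>
# Audio 2: <|audio_bos|><|AUDIO|><|audio_eos|>
# What is recited in the audio?<|im_end|>
# <|im_start|>assistant

Before this change the example hard-coded a single "Audio 1:" placeholder, so prompts with more than one audio never matched audio_count.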

vllm/model_executor/models/qwen2_audio.py

Lines changed: 9 additions & 7 deletions
@@ -92,13 +92,13 @@ def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
 
     audio_token_index = ctx.model_config.hf_config.audio_token_index
 
-    dummy_seqdata = SequenceData.from_token_counts(
+    dummy_seqdata = SequenceData.from_prompt_token_counts(
         (audio_token_index, max_llm_audio_tokens),
         (0, seq_len - max_llm_audio_tokens),
     )
     dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
     return dummy_seqdata, {
-        "audio": dummy_audio if num_audios == 1 else [dummy_audio] * num_audios
+        "audio": [(dummy_audio, 16000)] * num_audios
     }
 
 
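Besides the rename to from_prompt_token_counts, this hunk switches dummy audio entries from bare arrays to (waveform, sampling_rate) tuples. A small self-contained sketch of the resulting structure, using numpy as the file does; the token and audio counts are made-up values:

import numpy as np

max_llm_audio_tokens = 4  # made-up value for illustration
num_audios = 2            # made-up value for illustration

# Same construction as in the diff: one zero waveform per audio, now
# paired with its sampling rate (16 kHz).
dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
mm_data = {"audio": [(dummy_audio, 16000)] * num_audios}

assert isinstance(mm_data["audio"], list)
assert isinstance(mm_data["audio"][0], tuple)  # (waveform, sampling_rate)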
@@ -165,11 +165,12 @@ def input_processor_for_qwen2_audio(ctx: InputContext,
     multi_modal_data = llm_inputs.get("multi_modal_data")
     if multi_modal_data is None or "audio" not in multi_modal_data:
         return llm_inputs
+    if len(multi_modal_data["audio"]) == 0:
+        return llm_inputs
+    assert isinstance(multi_modal_data['audio'], list) and isinstance(multi_modal_data['audio'][0], tuple)
 
-    audios = multi_modal_data['audio']
     processor = cached_get_processor(ctx.model_config.model)
-    if len(audios) == 0:
-        return llm_inputs
+    audios = [_[0] for _ in multi_modal_data['audio']]
 
     audio_inputs = processor.feature_extractor(audios,
                                                sampling_rate=16000,
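The comprehension [_[0] for _ in multi_modal_data['audio']] keeps each tuple's waveform and discards its sampling rate before the HF feature extractor is called. An equivalent, arguably more explicit spelling is sketched below (illustrative only; the diff itself uses the _[0] form, and the data here is made up):

import numpy as np

# Made-up input in the post-commit format: (waveform, sampling_rate) tuples.
multi_modal_data = {"audio": [(np.zeros(16000), 16000),
                              (np.zeros(32000), 16000)]}

audios = [waveform for waveform, _sampling_rate in multi_modal_data["audio"]]
assert all(isinstance(a, np.ndarray) for a in audios)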
@@ -227,14 +228,15 @@ def input_mapper_for_qwen2_audio(
         }
         return batch_data
     try:
-        batch_data = audio_feature_extractor(multi_modal_data,
+        audios = [_[0] for _ in multi_modal_data]
+        batch_data = audio_feature_extractor(audios,
                                              sampling_rate=16000,
                                              return_attention_mask=True,
                                              padding="max_length",
                                              return_tensors="pt").data
         batch_data["feature_attention_mask"] = batch_data.pop("attention_mask")
     except Exception:
-        logger.error("Failed to process audio (%s)", multi_modal_data)
+        logger.error("Failed to process audio (%s)", audios)
         raise
 
     return MultiModalInputs(batch_data)
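Taken together, these changes mean callers pass each audio as a (waveform, sampling_rate) tuple under the "audio" key. A hedged end-to-end sketch, assuming vLLM's offline LLM API and librosa for loading; the model name follows the example file, while the audio path, question, and sampling parameters are placeholders:

import librosa
from vllm import LLM, SamplingParams

# Placeholder audio file, resampled to the 16 kHz the model expects.
waveform, sr = librosa.load("sample.wav", sr=16000)

llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct",
          max_num_seqs=5,
          limit_mm_per_prompt={"audio": 1})

prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          "<|im_start|>user\n"
          "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
          "What is recited in the audio?<|im_end|>\n"
          "<|im_start|>assistant\n")

outputs = llm.generate(
    {
        "prompt": prompt,
        # Post-commit format: a list of (waveform, sampling_rate) tuples.
        "multi_modal_data": {"audio": [(waveform, sr)]},
    },
    SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)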
