Merged
Commits (32)
ea4ac91  support qwen2audio (Oct 10, 2024)
262a401  fix format (Oct 11, 2024)
aa4ad09  fix format (Oct 11, 2024)
52e6c94  fix format (Oct 11, 2024)
3fa3e83  update BaseMultiModalItemTracker._placeholder_str and offline_inferen… (Oct 12, 2024)
dd53a22  fix format (Oct 12, 2024)
38b28b3  fix format (Oct 12, 2024)
6a75b32  fix format (Oct 12, 2024)
a7492fd  fix format (Oct 12, 2024)
2eb71ac  fix format (Oct 12, 2024)
5dff6ea  fix bug (faychu, Oct 18, 2024)
a23e1c4  fix format (faychu, Oct 18, 2024)
fb0c391  fix format (faychu, Oct 18, 2024)
d70e9b9  update support models in docs (faychu, Oct 18, 2024)
833cb93  fix format (faychu, Oct 18, 2024)
e3b658d  update (faychu, Oct 18, 2024)
6ae9c33  Get PP to pass (DarkLight1337, Oct 18, 2024)
40a7dd9  Update imports (DarkLight1337, Oct 18, 2024)
092fa07  remove feature_extractor in processor (faychu, Oct 22, 2024)
86c9923  fix format (faychu, Oct 22, 2024)
e5ba469  fix format (faychu, Oct 22, 2024)
a013d24  fix docs (faychu, Oct 22, 2024)
c9ae514  rm unused print info (faychu, Oct 22, 2024)
3eedd46  fix sample rate problem (faychu, Oct 23, 2024)
a8413c2  fix format (faychu, Oct 23, 2024)
0a2efa8  Add text-only input for testing (DarkLight1337, Oct 23, 2024)
9f75ec4  Update vllm/model_executor/models/qwen2_audio.py (faychu, Oct 23, 2024)
9ffefb3  Update vllm/model_executor/models/qwen2_audio.py (faychu, Oct 23, 2024)
7a2c6f2  Fix failure when max-audios is 0 (DarkLight1337, Oct 23, 2024)
cb4be14  format (DarkLight1337, Oct 23, 2024)
663a8fe  Fix online inference (DarkLight1337, Oct 23, 2024)
2261d8f  Merge branch 'main' into qwen2audio_new (DarkLight1337, Oct 23, 2024)
docs/source/models/supported_models.rst (6 additions, 0 deletions)
@@ -459,6 +459,12 @@ Text Generation
      - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
      -
      - ✅︎
+   * - :code:`Qwen2AudioForConditionalGeneration`
+     - Qwen2-Audio
+     - T + A\ :sup:`+`
+     - :code:`Qwen/Qwen2-Audio-7B-Instruct`
+     -
+     - ✅︎
    * - :code:`Qwen2VLForConditionalGeneration`
      - Qwen2-VL
      - T + I\ :sup:`E+` + V\ :sup:`+`
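As a concrete reading of the new table row, the sketch below loads the model offline. The model name comes from the row itself; max_model_len and the audio cap of 2 are assumptions borrowed from the example script in the next file.

from vllm import LLM

# "A+" in the modalities column means multiple audio items per prompt;
# the per-prompt cap is set via limit_mm_per_prompt (values assumed here).
llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct",
          max_model_len=4096,
          limit_mm_per_prompt={"audio": 2})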
examples/offline_inference_audio_language.py (38 additions, 16 deletions)
@@ -12,14 +12,15 @@
  from vllm.utils import FlexibleArgumentParser

  audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
- question_per_audio_count = [
-     "What is recited in the audio?",
-     "What sport and what nursery rhyme are referenced?"
- ]
+ question_per_audio_count = {
+     0: "What is 1+1?",
+     1: "What is recited in the audio?",
+     2: "What sport and what nursery rhyme are referenced?"
+ }


  # Ultravox 0.3
- def run_ultravox(question, audio_count):
+ def run_ultravox(question: str, audio_count: int):
      model_name = "fixie-ai/ultravox-v0_3"

      tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -42,9 +43,29 @@ def run_ultravox(question, audio_count):
      return llm, prompt, stop_token_ids


- model_example_map = {
-     "ultravox": run_ultravox,
- }
+ # Qwen2-Audio
+ def run_qwen2_audio(question: str, audio_count: int):
+     model_name = "Qwen/Qwen2-Audio-7B-Instruct"
+
+     llm = LLM(model=model_name,
+               max_model_len=4096,
+               max_num_seqs=5,
+               limit_mm_per_prompt={"audio": audio_count})
+
+     audio_in_prompt = "".join([
+         f"Audio {idx+1}: "
+         f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
+     ])
+
+     prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+               "<|im_start|>user\n"
+               f"{audio_in_prompt}{question}<|im_end|>\n"
+               "<|im_start|>assistant\n")
+     stop_token_ids = None
+     return llm, prompt, stop_token_ids


+ model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio}


  def main(args):
@@ -54,24 +75,25 @@ def main(args):

      audio_count = args.num_audios
      llm, prompt, stop_token_ids = model_example_map[model](
-         question_per_audio_count[audio_count - 1], audio_count)
+         question_per_audio_count[audio_count], audio_count)

      # We set temperature to 0.2 so that outputs can be different
      # even when all prompts are identical when running batch inference.
      sampling_params = SamplingParams(temperature=0.2,
                                       max_tokens=64,
                                       stop_token_ids=stop_token_ids)

-     assert args.num_prompts > 0
-     inputs = {
-         "prompt": prompt,
-         "multi_modal_data": {
+     mm_data = {}
+     if audio_count > 0:
+         mm_data = {
              "audio": [
                  asset.audio_and_sample_rate
                  for asset in audio_assets[:audio_count]
              ]
-         },
-     }
+         }
+
+     assert args.num_prompts > 0
+     inputs = {"prompt": prompt, "multi_modal_data": mm_data}
      if args.num_prompts > 1:
          # Batch inference
          inputs = [inputs] * args.num_prompts
@@ -100,7 +122,7 @@ def main(args):
      parser.add_argument("--num-audios",
                          type=int,
                          default=1,
-                         choices=[1, 2],
+                         choices=[0, 1, 2],
                          help="Number of audio items per prompt.")

      args = parser.parse_args()
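The updated script can be exercised end-to-end as below. Only --num-audios is visible in this diff; the --model-type flag name is an assumption based on the script's existing parser. Passing --num-audios 0 hits the new text-only path, which maps to the "What is 1+1?" question.

# Hypothetical invocations; flag names other than --num-audios are assumed.
python examples/offline_inference_audio_language.py --model-type qwen2_audio --num-audios 2
python examples/offline_inference_audio_language.py --model-type qwen2_audio --num-audios 0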
tests/distributed/test_pipeline_parallel.py (1 addition, 0 deletions)
@@ -199,6 +199,7 @@ def iter_params(self, model_name: str):
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}
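The new entry can be run in isolation; the -k expression below assumes the parametrized test IDs embed the model name, and pipeline parallelism needs a multi-GPU host.

# Sketch only: the exact test ID format is an assumption.
pytest -x tests/distributed/test_pipeline_parallel.py -k Qwen2-Audio-7B-Instruct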
vllm/entrypoints/chat_utils.py (4 additions, 1 deletion)
@@ -196,7 +196,10 @@ def _placeholder_str(self, modality: ModalityStr,
          elif modality == "audio":
              if model_type == "ultravox":
                  return "<|reserved_special_token_0|>"
-             raise TypeError(f"Unknown {modality} model type: {model_type}")
+             if model_type == "qwen2_audio":
+                 return (f"Audio {current_count}: "
+                         f"<|audio_bos|><|AUDIO|><|audio_eos|>")
+             raise TypeError(f"Unknown model type: {model_type}")
          elif modality == "video":
              if model_type == "qwen2_vl":
                  return "<|vision_start|><|video_pad|><|vision_end|>"
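Together with the "Fix online inference" commit, this placeholder makes the OpenAI-compatible server usable with audio. A minimal sketch, assuming a server launched via "vllm serve Qwen/Qwen2-Audio-7B-Instruct" and a hypothetical audio URL; each audio_url part is expanded into the Audio N placeholder string shown above.

from openai import OpenAI

# Point the client at a local vLLM server (the API key is a dummy value).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2-Audio-7B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            # Expands to "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>"
            {"type": "audio_url",
             "audio_url": {"url": "https://example.com/sample.wav"}},
            {"type": "text", "text": "What is recited in the audio?"},
        ],
    }],
)
print(completion.choices[0].message.content)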