58 changes: 47 additions & 11 deletions benchmarks/README.md
@@ -43,20 +43,26 @@ become available.
<tr>
<td><strong>HuggingFace</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">🟡</td>
<td>Specify your dataset path on HuggingFace</td>
</tr>
<tr>
<td><strong>VisionArena</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
</tbody>
</table>

✅: supported

🚧: to be supported

🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
formats, please consider contributing.
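
For reference, a minimal sketch of the record shape this loader expects (an assumption based on the fields it reads, not an exact schema): a `conversations` list with at least two chat turns, plus an optional `image` field for multimodal prompts.

``` python
# Illustrative record only; the "from" keys follow the LLaVA-OneVision
# convention and the exact image payload varies by dataset.
example_record = {
    "image": "<PIL.Image.Image or image payload>",  # optional multimodal input
    "conversations": [
        {"from": "human", "value": "What is shown in the image?"},
        {"from": "gpt", "value": "A cat sitting on a windowsill."},
    ],
}
```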

**Note**: VisionArena’s `dataset-name` should be set to `hf`

---
@@ -79,7 +85,7 @@ NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```

If successful, you will see the following output
@@ -123,7 +129,7 @@ DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'

python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
@@ -140,35 +146,65 @@ python3 benchmarks/benchmark_serving.py \
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="benchmarks/sonnet.txt"
DATASET_PATH="vllm/benchmarks/sonnet.txt"

python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}"
```

If successful, you will see the following output

```
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```

### VisionArena Benchmark for Vision Language Models

``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"

python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--backend "vllm-chat" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-split "${DATASET_SPLIT}"
```

The `num prompt tokens` now includes image token counts

```
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
```

### Benchmark with LoRA Adapters

``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"

python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \
65 changes: 43 additions & 22 deletions benchmarks/benchmark_dataset.py
@@ -46,7 +46,7 @@ class SampleRequest:
Represents a single inference request for benchmarking.
"""

prompt: Union[str, Any]
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@@ -84,6 +84,20 @@ def __init__(
if random_seed is not None else self.DEFAULT_SEED)
self.data = None

def apply_multimodal_chat_transformation(
self,
prompt: str,
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
"""
Transform a prompt and optional multimodal content into a chat format.
This method is used for chat models that expect a specific
conversation format.
"""
content = [{"text": prompt, "type": "text"}]
if mm_content is not None:
content.append(mm_content)
return [{"role": "user", "content": content}]

def load_data(self) -> None:
"""
Load data from the dataset path into self.data.
@@ -338,6 +352,7 @@ def sample(self,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
samples: list = []
for entry in self.data:
@@ -358,6 +373,9 @@
skip_min_output_len_check=output_len
is not None):
continue
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
prompt=prompt,
@@ -550,34 +568,32 @@ def load_data(self) -> None:
split=self.dataset_split,
streaming=True,
)

if "conversations" not in self.data.features:
raise ValueError("HF Dataset must have a 'conversations' column.")

if self.data.features is None or "conversations" \
not in self.data.features:
raise ValueError(
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2)

def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
sampled_requests = []
dynamic_output = output_len is None

for item in self.data:
if len(sampled_requests) >= num_requests:
break

conv = item["conversations"]
prompt, completion = conv[0]["value"], conv[1]["value"]

lora_request, tokenizer = self.get_random_lora_request(
tokenizer, lora_path=lora_path, max_loras=max_loras)

prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
@@ -587,16 +603,20 @@ def sample(self,
if dynamic_output and not is_valid_sequence(
prompt_len, completion_len):
continue

mm_content = process_image(
item["image"]) if "image" in item else None
if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len and output len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
lora_request=lora_request,
))
return sampled_requests

@@ -606,7 +626,7 @@ def sample(self,
# -----------------------------------------------------------------------------


class VisionArenaDataset(HuggingFaceDataset):
"""
Vision Arena Dataset.
"""
@@ -617,14 +637,9 @@ class VisionArenaDataset(BenchmarkDataset):

def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)

if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
@@ -645,18 +660,24 @@ def load_data(self) -> None:
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
# TODO (jenniferzhao): Add support for offline benchmark sampling
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item["turns"][0][0]["content"]
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append(
SampleRequest(
prompt=prompt,