diff --git a/benchmarking/benchmark-values.yaml b/benchmarking/benchmark-values.yaml new file mode 100644 index 000000000..1ef8ae71b --- /dev/null +++ b/benchmarking/benchmark-values.yaml @@ -0,0 +1,63 @@ +job: + image: + repository: quay.io/inference-perf/inference-perf + tag: "latest" # Defaults to .Chart.AppVersion + serviceAccountName: "" + nodeSelector: {} + # Example resources: + # resources: + # requests: + # cpu: "1" + # memory: "4Gi" + # limits: + # cpu: "2" + # memory: "8Gi" + resources: {} + +logLevel: INFO + +# A GCS bucket path that points to the dataset file. +# The file will be copied from this path to the local file system +# at /dataset/dataset.json for use during the run. +# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/dataset.json. +gcsPath: "" + +# hfToken optionally creates a secret with the specified token. +# Can be set using helm install --set hfToken=YOUR_HF_TOKEN (the key is case-sensitive). +hfToken: "" + +config: + load: + type: constant + interval: 15 + stages: + - rate: 10 + duration: 20 + - rate: 20 + duration: 20 + - rate: 30 + duration: 20 + api: + type: completion + streaming: true + server: + type: vllm + model_name: meta-llama/Llama-3.1-8B-Instruct + base_url: http://0.0.0.0:8000 + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct + data: + type: shareGPT + metrics: + type: prometheus + prometheus: + google_managed: true + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + prometheus: + summary: true + per_stage: true diff --git a/benchmarking/benchmark.ipynb b/benchmarking/benchmark.ipynb new file mode 100644 index 000000000..fe072dd3d --- /dev/null +++ b/benchmarking/benchmark.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "executionInfo": { + "elapsed": 391, + "status": "ok", + "timestamp": 1741734317446, + "user": { + "displayName": "Cong Liu", + "userId": "18222691451061354557" 
}, + "user_tz": 420 + }, + "id": "ziJD5zt0c1Rt" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "<>:8: SyntaxWarning: invalid escape sequence '\\d'\n", + "<>:8: SyntaxWarning: invalid escape sequence '\\d'\n", + "/tmp/ipykernel_2612482/2189011373.py:8: SyntaxWarning: invalid escape sequence '\\d'\n", + " FILE_MATCHER='.*stage_\\d_lifecycle_metrics*'\n" + ] + } + ], + "source": [ + "#@title Configuration. Edit this before running the rest.\n", + "\n", + "OUTPUT_DIR='output'\n", + "RUN_ID='default-run'\n", + "# Path to the benchmark dir under `gateway-api-inference-extension/benchmark`\n", + "BENCHMARK_DIR =\"./\"\n", + "# A regex to match the output file name.\n", + "FILE_MATCHER='.*stage_\\d_lifecycle_metrics*'\n", + "INTERACTIVE_PLOT='False'" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "executionInfo": { + "elapsed": 33, + "status": "ok", + "timestamp": 1741735749209, + "user": { + "displayName": "Cong Liu", + "userId": "18222691451061354557" + }, + "user_tz": 420 + }, + "id": "dB7xALgLawN-" + }, + "outputs": [], + "source": [ + "#@title Plot Helper\n", + "import os\n", + "import pandas as pd\n", + "import re\n", + "import json\n", + "from collections import OrderedDict\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import math\n", + "from sklearn.metrics import r2_score\n", + "import logging\n", + "level = logging.INFO\n", + "logger = logging.getLogger(__name__)\n", + "logger.setLevel(level)\n", + "handler = logging.StreamHandler() # This sends output to the console\n", + "handler.setLevel(level) # Set handler level\n", + "logger.addHandler(handler)\n", + "\n", + "title_fontsize = 18\n", + "axis_label_fontsize = 18\n", + "legend_fontsize = 16\n", + "tick_label_fontsize = 14\n", + "\n", + "# Encapsulates some basic information needed to plot metrics.\n", + "class XY:\n", + " def __init__(self, x: str, y: str, x_label=None, y_label=None):\n", + " self.x = x\n", + " 
self.y = y\n", + " self.x_label = x if x_label is None else x_label\n", + " self.y_label = y if y_label is None else y_label\n", + "\n", + "NUM_PLOTS_PER_ROW = 4\n", + "# The arguments need to match the metric name fields generated by the benchmark tool.\n", + "CORE_METRICS = [\n", + " XY(x = 'load_summary_requested_rate', x_label = 'QPS', y = 'successes_throughput_output_tokens_per_sec', y_label= 'output_tokens_per_sec'),\n", + " XY(x = \"load_summary_requested_rate\", x_label = 'QPS', y = \"successes_latency_time_per_output_token_p90\", y_label= 'p90_per_output_token_latency'),\n", + " XY(x = \"load_summary_requested_rate\", x_label = 'QPS', y = \"successes_latency_request_latency_p90\", y_label= 'p90_latency'),\n", + " XY(x = \"load_summary_requested_rate\", x_label = 'QPS', y = \"load_summary_count\", y_label= 'num_prompts_attempted'),\n", + " XY(x = \"load_summary_requested_rate\", x_label = 'QPS', y = \"successes_count\", y_label= 'num_prompts_succeeded'),\n", + "]\n", + "SANITY_CHECK_METRICS = [\n", + " XY(x = 'load_summary_requested_rate', x_label = 'QPS', y = 'successes_throughput_requests_per_sec', y_label= 'throughput_rps'),\n", + " XY(x = 'load_summary_requested_rate', x_label = 'QPS', y = 'successes_throughput_input_tokens_per_sec', y_label= 'total_input_tokens'),\n", + " XY(x = 'load_summary_requested_rate', x_label = 'QPS', y = 'successes_throughput_output_tokens_per_sec', y_label= 'total_output_token'),\n", + " XY(x = 'load_summary_requested_rate', x_label = 'QPS', y = 'prompt_len_mean', y_label= 'avg_input_len'),\n", + " XY(x = 'load_summary_requested_rate', x_label = 'QPS', y = 'output_len_mean', y_label= 'avg_output_len'),\n", + "]\n", + "\n", + "class Label:\n", + " def __init__(self, name, alias=None):\n", + " self.name = name\n", + " self.alias = name if alias is None else alias\n", + "\n", + "ALL_METRICS = CORE_METRICS + SANITY_CHECK_METRICS\n", + "\n", + "class Plotter:\n", + " def __init__(self, run_id, labels=None, metrics=CORE_METRICS, 
num_plots_per_row=5, interactive=False, annotate=False, output_dir=OUTPUT_DIR):\n", + " self.run_id = run_id\n", + " self.labels = labels\n", + " self.metrics = metrics\n", + " self.num_plots_per_row = num_plots_per_row\n", + " self.interactive = interactive\n", + " self.annotate = annotate\n", + " self.output_dir = output_dir\n", + " self.data = load_data(self.labels, self.run_id, self.output_dir)\n", + " self.groups = group_data(self.data, self.metrics)\n", + "\n", + " def withRunId(self, run_id):\n", + " return Plotter(run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", + "\n", + " def withLabels(self, labels):\n", + " return Plotter(self.run_id, labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", + "\n", + " def withMetrics(self, metrics):\n", + " return Plotter(self.run_id, self.labels, metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", + "\n", + " def withOutputDir(self, output_dir):\n", + " return Plotter(self.run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, output_dir)\n", + "\n", + " def plot_bar(self):\n", + " \n", + " logger.debug(\"Plotting run id...\")\n", + " plot_bar(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + "\n", + " def plot_delta(self):\n", + " \"\"\"\n", + " Plot the delta between two labels.\n", + " \"\"\"\n", + " logger.debug(\"Plotting delta for run id...\")\n", + " plot_delta(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + "\n", + "def filepaths(root_dir):\n", + " \"\"\"\n", + " Recursively reads files within a directory and returns a list of file paths.\n", + " \"\"\"\n", + "\n", + " filepaths = []\n", + " for dirpath, dirnames, filenames in os.walk(root_dir):\n", + " for filename in filenames:\n", + " filepath = 
os.path.join(dirpath, filename)\n", + " filepaths.append(filepath)\n", + " return filepaths\n", + "\n", + "def flatten_server_metrics(server_metrics):\n", + " \"\"\"\n", + " Flattens the server metrics json to a single level.\n", + " \"\"\"\n", + " flattend = {}\n", + " for k, v in server_metrics.items():\n", + " if isinstance(v, dict):\n", + " for k2, v2 in v.items():\n", + " flattend[k + \".\" + k2] = v2\n", + "\n", + " return flattend\n", + "\n", + "def load_data(labels, run_id, output_dir=OUTPUT_DIR):\n", + " data_path =f\"{BENCHMARK_DIR}/{output_dir}/{run_id}\"\n", + " records = []\n", + " logger.debug(f\"Loading data for {data_path}\")\n", + " for file in filepaths(data_path):\n", + " for label in labels:\n", + " regex = f\".*/{label.name}/results/json/{FILE_MATCHER}.json\"\n", + " logger.debug(f\"matching file {file} for regex {regex} and label {label}\")\n", + " if re.match(regex, file):\n", + " logger.debug(f\"found match file {file} for regex {regex} and label {label}\")\n", + " with open(file, 'r') as f:\n", + " raw_data = json.load(f)\n", + " sample_data = {\n", + " 'file_name': f.name,\n", + " 'label': label.alias,\n", + " }\n", + " for k, v in raw_data.items():\n", + " if isinstance(v, dict):\n", + " for k2, v2 in v.items():\n", + " if isinstance(v2, dict):\n", + " for k3, v3 in v2.items():\n", + " if isinstance(v3, dict):\n", + " for k4, v4 in v3.items():\n", + " sample_data[f\"{k}_{k2}_{k3}_{k4}\"] = v4\n", + " else:\n", + " sample_data[f\"{k}_{k2}_{k3}\"] = v3\n", + " else:\n", + " sample_data[f\"{k}_{k2}\"] = v2\n", + " else:\n", + " sample_data[k] = v\n", + " if 'config_load_summary_requested_rate' in sample_data and 'config_num_models' in sample_data:\n", + " sample_data['load_summary_requested_rate'] = sample_data['config_load_summary_requested_rate'] * sample_data['config_num_models']\n", + " records.append(sample_data)\n", + " all_data = pd.DataFrame.from_records(records, index='file_name') if len(records) > 0 else pd.DataFrame()\n", + " 
return all_data\n", + "\n", + "def group_data(all_data, metrics=CORE_METRICS):\n", + " try:\n", + " data = all_data.sort_values(by=['load_summary_requested_rate'], ascending=True).copy()\n", + " except Exception as e:\n", + " print(f\"Error sorting data: {e}\")\n", + " return None\n", + "\n", + " # Ensure there is exactly one benchmark result per label and x-axis for each\n", + " # metric.\n", + " x_axes = set()\n", + " for m in metrics:\n", + " x_axes.add(m.x)\n", + "\n", + " for x in x_axes:\n", + " # Check for missing columns before grouping\n", + " if 'label' not in data.columns or x not in data.columns:\n", + " print(f\"Missing 'label' or '{x}' column in data, skipping grouping.\")\n", + " continue\n", + " sizes = data.groupby(by=['label', x], dropna=True).size()\n", + " for index, v in sizes.items():\n", + " if v > 1:\n", + " label, _ = index\n", + " # print(f\"Multiple benchmark results for the same label ({label}), and x-axis ({x}). {index}: {v}. Please use more selective file filters.\")\n", + " # raise ValueError(f\"Multiple benchmark results for the same label ({label}), and x-axis ({x}). 
Please use more selective file filters.\")\n", + "\n", + " # Group by label.\n", + " if 'label' in data.columns:\n", + " groups = data.groupby(by=['label'],sort=True)\n", + " return groups\n", + " else:\n", + " print(\"Missing 'label' column, cannot group data.\")\n", + " return None\n", + "\n", + "def compute_r2_for_metrics(groups, metrics, label_before, label_after):\n", + " print(\"\\nCoefficient of Determination (R^2) between before and after runs:\")\n", + " for m in metrics:\n", + " try:\n", + " df_b = groups.get_group(label_before).set_index('load_summary_requested_rate')\n", + " df_a = groups.get_group(label_after).set_index('load_summary_requested_rate')\n", + " except KeyError:\n", + " print(f\" Skipping {m.y}: missing group data for '{label_before}' or '{label_after}'\")\n", + " continue\n", + " common = sorted(set(df_b.index).intersection(df_a.index))\n", + " yb = df_b.loc[common, m.y].values\n", + " ya = df_a.loc[common, m.y].values\n", + " mask = ~np.isnan(yb) & ~np.isnan(ya)\n", + " yb, ya = yb[mask], ya[mask]\n", + " if len(yb) > 1 and np.any(yb != 0):\n", + " r2 = r2_score(yb, ya)\n", + " print(f\" {m.y:<30} R^2 = {r2:.4f}\")\n", + " else:\n", + " print(f\" {m.y:<30} insufficient data for R^2 calculation\")\n", + "\n", + "\n", + "def init_plot(metrics, num_plots_per_row=NUM_PLOTS_PER_ROW):\n", + " num_plots_per_row = min(num_plots_per_row, len(metrics))\n", + " row_count = math.ceil(len(metrics) / num_plots_per_row)\n", + " fig, axes = plt.subplots(nrows=row_count, ncols=num_plots_per_row, figsize=(20, 5*row_count), tight_layout=True)\n", + " if row_count == 1 and num_plots_per_row == 1:\n", + " axes = [axes]\n", + " return fig, axes\n", + "\n", + "def plot_metrics(metrics, plot_func, num_plots_per_row=NUM_PLOTS_PER_ROW, fig=None, axes=None):\n", + " \"\"\"\n", + " plot_func: a function in the form of def plot_func(ax:~matplotlib.axes.Axes , m: XY):\n", + " \"\"\"\n", + " logger.debug(f'Plotting metrics: {metrics}')\n", + " num_plots_per_row = 
min(num_plots_per_row, len(metrics))\n", + " if fig is None or axes is None:\n", + " logger.debug(f'Creating new figure and axes')\n", + " fig, axes = init_plot(metrics, num_plots_per_row)\n", + " row_count = math.ceil(len(metrics) / num_plots_per_row)\n", + " for i, m in enumerate(metrics):\n", + " row = math.floor(i/num_plots_per_row)\n", + " col = i%num_plots_per_row\n", + " if row_count == 1:\n", + " curAx = axes[col]\n", + " else:\n", + " curAx = axes[row, col]\n", + " plot_func(curAx, m)\n", + " return fig, axes\n", + "\n", + "def plot_bar(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=INTERACTIVE_PLOT, annotate=False):\n", + " labels = [label.alias for label in labels]\n", + " logger.debug(f'Prnting bar chart for {labels}')\n", + " logger.debug(f'groups: {groups}')\n", + " dataframes = []\n", + " for label in labels:\n", + " try:\n", + " dataframes.append(groups.get_group((label,)))\n", + " except:\n", + " logger.debug(f\"No data found for label {label}\")\n", + " continue\n", + " y_columns = [m.y for m in metrics]\n", + " logger.debug(f'y_columns: {y_columns}')\n", + " logger.debug(f'dataframes: {dataframes}')\n", + "\n", + " # 1. Combine all request rates\n", + " all_load_summary_requested_rates = set()\n", + " for df in dataframes:\n", + " all_load_summary_requested_rates.update(df['load_summary_requested_rate'].astype(int))\n", + " all_load_summary_requested_rates = sorted(list(all_load_summary_requested_rates))\n", + "\n", + " # 2. 
Prepare data for plotting: Create a nested dictionary\n", + " plot_data = {y_col: {label: {} for label in labels} for y_col in y_columns}\n", + "\n", + " for i, df in enumerate(dataframes):\n", + " label = labels[i]\n", + " df_dict = df.set_index('load_summary_requested_rate').to_dict()\n", + " for y_col in y_columns:\n", + " for load_summary_requested_rate in all_load_summary_requested_rates:\n", + " plot_data[y_col][label][load_summary_requested_rate] = df_dict.get(y_col, {}).get(load_summary_requested_rate, np.nan)\n", + "\n", + " logger.debug(f'Plot_data: {plot_data}')\n", + "\n", + " # 3. Plotting\n", + " def plot_func(curAx, m):\n", + " num_load_summary_requested_rates = len(all_load_summary_requested_rates)\n", + " num_labels = len(labels)\n", + " x = np.arange(num_load_summary_requested_rates) # the label locations (x-axis positions)\n", + " width = 0.4 / num_labels # width of the bars\n", + "\n", + " for i, label in enumerate(labels):\n", + " bar_x = x - (width*num_labels)/2 + i*width + width/2\n", + " #Extract y-values to plot\n", + " y_values = [plot_data[m.y][label][rr] for rr in all_load_summary_requested_rates]\n", + "\n", + " rects = curAx.bar(bar_x, y_values, width, label=label)\n", + " if annotate:\n", + " for rect, val in zip(rects, y_values):\n", + " if not np.isnan(val):\n", + " height = rect.get_height()\n", + " curAx.annotate(f'{val:.2f}',\n", + " xy=(rect.get_x() + rect.get_width() / 2, height),\n", + " xytext=(0, 3), # 3 points vertical offset\n", + " textcoords=\"offset points\",\n", + " ha='center', va='bottom')\n", + " # Add labels, title, and legend\n", + " curAx.set_xlabel(m.x_label, fontsize=axis_label_fontsize)\n", + " curAx.set_ylabel(m.y_label, fontsize=axis_label_fontsize)\n", + " curAx.set_xticks(x)\n", + " curAx.set_xticklabels(all_load_summary_requested_rates)\n", + " curAx.tick_params(axis='both', labelsize=tick_label_fontsize)\n", + " curAx.legend(fontsize=legend_fontsize, loc='upper left', frameon=True, framealpha=0.8, 
edgecolor='black')\n", + " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", + " fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", + " plt.show()\n", + "\n", + "def plot_delta(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=True, annotate=False):\n", + " \"\"\"\n", + " Plot the delta between base_label and compare_label for each metric.\n", + " A positive delta means compare_label has a higher value than base_label.\n", + " \"\"\"\n", + " base_label = labels[0].name\n", + " compare_label = labels[1].name\n", + " logger.debug(f'Printing delta chart for {base_label} vs {compare_label}')\n", + "\n", + " try:\n", + " base_df = groups.get_group((base_label,))\n", + " compare_df = groups.get_group((compare_label,))\n", + " except Exception as e:\n", + " logger.error(f\"Error getting data for labels {base_label} and {compare_label}: {e}\")\n", + " return\n", + "\n", + " y_columns = [m.y for m in metrics]\n", + "\n", + " # 1. Find common request rates\n", + " base_rates = set(base_df['load_summary_requested_rate'].astype(int))\n", + " compare_rates = set(compare_df['load_summary_requested_rate'].astype(int))\n", + " common_rates = sorted(list(base_rates.intersection(compare_rates)))[:6]\n", + "\n", + " if not common_rates:\n", + " logger.error(f\"No common request rates found between {base_label} and {compare_label}\")\n", + " return\n", + "\n", + " # 2. 
Prepare data for delta calculation\n", + " base_data = base_df.set_index('load_summary_requested_rate').to_dict()\n", + " compare_data = compare_df.set_index('load_summary_requested_rate').to_dict()\n", + "\n", + " # Calculate deltas (compare_label - base_label)\n", + " delta_data = {y_col: {} for y_col in y_columns}\n", + " for y_col in y_columns:\n", + " for rate in common_rates:\n", + " base_val = base_data.get(y_col, {}).get(rate, np.nan)\n", + " compare_val = compare_data.get(y_col, {}).get(rate, np.nan)\n", + "\n", + " if not np.isnan(base_val) and not np.isnan(compare_val):\n", + " delta_data[y_col][rate] = (compare_val - base_val)/base_val*100\n", + " else:\n", + " delta_data[y_col][rate] = np.nan\n", + "\n", + " # 3. Plotting\n", + " def plot_func(curAx, m):\n", + " x = np.arange(len(common_rates))\n", + " y_values = [delta_data[m.y].get(rr, np.nan) for rr in common_rates]\n", + "\n", + " # Determine colors based on positive/negative values\n", + " colors = ['green' if val > 0 else 'blue' for val in y_values]\n", + "\n", + " rects = curAx.bar(x, y_values, 0.6, color=colors)\n", + "\n", + " # Add a horizontal line at y=0\n", + " curAx.axhline(y=0, color='black', linestyle='-', linewidth=1)\n", + "\n", + " if annotate:\n", + " for rect, val in zip(rects, y_values):\n", + " if not np.isnan(val):\n", + " height = rect.get_height()\n", + " # For negative bars, put text above the bar\n", + " vert_align = 'bottom' if val >= 0 else 'top'\n", + " y_offset = 3 if val >= 0 else -3\n", + "\n", + " curAx.annotate(f'{val:.2f}',\n", + " xy=(rect.get_x() + rect.get_width() / 2, val),\n", + " xytext=(0, y_offset), # vertical offset\n", + " textcoords=\"offset points\",\n", + " ha='center', va=vert_align)\n", + "\n", + " # Create a title that shows what this delta represents\n", + " title = f\"Delta: {compare_label} - {base_label} ({m.y})\"\n", + " curAx.set_title(title, fontsize=12)\n", + "\n", + " # Add labels\n", + " curAx.set_xlabel(m.x_label, 
fontsize=axis_label_fontsize)\n", + " #curAx.set_ylabel(f\"% Delta in {m.y_label}\", fontsize=axis_label_fontsize)\n", + " curAx.set_xticks(x)\n", + " curAx.set_xticklabels(common_rates)\n", + " curAx.tick_params(axis='both', labelsize=tick_label_fontsize)\n", + "\n", + " # Create a dummy handle for the legend\n", + " legend_handle = [plt.Rectangle((0,0),1,1,color='green'),\n", + " plt.Rectangle((0,0),1,1,color='blue')]\n", + " legend_label = [f'{compare_label} > {base_label}',\n", + " f'{compare_label} < {base_label}']\n", + "\n", + " return legend_handle, legend_label\n", + "\n", + " # Create plot with metrics\n", + " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", + "\n", + " # Add an overall title for the figure\n", + " fig.suptitle(f\"% Delta Metrics: {compare_label} - {base_label}\",\n", + " fontsize=title_fontsize, y=0.98)\n", + "\n", + " plt.subplots_adjust(bottom=0.15, top=0.9) # Make room for legends\n", + " fig.tight_layout(rect=[0, 0.1, 1, 0.95]) # Adjust the rectangle in which the subplots fit\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "height": 1000 + }, + "executionInfo": { + "elapsed": 2232, + "status": "ok", + "timestamp": 1741735855456, + "user": { + "displayName": "Cong Liu", + "userId": "18222691451061354557" + }, + "user_tz": 420 + }, + "id": "HbGEAOucb_Jn", + "outputId": "faf0304b-92f4-4fa7-ae71-83b8bd987e70" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Coefficient of Determination (R^2) between before and after runs:\n", + " successes_throughput_output_tokens_per_sec R^2 = 1.0000\n", + " successes_latency_time_per_output_token_p90 R^2 = 1.0000\n", + " successes_latency_request_latency_p90 R^2 = 1.0000\n", + " load_summary_count R^2 = 1.0000\n", + " successes_count R^2 = 1.0000\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2612482/1457368969.py:189: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_b = groups.get_group(label_before).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:190: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_a = groups.get_group(label_after).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:189: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_b = groups.get_group(label_before).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:190: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. 
Pass `(name,)` instead of `name` to silence this warning.\n", + " df_a = groups.get_group(label_after).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:189: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_b = groups.get_group(label_before).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:190: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_a = groups.get_group(label_after).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:189: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_b = groups.get_group(label_before).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:190: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_a = groups.get_group(label_after).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:189: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.\n", + " df_b = groups.get_group(label_before).set_index('load_summary_requested_rate')\n", + "/tmp/ipykernel_2612482/1457368969.py:190: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. 
Pass `(name,)` instead of `name` to silence this warning.\n", + " df_a = groups.get_group(label_after).set_index('load_summary_requested_rate')\n" + ] + } + ], + "source": [ + "#@title Plot Result\n", + "# initialize the plotter with the run id and labels. \n", + "# Example labels are 'inference-extension' and 'k8s-svc' if comparing Inference Extension and K8s Service \n", + "# 'regression-before' and 'regression-after' if comparing two different runs of inference extension to see the regression\n", + "\n", + "benchmark_id1 = 'inference-extension' # eg 'regression-before' or 'inference-extension'\n", + "benchmark_id2 = 'inference-extension' # eg 'regression-after' or 'k8s-svc'\n", + "labels = [Label(benchmark_id1), Label(benchmark_id2,)]\n", + "\n", + "# Plot bar chart of metrics\n", + "pl = Plotter(run_id=RUN_ID, labels=labels, output_dir=OUTPUT_DIR)\n", + "pl.plot_bar()\n", + "pl.plot_delta()\n", + "\n", + "# Load & group data to compute R^2\n", + "all_data = load_data(labels, RUN_ID, OUTPUT_DIR)\n", + "groups = group_data(all_data)\n", + "compute_r2_for_metrics(groups, CORE_METRICS,\n", + " label_before=benchmark_id1,\n", + " label_after=benchmark_id2)\n", + "\n" + ] + } + ], + "metadata": { + "colab": { + "last_runtime": { + "build_target": "", + "kind": "local" + }, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Inference Perf (.venv)", + "language": "python", + "name": "inference-perf" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/benchmarking/download-gcs-results.bash b/benchmarking/download-gcs-results.bash new file mode 100755 index 000000000..edefb49d9 --- /dev/null +++ b/benchmarking/download-gcs-results.bash @@ -0,0 +1,32 @@ +#!/bin/bash + +# 
Downloads a file from a GCS bucket. + +# Check if GCS_BUCKET is provided as an argument +if [ -z "$1" ]; then + echo "Usage: $0 [GCS_FOLDER_PATH:DEFAULT=benchmark_results]" + exit 1 +fi + +GCS_BUCKET="$1" +GCS_FOLDER_PATH="${2:-benchmark_results/}" # Default to benchmark_results/ if not provided + +# Env vars to be passed when calling this script. +# The id of the benchmark. This is needed to identify what the benchmark is for. +# It decides the filepath to save the results, which later is used by the jupyter notebook to assign +# the benchmark_id as data labels for plotting. +benchmark_id=${benchmark_id:-"inference-extension"} +# run_id can be used to group different runs of the same benchmarks for comparison. +run_id=${run_id:-"default-run"} +output_dir=${output_dir:-'output'} + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id} + +echo "Creating output directory: ${benchmark_output_dir}/results/json/" +mkdir -p "${benchmark_output_dir}/results/json/" + +echo "Downloading gs://${GCS_BUCKET}/${GCS_FOLDER_PATH} to ${benchmark_output_dir}/results/json/" +gsutil cp -r "gs://${GCS_BUCKET}/${GCS_FOLDER_PATH}" "${benchmark_output_dir}/results/json/" + +echo "Download complete." diff --git a/benchmarking/inference-perf/.helmignore b/benchmarking/inference-perf/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/benchmarking/inference-perf/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/benchmarking/inference-perf/Chart.yaml b/benchmarking/inference-perf/Chart.yaml new file mode 100644 index 000000000..0295e06ef --- /dev/null +++ b/benchmarking/inference-perf/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: inference-perf +description: A Helm chart for running inference-perf benchmarking tool +type: application +version: 0.2.0 +appVersion: "0.2.0" diff --git a/benchmarking/inference-perf/README.md b/benchmarking/inference-perf/README.md new file mode 100644 index 000000000..54fdd3edd --- /dev/null +++ b/benchmarking/inference-perf/README.md @@ -0,0 +1,85 @@ +## 🚀 Deploying `inference-perf` via Helm Chart + +This guide explains how to deploy `inference-perf` to a Kubernetes cluster with Helm. + +Note: This is a temporary chart, added until a remote chart is available. + +--- + +### 1. Prerequisites + +Make sure you have the following tools installed and configured: + +* **Kubernetes Cluster:** Access to a functional cluster (e.g., GKE). +* **Helm:** The Helm CLI installed locally. + +--- + +### 2. Configuration (`values.yaml`) + +Before deployment, navigate to the **`benchmarking/inference-perf`** directory and edit the **`values.yaml`** file to customize your deployment and the benchmark parameters. + +#### Optional Parameters + +| Key | Description | Default | +| :--- | :--- | :--- | +| `hfToken` | Hugging Face API token. If provided, a Kubernetes `Secret` named `{fullname}-hf-secret` (where `{fullname}` is the chart's fully qualified release name) will be created for authentication. | `""` | +| `job.serviceAccountName` | Standard Kubernetes `serviceAccountName`. If not provided, the default service account is used. | `""` | +| `job.nodeSelector` | Standard Kubernetes `nodeSelector` map to constrain pod placement to nodes with matching labels. 
| `{}` | +| `job.resources` | Standard Kubernetes resource requests and limits for the main `inference-perf` container. | `{}` | +--- + +> **Example Resource Block:** +> ```yaml +> # resources: +> # requests: +> # cpu: "1" +> # memory: "4Gi" +> # limits: +> # cpu: "2" +> # memory: "8Gi" +> ``` + +#### GKE Specific Parameters + +This section details the necessary configuration and permissions for using a Google Cloud Storage (GCS) path to manage your dataset, typical for deployments on GKE. + +##### Required IAM Permissions + +The identity executing the workload (e.g., the associated Kubernetes Service Account, often configured via **Workload Identity**) must possess the following IAM roles on the target GCS bucket for data transfer: + +* **`roles/storage.objectViewer`** (Required to read/download the input dataset from GCS). +* **`roles/storage.objectCreator`** (Required to write/push benchmark results back to GCS). + + +| Key | Description | Default | +| :--- | :--- | :--- | +| `gcsPath` | A GCS URI pointing to the dataset file (e.g., `gs://my-bucket/dataset.json`). The file will be automatically copied to `/dataset/dataset.json` in the running pod during initialization; for it to be used, `config.data.path` must also be set to `/dataset/dataset.json`. | `""` | + +--- + +### 3. Run Deployment + +Use the **`helm install`** command from the **`benchmarking/inference-perf`** directory to deploy the chart. + +* **Standard Install:** Deploy using the default `values.yaml`. + ```bash + helm install test . + ``` + +* **Set `hfToken` Override:** Pass the Hugging Face token directly. + ```bash + helm install test . --set hfToken="YOUR_HF_TOKEN" + ``` + +* **Custom Config Override:** Make changes to the values file for custom settings. + ```bash + helm install test . -f values.yaml + ``` + +### 4. Cleanup + +To remove the benchmark deployment, run: 
+ ```bash + helm uninstall test + ``` \ No newline at end of file diff --git a/benchmarking/inference-perf/templates/_helpers.tpl b/benchmarking/inference-perf/templates/_helpers.tpl new file mode 100644 index 000000000..e3f6f5715 --- /dev/null +++ b/benchmarking/inference-perf/templates/_helpers.tpl @@ -0,0 +1,72 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "inference-perf.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "inference-perf.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "inference-perf.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "inference-perf.labels" -}} +helm.sh/chart: {{ include "inference-perf.chart" . }} +{{ include "inference-perf.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "inference-perf.selectorLabels" -}} +app.kubernetes.io/name: {{ include "inference-perf.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Common Secret Name for HuggingFace credentials +*/}} +{{- define "inference-perf.hfSecret" -}} +{{ include "inference-perf.fullname" . }}-hf-secret +{{- end -}} + +{{/* +Common Secret Key for HuggingFace credentials +*/}} +{{- define "inference-perf.hfKey" -}} +{{ include "inference-perf.fullname" . }}-hf-key +{{- end -}} + +{{/* +Mount path for config map +*/}} +{{- define "inference-perf.configMount" -}} +/cfg +{{- end -}} \ No newline at end of file diff --git a/benchmarking/inference-perf/templates/configmap.yaml b/benchmarking/inference-perf/templates/configmap.yaml new file mode 100644 index 000000000..aec22aa8c --- /dev/null +++ b/benchmarking/inference-perf/templates/configmap.yaml @@ -0,0 +1,10 @@ +# inference-perf/templates/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "inference-perf.fullname" . }}-config + labels: + {{- include "inference-perf.labels" . | nindent 4 }} +data: + config.yml: | + {{- toYaml .Values.config | nindent 4 }} \ No newline at end of file diff --git a/benchmarking/inference-perf/templates/job.yaml b/benchmarking/inference-perf/templates/job.yaml new file mode 100644 index 000000000..83a61f862 --- /dev/null +++ b/benchmarking/inference-perf/templates/job.yaml @@ -0,0 +1,67 @@ +# inference-perf/templates/job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "inference-perf.fullname" . }}-job + labels: + {{- include "inference-perf.labels" . | nindent 4 }} + app: inference-perf +spec: + template: + metadata: + labels: + {{- include "inference-perf.selectorLabels" . | nindent 8 }} + app: inference-perf + spec: + restartPolicy: Never + serviceAccountName: {{ .Values.job.serviceAccountName }} + {{- with .Values.job.nodeSelector }} + nodeSelector: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- if .Values.gcsPath}} + initContainers: + - name: fetch-dataset + image: google/cloud-sdk:latest + command: ["sh", "-c", "gsutil cp {{ .Values.gcsPath }} /dataset/dataset.json"] + volumeMounts: + - name: dataset-volume + mountPath: /dataset +{{- end }} + containers: + - name: inference-perf-container + image: "{{ .Values.job.image.repository }}:{{ .Values.job.image.tag | default .Chart.AppVersion }}" + command: ["inference-perf"] + args: + - "--config_file" + - "{{ include "inference-perf.configMount" . }}/config.yml" + - "--log-level" + - {{ .Values.logLevel }} + env: + {{- if .Values.hfToken }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ include "inference-perf.hfSecret" . }} + key: {{ include "inference-perf.hfKey" . }} + {{- end }} + volumeMounts: + - name: config-volume + mountPath: {{ include "inference-perf.configMount" . }} + readOnly: true + {{- if .Values.gcsPath }} + # Exposes the dataset downloaded by the fetch-dataset init container at /dataset/dataset.json. + - name: dataset-volume + mountPath: /dataset + {{- end }} + resources: + {{- toYaml .Values.job.resources | nindent 12 }} + volumes: + - name: config-volume + configMap: + name: {{ include "inference-perf.fullname" . }}-config + {{- if .Values.gcsPath }} + # Scratch volume shared by the fetch-dataset init container and the benchmark container. + - name: dataset-volume + emptyDir: {} + {{- end }} diff --git a/benchmarking/inference-perf/templates/secret.yaml b/benchmarking/inference-perf/templates/secret.yaml new file mode 100644 index 000000000..66c9cff84 --- /dev/null +++ b/benchmarking/inference-perf/templates/secret.yaml @@ -0,0 +1,12 @@ +# inference-perf/templates/secret.yaml +{{- if .Values.hfToken }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "inference-perf.hfSecret" . }} + labels: + {{- include "inference-perf.labels" . | nindent 4 }} +type: Opaque +stringData: + {{ include "inference-perf.hfKey" . 
}}: {{ .Values.hfToken | quote }} +{{- end }} diff --git a/site-src/performance/benchmark/index.md b/site-src/performance/benchmark/index.md index 42d5e727b..6aaed4baa 100644 --- a/site-src/performance/benchmark/index.md +++ b/site-src/performance/benchmark/index.md @@ -32,15 +32,75 @@ kubectl expose deployment vllm-llama3-8b-instruct --port=80 --target-port=8000 - ## Run benchmark -The LPG benchmark tool works by sending traffic to the specified target IP and port, and collecting the results. -Follow the steps below to run a single benchmark. Multiple LPG instances can be deployed to run benchmarks in +The inference perf tool works by sending traffic to the specified target IP and port, and collecting the results. +Follow the steps below to run a single benchmark. Multiple benchmarking instances can be deployed to run benchmarks in parallel against different targets. +#### Parameters to customize: + +For more parameter customizations, refer to inference-perf [guides](https://github.com/kubernetes-sigs/inference-perf/blob/main/docs/config.md) + +* `benchmark`: A unique name for this deployment. +* `hfToken`: Your hugging face token. +* `config.server.base_url`: The base URL (IP and port) of your inference server. + +### Storage Parameters + + Note: Currently inference-perf outputs benchmark results to standard output only, and results will be deleted once pod is finished running the job. + + +#### 1. Local Storage (Default) + +By default, reports are saved locally but **lost when the Pod terminates**. +```yaml +storage: + local_storage: + path: "reports-{timestamp}" # Local directory path + report_file_prefix: null # Optional filename prefix +``` + +#### 2. Google Cloud Storage (GCS) + +Use the `google_cloud_storage` block to save reports to a GCS bucket. 
+ +```yaml +storage: + google_cloud_storage: # Optional GCS configuration + bucket_name: "your-bucket-name" # Required GCS bucket + path: "reports-{timestamp}" # Optional path prefix + report_file_prefix: null # Optional filename prefix +``` + +###### 🚨 GCS Permissions Checklist (Required for Write Access) + +1. **IAM Role (Service Account):** Bound to the target bucket. + + * **Minimum:** **Storage Object Creator** (`roles/storage.objectCreator`) + + * **Full:** **Storage Object Admin** (`roles/storage.objectAdmin`) + +2. **Node Access Scope (GKE Node Pool):** Set during node pool creation. + + * **Required Scope:** **`devstorage.read_write`** or **`cloud-platform`** + +#### 3. Simple Storage Service (S3) + +Use the `simple_storage_service` block for S3-compatible storage. Requires appropriate AWS credentials configured in the runtime environment. + +```yaml +storage: + simple_storage_service: + bucket_name: "your-bucket-name" # Required S3 bucket + path: "reports-{timestamp}" # Optional path prefix + report_file_prefix: null # Optional filename prefix +``` +### Steps to Deploy + 1. Check out the repo. ```bash git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension - cd gateway-api-inference-extension + cd gateway-api-inference-extension/benchmarking ``` 1. Get the target IP. The examples below shows how to get the IP of a gateway or a k8s service. @@ -49,32 +109,51 @@ parallel against different targets. # Get gateway IP GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') # Get LoadBalancer k8s service IP - SVC_IP=$(kubectl get service/vllm-llama2-7b -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + SVC_IP=$(kubectl get service/vllm-llama3-8b-instruct -o jsonpath='{.status.loadBalancer.ingress[0].ip}') echo $GW_IP echo $SVC_IP ``` -1. Then update the `` in `./config/manifests/benchmark/benchmark.yaml` to the value of `$SVC_IP` or `$GW_IP`. 
- Feel free to adjust other parameters such as `request_rates` as well. For a complete list of LPG configurations, refer to the - [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark). +1. Deploy Benchmark Tool. -1. Start the benchmark tool. + ```bash + export PORT='' + export HF_TOKEN='' + helm install igw-benchmark inference-perf/ -f benchmark-values.yaml \ + --set hfToken=${HF_TOKEN} \ + --set "config.server.base_url=http://${GW_IP}:${PORT}" + + export PORT='' + export HF_TOKEN='' + helm install k8s-benchmark inference-perf/ -f benchmark-values.yaml \ + --set hfToken=${HF_TOKEN} \ + --set "config.server.base_url=http://${SVC_IP}:${PORT}" + ``` + +1. Wait for the benchmark to finish and download the results. Follow the inference-perf [guides](https://github.com/kubernetes-sigs/inference-perf) on how to access logs. At the moment, logs are deleted from the pod if using local storage. + + #### GCS Benchmarking Script + + If storing results in GCS, you can use the `download-gcs-results.bash` script. + + Use the `benchmark_id` environment variable to specify what this + benchmark is for. For instance, `inference-extension` or `k8s-svc`. ```bash - kubectl apply -f ./config/manifests/benchmark/benchmark.yaml + benchmark_id='k8s-svc' ./download-gcs-results.bash + + benchmark_id='inference-extension' ./download-gcs-results.bash ``` -1. Wait for benchmark to finish and download the results. Use the `benchmark_id` environment variable to specify what this - benchmark is for. For instance, `inference-extension` or `k8s-svc`. When the LPG tool finishes benchmarking, it will print - a log line `LPG_FINISHED`. The script below will watch for that log line and then start downloading results. + After the script finishes, you should see benchmark results under `./benchmarking/output/default-run/k8s-svc/results/json/`. + +1. 
Uninstall the chart to tear down resources. ```bash - benchmark_id='k8s-svc' ./tools/benchmark/download-benchmark-results.bash + helm uninstall igw-benchmark k8s-benchmark ``` - After the script finishes, you should see benchmark results under `./tools/benchmark/output/default-run/k8s-svc/results/json` folder. - Here is a [sample json file](./sample.json). Replace `k8s-svc` with `inference-extension` when running an inference extension benchmark. ### Tips @@ -82,11 +161,11 @@ parallel against different targets. updated accordingly to analyze the results. * You can specify `run_id="runX"` environment variable when running the `./download-benchmark-results.bash` script. This is useful when you run benchmarks multiple times to get a more statistically meaningful results and group the results accordingly. -* Update the `request_rates` that best suit your benchmark environment. +* Update the `stages` to request rates that best suit your benchmark environment. ### Advanced Benchmark Configurations -Refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark) for a +Refer to the inference-perf [guides](https://github.com/kubernetes-sigs/inference-perf/blob/main/docs/config.md) for a detailed list of configuration knobs. ## Analyze the results @@ -106,7 +185,7 @@ This guide shows how to run the jupyter notebook using vscode after completing k pip install -r ./tools/benchmark/requirements.txt ``` -1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. In the last cell update the benchmark ids with`inference-extension` and `k8s-svc`. At the end you should +1. Open the notebook `./benchmarking/benchmark.ipynb`, and run each cell. In the last cell update the benchmark ids with `inference-extension` and `k8s-svc`. At the end you should see a bar chart like below where **"ie"** represents inference extension. 
This chart is generated using this benchmarking tool with 6 vLLM (v1) model servers (H100 80 GB), [llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) and the [ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json). ![alt text](example-bar-chart.png) \ No newline at end of file