
add script for benchmark #168


Merged: 1 commit, Jul 31, 2024
100 changes: 100 additions & 0 deletions scripts/benchmark.py
@@ -0,0 +1,100 @@
import aiohttp
import requests
import asyncio
import time
import random

start_time = time.time()
SERVER_ENDPOINT = "http://localhost:3928"
TOTAL_REQUESTS = 16
N_PARALLEL = 4
MAX_CTX_FOR_ONE_SEQUENCE = 512
N_CTX = MAX_CTX_FOR_ONE_SEQUENCE * N_PARALLEL  # total ctx_len sent to the server: one 512-token slot per parallel sequence; this sizes the GPU memory reserved for the KV cache

def start_server():
    import subprocess

    # Note: subprocess.run(["cd", ..., "&&", "./server"]) does not work without a shell,
    # so launch the server binary directly from its build directory and give it time to come up.
    subprocess.Popen(["./server"], cwd="../examples/server/build/")
    time.sleep(2)


def load_model():
headers = {"Content-Type": "application/json"}
data = {"llama_model_path": "/mnt/nas/gguf-models/meta-llama3.1-8b-instruct-q4km.gguf", "model_alias": "meta-llama3.1-8b-instruct",
"model": "meta-llama3.1-8b-instruct", "ctx_len": N_CTX,"n_batch":2048, "ngl": 300, "model_type": "llm", "n_parallel": N_PARALLEL}
result = requests.post(SERVER_ENDPOINT+"/loadmodel",
headers=headers, json=data)
print(result.json())


async def send_request(session, prompt):
headers = {"Content-Type": "application/json"}
data = {"model": "meta-llama3.1-8b-instruct",
"messages": [{"role": "user", "content": prompt},]}
async with session.post(SERVER_ENDPOINT+"/v1/chat/completions", headers=headers, json=data) as resp:
result = await resp.json()
return result


async def send_request_sequence():
    """Send TOTAL_REQUESTS chat requests one at a time and report sequential throughput."""
    # warm up
async with aiohttp.ClientSession() as session:
res = await send_request(session, "What is GPU?")

start = time.time()
total_token_processed = 0
async with aiohttp.ClientSession() as session:

prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad's joke",
"Write a quick sort function", "What is the price of Nvidia H100?", "Who won the world series in 2020?"]
for number in range(TOTAL_REQUESTS):
res = await send_request(session, random.choice(prompts))
if res.get("usage"):
total_token_processed += res["usage"]["total_tokens"]
else:
print(res)

end = time.time()
print("Finished in", end-start, "s")
print("Total token:", total_token_processed)
print("Throughput when run in sequence:", total_token_processed/(end-start), "tokens/s")
print("------------------------------------------------------------------------")


async def main():
    """Fire TOTAL_REQUESTS chat requests concurrently and report parallel throughput."""
    # warm up
async with aiohttp.ClientSession() as session:
res = await send_request(session, "What is GPU?")

start = time.time()
total_token_processed = 0
async with aiohttp.ClientSession() as session:

tasks = []
prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad's joke",
"Write a quick sort function", "What is the price of Nvidia H100?", "Who won the world series in 2020?"]
for number in range(TOTAL_REQUESTS):
tasks.append(asyncio.ensure_future(
send_request(session, random.choice(prompts))))

results = await asyncio.gather(*tasks)
for res in results:
# print(res)
if res.get("usage"):
total_token_processed += res["usage"]["total_tokens"]
else:
print(res)
end = time.time()
print("Finished in", end-start, "s")
print("Total token:", total_token_processed)
print("Throughput when run parallel:", total_token_processed/(end-start), "tokens/s")
print("------------------------------------------------------------------------")
# start_server()
load_model()

asyncio.run(main())

asyncio.run(send_request_sequence())
print("--- %s seconds ---" % (time.time() - start_time))
5 changes: 3 additions & 2 deletions src/llama_engine.cc
@@ -333,7 +333,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
  }
}

- params.n_gpu_layers = json_body->get("ngl", 100).asInt();
+ params.n_gpu_layers = json_body->get("ngl", 300).asInt();  // changed from 100 to 300 since Llama 3.1 has 292 GPU layers
params.n_ctx = json_body->get("ctx_len", 2048).asInt();
model_type = json_body->get("model_type", "llm").asString();
// In case of embedding only model, we set default = true
@@ -343,10 +343,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
params.n_ubatch = json_body->get("n_ubatch", params.n_batch).asInt();
// Check if n_parallel exists in json_body, if not, set to drogon_thread
params.n_parallel = json_body->get("n_parallel", 1).asInt();
+ LOG_INFO << "Number of parallel is set to " << params.n_parallel;
params.n_threads =
    json_body->get("cpu_threads", std::thread::hardware_concurrency())
        .asInt();
- params.cont_batching = json_body->get("cont_batching", false).asBool();
+ params.cont_batching = json_body->get("cont_batching", true).asBool();  // default changed to true, matching llama.cpp upstream

params.cache_type_k = json_body->get("cache_type", kTypeF16).asString();
if (!IsValidCacheType(params.cache_type_k)) {
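The engine change above raises the default ngl to 300 and enables continuous batching by default, so a load request can omit both fields and still pick up settings suitable for the Llama 3.1 benchmark; n_parallel is now also echoed in the server log. A minimal sketch of such a request (the model path and alias are placeholders, not part of the PR):

import requests

payload = {
    # "ngl" and "cont_batching" are omitted on purpose: with this change the engine
    # defaults to ngl=300 and cont_batching=true; "ctx_len" still defaults to 2048.
    "llama_model_path": "/path/to/your-model.gguf",  # placeholder path
    "model": "your-model",                           # placeholder alias
    "n_parallel": 4,                                 # now logged at load time
}
print(requests.post("http://localhost:3928/loadmodel",
                    headers={"Content-Type": "application/json"},
                    json=payload).json())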