
Commit a53c432

Merge pull request #168 from janhq/feat-batching-support
add script for benchmark
2 parents: a7ad46a + f40ea6f

File tree: 2 files changed (+103, −2 lines)


scripts/benchmark.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
import aiohttp
import requests
import asyncio
import time
import os
import random

start_time = time.time()
SERVER_ENDPOINT = "http://localhost:3928"
TOTAL_REQUESTS = 16
N_PARALLEL = 4
MAX_CTX_FOR_ONE_SEQUENCE = 512
# Total context reserved for the KV cache; it is shared across the parallel sequences.
N_CTX = MAX_CTX_FOR_ONE_SEQUENCE * N_PARALLEL


def start_server():
    import subprocess

    # Launch the server binary from its build directory and give it time to start.
    subprocess.Popen(["./server"], cwd="../examples/server/build/")
    time.sleep(2)


def load_model():
    headers = {"Content-Type": "application/json"}
    data = {"llama_model_path": "/mnt/nas/gguf-models/meta-llama3.1-8b-instruct-q4km.gguf",
            "model_alias": "meta-llama3.1-8b-instruct",
            "model": "meta-llama3.1-8b-instruct",
            "ctx_len": N_CTX,
            "n_batch": 2048,
            "ngl": 300,
            "model_type": "llm",
            "n_parallel": N_PARALLEL}
    result = requests.post(SERVER_ENDPOINT + "/loadmodel",
                           headers=headers, json=data)
    print(result.json())


async def send_request(session, prompt):
    headers = {"Content-Type": "application/json"}
    data = {"model": "meta-llama3.1-8b-instruct",
            "messages": [{"role": "user", "content": prompt}]}
    async with session.post(SERVER_ENDPOINT + "/v1/chat/completions",
                            headers=headers, json=data) as resp:
        result = await resp.json()
        return result


async def send_request_sequence():
    # Warm up the server before measuring.
    async with aiohttp.ClientSession() as session:
        res = await send_request(session, "What is GPU?")

    start = time.time()
    total_token_processed = 0
    async with aiohttp.ClientSession() as session:
        prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad's joke",
                   "Write a quick sort function", "What is the price of Nvidia H100?",
                   "Who won the world series in 2020?"]
        # Send the requests one after another and sum the reported token usage.
        for _ in range(TOTAL_REQUESTS):
            res = await send_request(session, random.choice(prompts))
            if res.get("usage"):
                total_token_processed += res["usage"]["total_tokens"]
            else:
                print(res)

    end = time.time()
    print("Finished in", end - start, "s")
    print("Total tokens:", total_token_processed)
    print("Throughput when run in sequence:", total_token_processed / (end - start), "tokens/s")
    print("------------------------------------------------------------------------")


async def main():
    # Warm up the server before measuring.
    async with aiohttp.ClientSession() as session:
        res = await send_request(session, "What is GPU?")

    start = time.time()
    total_token_processed = 0
    async with aiohttp.ClientSession() as session:
        tasks = []
        prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad's joke",
                   "Write a quick sort function", "What is the price of Nvidia H100?",
                   "Who won the world series in 2020?"]
        # Fire all requests concurrently so the server can batch them.
        for _ in range(TOTAL_REQUESTS):
            tasks.append(asyncio.ensure_future(
                send_request(session, random.choice(prompts))))

        results = await asyncio.gather(*tasks)
        for res in results:
            if res.get("usage"):
                total_token_processed += res["usage"]["total_tokens"]
            else:
                print(res)
    end = time.time()
    print("Finished in", end - start, "s")
    print("Total tokens:", total_token_processed)
    print("Throughput when run in parallel:", total_token_processed / (end - start), "tokens/s")
    print("------------------------------------------------------------------------")


# start_server()
load_model()

asyncio.run(main())

asyncio.run(send_request_sequence())
print("--- %s seconds ---" % (time.time() - start_time))
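
The script reports two throughput figures for the same request mix: one with all requests in flight at once (main) and one with them sent back to back (send_request_sequence); the ratio of the two gives the effective speed-up from batching. A small helper along the following lines could print that ratio directly (a sketch only: it assumes main() and send_request_sequence() are modified to return their tokens/s figure instead of just printing it).

# Sketch: assumes main() and send_request_sequence() each return their
# measured throughput in tokens/s instead of only printing it.
async def compare_throughput():
    parallel_tps = await main()
    sequential_tps = await send_request_sequence()
    print("Speed-up from batching: %.2fx" % (parallel_tps / sequential_tps))

# asyncio.run(compare_throughput())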

src/llama_engine.cc

Lines changed: 3 additions & 2 deletions
@@ -333,7 +333,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     }
   }
 
-  params.n_gpu_layers = json_body->get("ngl", 100).asInt();
+  params.n_gpu_layers = json_body->get("ngl", 300).asInt();  // change from 100 -> 300 since llama 3.1 has 292 gpu layers
   params.n_ctx = json_body->get("ctx_len", 2048).asInt();
   model_type = json_body->get("model_type", "llm").asString();
   // In case of embedding only model, we set default = true
@@ -343,10 +343,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
   params.n_ubatch = json_body->get("n_ubatch", params.n_batch).asInt();
   // Check if n_parallel exists in json_body, if not, set to drogon_thread
   params.n_parallel = json_body->get("n_parallel", 1).asInt();
+  LOG_INFO << "Number of parallel is set to " << params.n_parallel;
   params.n_threads =
       json_body->get("cpu_threads", std::thread::hardware_concurrency())
           .asInt();
-  params.cont_batching = json_body->get("cont_batching", false).asBool();
+  params.cont_batching = json_body->get("cont_batching", true).asBool();  // default true according to llama.cpp upstream
 
   params.cache_type_k = json_body->get("cache_type", kTypeF16).asString();
   if (!IsValidCacheType(params.cache_type_k)) {
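
With these changes, a client that wants batched decoding mainly has to pass n_parallel and a large enough ctx_len, since ngl now defaults to 300 and cont_batching to true. A minimal load request along these lines might look as follows (a sketch based on the fields used in the benchmark script above; the model path is just the example path from that script, and the exact set of required fields may differ):

import requests

# Sketch of a /loadmodel request that relies on the new defaults
# (ngl = 300, cont_batching = true) and only sets the batching-related fields.
payload = {
    "llama_model_path": "/mnt/nas/gguf-models/meta-llama3.1-8b-instruct-q4km.gguf",
    "model": "meta-llama3.1-8b-instruct",
    "ctx_len": 512 * 4,   # room in the KV cache for 4 parallel sequences of 512 tokens
    "n_parallel": 4,
}
resp = requests.post("http://localhost:3928/loadmodel",
                     headers={"Content-Type": "application/json"},
                     json=payload)
print(resp.json())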
