
add script for benchmark #168


Merged: 1 commit, Jul 31, 2024
100 changes: 100 additions & 0 deletions scripts/benchmark.py
@@ -0,0 +1,100 @@
import aiohttp
import requests
import asyncio
import time
import random

start_time = time.time()
SERVER_ENDPOINT = "http://localhost:3928"
TOTAL_REQUESTS = 16
N_PARALLEL = 4
MAX_CTX_FOR_ONE_SEQUENCE = 512
N_CTX = MAX_CTX_FOR_ONE_SEQUENCE * N_PARALLEL  # total ctx_len sent to the server: one 512-token slot per parallel sequence; this sizes the GPU memory reserved for the KV cache

def start_server():
    import subprocess

    # Note: subprocess.run(["cd", ..., "&&", "./server"]) does not work without a shell,
    # so launch the server binary directly from its build directory and give it time to come up.
    subprocess.Popen(["./server"], cwd="../examples/server/build/")
    time.sleep(2)


def load_model():
headers = {"Content-Type": "application/json"}
data = {"llama_model_path": "/mnt/nas/gguf-models/meta-llama3.1-8b-instruct-q4km.gguf", "model_alias": "meta-llama3.1-8b-instruct",
"model": "meta-llama3.1-8b-instruct", "ctx_len": N_CTX,"n_batch":2048, "ngl": 300, "model_type": "llm", "n_parallel": N_PARALLEL}
result = requests.post(SERVER_ENDPOINT+"/loadmodel",
headers=headers, json=data)
print(result.json())


async def send_request(session, prompt):
headers = {"Content-Type": "application/json"}
data = {"model": "meta-llama3.1-8b-instruct",
"messages": [{"role": "user", "content": prompt},]}
async with session.post(SERVER_ENDPOINT+"/v1/chat/completions", headers=headers, json=data) as resp:
result = await resp.json()
return result


async def send_request_sequence():
    """Send TOTAL_REQUESTS chat requests one at a time and report sequential throughput."""
    # warm up
async with aiohttp.ClientSession() as session:
res = await send_request(session, "What is GPU?")

start = time.time()
total_token_processed = 0
async with aiohttp.ClientSession() as session:

prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad's joke",
"Write a quick sort function", "What is the price of Nvidia H100?", "Who won the world series in 2020?"]
for number in range(TOTAL_REQUESTS):
res = await send_request(session, random.choice(prompts))
if res.get("usage"):
total_token_processed += res["usage"]["total_tokens"]
else:
print(res)

end = time.time()
print("Finished in", end-start, "s")
print("Total token:", total_token_processed)
print("Throughput when run in sequence:", total_token_processed/(end-start), "tokens/s")
print("------------------------------------------------------------------------")


async def main():
    """Fire TOTAL_REQUESTS chat requests concurrently and report parallel throughput."""
    # warm up
async with aiohttp.ClientSession() as session:
res = await send_request(session, "What is GPU?")

start = time.time()
total_token_processed = 0
async with aiohttp.ClientSession() as session:

tasks = []
prompts = ["What is GPU?", "Who won the world cup 2022?", "Tell me some dad's joke",
"Write a quick sort function", "What is the price of Nvidia H100?", "Who won the world series in 2020?"]
for number in range(TOTAL_REQUESTS):
tasks.append(asyncio.ensure_future(
send_request(session, random.choice(prompts))))

results = await asyncio.gather(*tasks)
for res in results:
# print(res)
if res.get("usage"):
total_token_processed += res["usage"]["total_tokens"]
else:
print(res)
end = time.time()
print("Finished in", end-start, "s")
print("Total token:", total_token_processed)
print("Throughput when run parallel:", total_token_processed/(end-start), "tokens/s")
print("------------------------------------------------------------------------")
# start_server()
load_model()

asyncio.run(main())

asyncio.run(send_request_sequence())
print("--- %s seconds ---" % (time.time() - start_time))
5 changes: 3 additions & 2 deletions src/llama_engine.cc
@@ -333,7 +333,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
  }
}

- params.n_gpu_layers = json_body->get("ngl", 100).asInt();
+ params.n_gpu_layers = json_body->get("ngl", 300).asInt();  // changed from 100 to 300 since Llama 3.1 has 292 GPU layers
params.n_ctx = json_body->get("ctx_len", 2048).asInt();
model_type = json_body->get("model_type", "llm").asString();
// In case of embedding only model, we set default = true
@@ -343,10 +343,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
params.n_ubatch = json_body->get("n_ubatch", params.n_batch).asInt();
// Check if n_parallel exists in json_body, if not, set to drogon_thread
params.n_parallel = json_body->get("n_parallel", 1).asInt();
+ LOG_INFO << "Number of parallel is set to " << params.n_parallel;
params.n_threads =
    json_body->get("cpu_threads", std::thread::hardware_concurrency())
        .asInt();
- params.cont_batching = json_body->get("cont_batching", false).asBool();
+ params.cont_batching = json_body->get("cont_batching", true).asBool();  // default changed to true, matching llama.cpp upstream

params.cache_type_k = json_body->get("cache_type", kTypeF16).asString();
if (!IsValidCacheType(params.cache_type_k)) {
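The engine change above raises the default ngl to 300 and enables continuous batching by default, so a load request can omit both fields and still pick up settings suitable for the Llama 3.1 benchmark; n_parallel is now also echoed in the server log. A minimal sketch of such a request (the model path and alias are placeholders, not part of the PR):

import requests

payload = {
    # "ngl" and "cont_batching" are omitted on purpose: with this change the engine
    # defaults to ngl=300 and cont_batching=true; "ctx_len" still defaults to 2048.
    "llama_model_path": "/path/to/your-model.gguf",  # placeholder path
    "model": "your-model",                           # placeholder alias
    "n_parallel": 4,                                 # now logged at load time
}
print(requests.post("http://localhost:3928/loadmodel",
                    headers={"Content-Type": "application/json"},
                    json=payload).json())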