
Commit 0fe4b00

llama : allow to initialize backend with NUMA support
1 parent 8f98035 commit 0fe4b00

File tree

8 files changed: +30 −19 lines changed

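The net change: llama_init_backend() gains a bool numa parameter and now calls ggml_numa_init() itself when the flag is set, so the example programs simply forward params.numa (quantize passes false) instead of guarding a separate ggml_numa_init() call. A minimal caller under the new signature, sketched after the example programs (gpt_params and gpt_params_parse come from the examples' common.h; the body is illustrative, not part of this commit):

    #include "common.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        // call once at the start of the program;
        // NUMA optimizations are enabled only when params.numa is true
        llama_init_backend(params.numa);

        llama_model * model;
        llama_context * ctx;
        // ... load the model and run inference as before ...

        return 0;
    }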

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;

examples/main/main.cpp

Lines changed: 1 addition & 5 deletions
@@ -5,7 +5,6 @@
 
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "build-info.h"
 
 #include <cassert>
@@ -106,10 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
-    if (params.numa) {
-        ggml_numa_init();
-    }
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 1 deletion
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;

examples/quantize/quantize.cpp

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    llama_init_backend();
+    llama_init_backend(false);
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];

examples/simple/simple.cpp

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;

ggml.c

Lines changed: 18 additions & 8 deletions
@@ -3879,14 +3879,12 @@ struct ggml_context_container {
 #define GGML_NUMA_MAX_NODES 8
 #define GGML_NUMA_MAX_CPUS 512
 
-struct ggml_numa_node
-{
+struct ggml_numa_node {
     uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
     uint32_t n_cpus;
 };
 
-struct ggml_numa_nodes
-{
+struct ggml_numa_nodes {
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
@@ -3923,32 +3921,41 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void)
-{
-    if (g_state.numa.n_nodes > 0) { return; }
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
 #ifdef __linux__
     struct stat st;
     char path[256];
     int rv;
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
         GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.n_nodes;
     }
+
     // enumerate CPUs
     while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
         GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.total_cpus;
     }
+
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -3963,6 +3970,7 @@ void ggml_numa_init(void)
         }
         GGML_PRINT_DEBUG("\n");
     }
+
     if (ggml_is_numa()) {
         FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
         if (fptr != NULL) {
@@ -3978,7 +3986,9 @@ void ggml_numa_init(void)
 #endif
 }
 
-bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
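In short, ggml_numa_init() enumerates nodes under /sys/devices/system/node and hardware threads under /sys/devices/system/cpu (Linux only), resets n_nodes to 0 and returns if nothing is found, and warns if it is called twice; ggml_is_numa() then reports true only when more than one node was detected. A caller can check the result after initialization, for example (a sketch, not part of this commit; the message text is illustrative):

    #include <cstdio>
    #include "ggml.h"
    #include "llama.h"

    int main() {
        llama_init_backend(true);   // true -> ggml_numa_init() runs

        if (ggml_is_numa()) {
            std::fprintf(stderr, "NUMA: more than one node detected\n");
        } else {
            std::fprintf(stderr, "NUMA: single node or detection unavailable\n");
        }
        return 0;
    }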

llama.cpp

Lines changed: 5 additions & 1 deletion
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -986,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {

llama.h

Lines changed: 2 additions & 1 deletion
@@ -140,8 +140,9 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);
 
     LLAMA_API int64_t llama_time_us();
 
