
Commit 4f9640b: Tensor parallelism
1 parent: 971920e

File tree: 10 files changed, +591 −404 lines changed

examples/common.cpp

Lines changed: 28 additions & 0 deletions
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
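For reference, a minimal standalone sketch of what this new branch does with the option value; kMaxDevices and the hard-coded "3,1" stand in for LLAMA_MAX_DEVICES and the user-supplied argument, and both are assumed purely for illustration:

#include <cassert>
#include <regex>
#include <string>
#include <vector>

int main() {
    const size_t kMaxDevices = 2;       // stands in for LLAMA_MAX_DEVICES (value assumed)
    const std::string arg_next = "3,1"; // value that would follow --tensor-split / -ts

    // Split on ',' and '/', exactly like the new parser branch.
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    // Devices beyond the listed proportions default to 0.0f.
    float tensor_split[kMaxDevices] = {0.0f};
    for (size_t i = 0; i < kMaxDevices; ++i) {
        tensor_split[i] = i < split_arg.size() ? std::stof(split_arg[i]) : 0.0f;
    }

    // "3,1" -> {3.0f, 1.0f}: three parts for the first GPU, one part for the second.
    assert(tensor_split[0] == 3.0f && tensor_split[1] == 1.0f);
    return 0;
}

Any device not covered by the comma-separated list keeps a proportion of 0.0f, matching the else branch of the loop in the diff above.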
@@ -438,6 +463,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -484,6 +511,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
 
     lparams.n_ctx = params.n_ctx;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.use_mmap = params.use_mmap;
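The memcpy forwards the full LLAMA_MAX_DEVICES-element array into llama_context_params, so the proportions parsed from the command line reach the library unchanged, with unlisted devices left at 0.0f.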

examples/common.h

Lines changed: 8 additions & 7 deletions
@@ -21,13 +21,14 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed         = -1;  // RNG seed
-    int32_t n_threads    = get_num_physical_cores();
-    int32_t n_predict    = -1;  // new tokens to predict
-    int32_t n_ctx        = 512; // context size
-    int32_t n_batch      = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep       = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers = 0;   // number of layers to store in VRAM
+    int32_t seed          = -1;  // RNG seed
+    int32_t n_threads     = get_num_physical_cores();
+    int32_t n_predict     = -1;  // new tokens to predict
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

examples/server/server.cpp

Lines changed: 33 additions & 0 deletions
@@ -401,6 +401,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
    fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -503,6 +505,37 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        }
+        else if (arg == "--tensor-split" || arg == "-ts")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
+            {
+                if (i < split_arg.size())
+                {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                }
+                else
+                {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         }
         else
         {
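The server gains the same -ts / --tensor-split option, with argument handling that mirrors examples/common.cpp (only the brace style differs), so the HTTP server can also distribute tensors across multiple GPUs.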
