 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
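Note: below is a minimal, self-contained sketch of the splitting logic added above, for trying it outside of llama.cpp. The constant kMaxDevices and the hard-coded argument "3,1" are illustrative stand-ins (kMaxDevices plays the role of LLAMA_MAX_DEVICES); nothing here is taken from the surrounding codebase beyond what the hunk itself shows.

#include <cstddef>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    constexpr size_t kMaxDevices = 4;    // stand-in for LLAMA_MAX_DEVICES
    const std::string arg_next = "3,1";  // what the user passed after -ts / --tensor-split

    // split the argument on ',' and '/' (same regex as in the hunk above)
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    const std::vector<std::string> split_arg{it, {}};

    float tensor_split[kMaxDevices];
    for (size_t i = 0; i < kMaxDevices; ++i) {
        // devices beyond the user-supplied list get a proportion of 0
        tensor_split[i] = i < split_arg.size() ? std::stof(split_arg[i]) : 0.0f;
    }

    for (size_t i = 0; i < kMaxDevices; ++i) {
        printf("device %zu: %.1f\n", i, tensor_split[i]);
    }
    return 0;
}

With the example input 3,1 this prints 3.0 and 1.0 for the first two devices and 0.0 for the rest; as the help text below says, the values are relative proportions, so 3,1 weights the first GPU three times as heavily as the second.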
@@ -438,6 +463,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -484,6 +511,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
 
     lparams.n_ctx        = params.n_ctx;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
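Note: a toy illustration of the copy above, assuming only what the memcpy size implies, namely that both sides hold a fixed-size array of LLAMA_MAX_DEVICES floats. The struct names are hypothetical stand-ins for gpt_params and llama_context_params, not the real definitions.

#include <cstddef>
#include <cstdio>
#include <cstring>

constexpr size_t kMaxDevices = 4;  // stand-in for LLAMA_MAX_DEVICES

// hypothetical stand-ins for gpt_params / llama_context_params
struct toy_gpt_params     { float tensor_split[kMaxDevices] = {3.0f, 1.0f, 0.0f, 0.0f}; };
struct toy_context_params { float tensor_split[kMaxDevices] = {}; };

int main() {
    toy_gpt_params     params;
    toy_context_params lparams;

    // same pattern as the diff: both arrays have kMaxDevices elements,
    // so a single memcpy carries the whole per-device split across
    memcpy(lparams.tensor_split, params.tensor_split, kMaxDevices*sizeof(float));

    for (size_t i = 0; i < kMaxDevices; ++i) {
        printf("lparams.tensor_split[%zu] = %.1f\n", i, lparams.tensor_split[i]);
    }
    return 0;
}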