
Commit 4f9640b: Tensor parallelism
1 parent: 971920e

File tree: 10 files changed, +591 −404 lines changed

examples/common.cpp

Lines changed: 28 additions & 0 deletions
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
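For reference, a minimal standalone sketch of what this new branch does with the option value; kMaxDevices and the hard-coded "3,1" stand in for LLAMA_MAX_DEVICES and the user-supplied argument, and both are assumed purely for illustration:

#include <cassert>
#include <regex>
#include <string>
#include <vector>

int main() {
    const size_t kMaxDevices = 2;       // stands in for LLAMA_MAX_DEVICES (value assumed)
    const std::string arg_next = "3,1"; // value that would follow --tensor-split / -ts

    // Split on ',' and '/', exactly like the new parser branch.
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    // Devices beyond the listed proportions default to 0.0f.
    float tensor_split[kMaxDevices] = {0.0f};
    for (size_t i = 0; i < kMaxDevices; ++i) {
        tensor_split[i] = i < split_arg.size() ? std::stof(split_arg[i]) : 0.0f;
    }

    // "3,1" -> {3.0f, 1.0f}: three parts for the first GPU, one part for the second.
    assert(tensor_split[0] == 3.0f && tensor_split[1] == 1.0f);
    return 0;
}

Any device not covered by the comma-separated list keeps a proportion of 0.0f, matching the else branch of the loop in the diff above.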
@@ -438,6 +463,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -484,6 +511,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
 
     lparams.n_ctx = params.n_ctx;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.use_mmap = params.use_mmap;
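The memcpy forwards the full LLAMA_MAX_DEVICES-element array into llama_context_params, so the proportions parsed from the command line reach the library unchanged, with unlisted devices left at 0.0f.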

examples/common.h

Lines changed: 8 additions & 7 deletions
@@ -21,13 +21,14 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed         = -1;  // RNG seed
-    int32_t n_threads    = get_num_physical_cores();
-    int32_t n_predict    = -1;  // new tokens to predict
-    int32_t n_ctx        = 512; // context size
-    int32_t n_batch      = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep       = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers = 0;   // number of layers to store in VRAM
+    int32_t seed          = -1;  // RNG seed
+    int32_t n_threads     = get_num_physical_cores();
+    int32_t n_predict     = -1;  // new tokens to predict
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

examples/server/server.cpp

Lines changed: 33 additions & 0 deletions
@@ -401,6 +401,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
    fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -503,6 +505,37 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        }
+        else if (arg == "--tensor-split" || arg == "-ts")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
+            {
+                if (i < split_arg.size())
+                {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                }
+                else
+                {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         }
         else
         {
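The server gains the same -ts / --tensor-split option, with argument handling that mirrors examples/common.cpp (only the brace style differs), so the HTTP server can also distribute tensors across multiple GPUs.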
