Commit 76b4183

John committed
Added CUDA integration from JohannesGaessler's git
- disabled offload of non-layer tensors for now (not working yet)
- corrected tensor size calculation for VRAM
- added some more detailed VRAM reporting
- added an automated skip of tensors that would not fit in VRAM (significant slowdown if --ngl is too high, probably from temporary CUDA buffer copies)
- added vram_overhead and vram_reserved - those are not pretty but currently needed to get the VRAM usage right
- moved the VRAM scratch buffer allocation up a bit so the usage is available for the skip
1 parent a8bb0fe commit 76b4183

File tree: 1 file changed (+115, -38 lines)

libfalcon.cpp

Lines changed: 115 additions & 38 deletions
@@ -12,6 +12,7 @@
 
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
+#include <cuda_runtime.h>
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
@@ -1010,6 +1011,33 @@ static const char *falcon_model_type_name(e_model type) {
     }
 }
 
+// dynamically gets all tensors from a layer
+std::vector<ggml_tensor*> get_tensors_from_layer(falcon_layer& layer) {
+    std::vector<ggml_tensor*> tensors;
+    ggml_tensor** tensor_ptr = reinterpret_cast<ggml_tensor**>(&layer); // Cast to the pointer to ggml_tensor pointer
+
+    // Iterate through the members and store their addresses in the vector
+    for (std::size_t i = 0; i < sizeof(falcon_layer) / sizeof(ggml_tensor*); ++i) {
+        tensors.push_back(tensor_ptr[i]);
+    }
+
+    return tensors;
+}
+// get vram size of all tensors in a layer (todo: split handling)
+size_t calculate_layer_vram_bytes(const falcon_layer& layer) {
+    size_t size = 0;
+    auto tensors = get_tensors_from_layer(const_cast<falcon_layer&>(layer));
+
+    // Add the size of each member with GPU backend
+    for (const auto& tensor : tensors) {
+        if (tensor != nullptr && tensor->backend != GGML_BACKEND_CPU) {
+            size += ggml_nbytes(tensor);
+        }
+    }
+
+    return size;
+}
+
 static void falcon_model_load_internal(
         const std::string & fname,
         falcon_context & lctx,
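Note on the two helpers above: they treat falcon_layer as a flat array of ggml_tensor* members, so the member count falls out of sizeof(falcon_layer) / sizeof(ggml_tensor*); the trick silently breaks if the struct ever gains a non-pointer field. A minimal standalone sketch of the same pattern, using hypothetical toy types instead of the project's headers:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct tensor;                      // opaque stand-in for ggml_tensor

    struct toy_layer {                  // stand-in for falcon_layer: pointer members only
        tensor* input_layernorm;
        tensor* query_key_value;
        tensor* wo;
        tensor* ffn_up;
        tensor* ffn_down;
    };

    static std::vector<tensor*> members_of(toy_layer& layer) {
        std::vector<tensor*> out;
        tensor** p = reinterpret_cast<tensor**>(&layer);  // same cast as get_tensors_from_layer
        for (std::size_t i = 0; i < sizeof(toy_layer) / sizeof(tensor*); ++i) {
            out.push_back(p[i]);
        }
        return out;
    }

    int main() {
        toy_layer layer = {};                              // all members null
        std::printf("members found: %zu\n", members_of(layer).size());  // prints 5
        return 0;
    }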
@@ -1033,6 +1061,7 @@ static void falcon_model_load_internal(
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
+
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
 
@@ -1123,6 +1152,7 @@ static void falcon_model_load_internal(
 
     (void) main_gpu;
 #if defined(GGML_USE_CUBLAS)
+    if (n_gpu_layers > 0)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -1136,9 +1166,31 @@ static void falcon_model_load_internal(
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
 
+    size_t vram_total=0;
+    size_t vram_free=0;
+    size_t vram_reserved=1024*1024*512; //will be adapted by model
+#if defined(GGML_USE_CUBLAS)
+    cudaMemGetInfo(&vram_free, &vram_total); // this should go in ggml-cuda.cu but I don't want to make Johannes life harder by modifying that yet
+    fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (already used: %7.2f MB)\n", __func__, vram_free/MB*1.0, vram_total/MB*1.0, (vram_total-vram_free)/MB*1.0);
+#endif
+
     // prepare memory for the weights
     size_t vram_weights = 0;
     size_t vram_scratch = 0;
+    size_t vram_overhead = 0;
+    (void) vram_scratch;
+    (void) n_batch;
+    // calculate scratch buffer size and allocate it
+#ifdef GGML_USE_CUBLAS
+    vram_scratch = n_batch * MB;
+    ggml_cuda_set_scratch_size(vram_scratch);
+    if (n_gpu_layers > 0) {
+
+        fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+            __func__, vram_scratch / MB);
+    }
+#endif // GGML_USE_CUBLAS
+
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_head = hparams.n_head;
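The free/total numbers reported in this hunk come straight from cudaMemGetInfo. A minimal standalone sketch of that query (MB is defined locally because libfalcon's constant is not in scope; requires the CUDA runtime):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        size_t vram_free = 0, vram_total = 0;
        const double MB = 1024.0 * 1024.0;
        if (cudaMemGetInfo(&vram_free, &vram_total) != cudaSuccess) {
            std::fprintf(stderr, "cudaMemGetInfo failed\n");
            return 1;
        }
        std::printf("VRAM free: %7.2f MB of %7.2f MB (already used: %7.2f MB)\n",
                    vram_free / MB, vram_total / MB, (vram_total - vram_free) / MB);
        return 0;
    }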
@@ -1152,11 +1204,25 @@ static void falcon_model_load_internal(
 
         model.tok_embeddings = ml->get_tensor("transformer.word_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
+        // I did not analyze the cause but that's the overhead that is dynamically added to the VRAM at first inference
+        // same goes with reserved, most likely we can skip both for a proper size calculation.
+        // If the below values are not correct GPU memory will fill up to 100%, resulting in a extreme slowdown of inference
+        if (model.type == FALCON_40B)
+        {
+            vram_reserved=1900*MB;
+            vram_overhead+=2700*MB;
+        }
+        else
+        {
+            vram_reserved=768*MB;
+            vram_overhead+=1200*MB;
+        }
 
 
         ggml_backend backend_norm;
         ggml_backend backend_output;
-        if (n_gpu_layers > int(n_layer)) { // NOLINT
+        // disabled norm/output offloading until further tests, causes silent crash at the moment
+        if (n_gpu_layers > int(n_layer) && false) { // NOLINT
             backend_norm = LLAMA_BACKEND_OFFLOAD;
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
         } else {
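As a rough worked example of the headroom these constants imply (taking n_batch = 512 purely for illustration, so the vram_scratch allocated above would be 512 MB): layer offloading stops once the simulated free VRAM drops to vram_overhead + vram_scratch + vram_reserved, i.e. about 2700 + 512 + 1900 = 5112 MB for FALCON_40B and 1200 + 512 + 768 = 2480 MB for the 7B settings.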
@@ -1172,12 +1238,26 @@ static void falcon_model_load_internal(
             model.lm_head = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, backend_output);
         }
 
+        if (backend_norm != GGML_BACKEND_CPU)
+        {
+            vram_weights += ggml_nbytes(model.output_norm);
+            vram_weights += ggml_nbytes(model.output_norm_b);
+            vram_free -= ggml_nbytes(model.output_norm);
+            vram_free -= ggml_nbytes(model.output_norm_b);
+        }
+        if (backend_output != GGML_BACKEND_CPU)
+        {
+            vram_weights += ggml_nbytes(model.lm_head);
+            vram_free -= ggml_nbytes(model.lm_head);
+        }
+
         const int i_gpu_start = n_layer - n_gpu_layers;
+        int i_gpu_end = n_layer; // allows to terminate the offloading earlier. TODO: instead do a proper calculation run and determine the start before the loop
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend backend = (int(i) < i_gpu_start || int(i) >= i_gpu_end) ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend_split = (int(i) < i_gpu_start || int(i) >= i_gpu_end) ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
 
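The reworked ternaries turn layer placement into a half-open window: only layers in [i_gpu_start, i_gpu_end) get a GPU backend, and i_gpu_end can be pulled down later by the VRAM check. A small sketch with hypothetical counts, just to show which layers land on the GPU:

    #include <cstdio>

    int main() {
        const int n_layer      = 32;                     // hypothetical layer count
        const int n_gpu_layers = 20;                     // hypothetical --ngl value
        const int i_gpu_start  = n_layer - n_gpu_layers; // 12
        const int i_gpu_end    = n_layer;                // lowered if VRAM runs short

        int on_gpu = 0;
        for (int i = 0; i < n_layer; ++i) {
            const bool gpu = !(i < i_gpu_start || i >= i_gpu_end);
            if (gpu) ++on_gpu;
        }
        std::printf("layers %d..%d on GPU (%d of %d)\n", i_gpu_start, i_gpu_end - 1, on_gpu, n_layer);
        return 0;
    }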
@@ -1201,31 +1281,26 @@ static void falcon_model_load_internal(
 
             layer.ffn_up = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_h_to_4h.weight", {n_embd, n_ff}, backend_split); // before gelu
             layer.ffn_down = ml->get_tensor("transformer.h."+str_i + ".mlp.dense_4h_to_h.weight", {n_ff, n_embd}, backend_split); // after gelu
+
+            if (backend != GGML_BACKEND_CPU)
+            {
+                size_t vram_layer = 0;
+                vram_layer = calculate_layer_vram_bytes(layer);
+                vram_weights += vram_layer;
+                vram_free = (vram_layer > vram_free) ? 0 : vram_free - vram_layer; // simulate the layer being loaded in VRAM
 
-            if (backend == GGML_BACKEND_GPU) {
-                // llama:
-                // vram_weights +=
-                //     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                //     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                //     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
-                // falcon:
-                if (model.type == FALCON_40B)
-                {
-                    vram_weights +=
-                        ggml_nbytes(layer.input_layernorm) + ggml_nbytes(layer.input_layernorm_b) +
-                        ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.attention_norm_b) +
-                        ggml_nbytes(layer.wo) + ggml_nbytes(layer.wo) +
-                        ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down) +
-                        ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up);
-                } else // FALCON_7B
+                if (vram_free <= (vram_overhead+vram_scratch+vram_reserved))
                 {
-                    vram_weights +=
-                        ggml_nbytes(layer.input_layernorm) + ggml_nbytes(layer.input_layernorm_b) +
-                        ggml_nbytes(layer.wo) + ggml_nbytes(layer.wo) +
-                        ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down) +
-                        ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up);
+                    // this needs some polishing (instead of fiddling with --ngl I'd like the option to auto-fill the vram with as many layers as possible as an alternative)
+                    fprintf(stderr, "WARNING: Not enough VRAM to load the model as configured - at layer %d of %d\n", i, n_layer);
+                    n_gpu_layers = i+1;
+                    model.n_gpu_layers = n_gpu_layers;
+                    i_gpu_end = i;
                 }
             }
+
+
+
         }
     }
 
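The skip only simulates memory pressure: each offloaded layer's byte count is subtracted from the free-VRAM estimate taken before loading, and once that estimate falls to the overhead + scratch + reserved headroom, the remaining layers stay on the CPU and n_gpu_layers is clamped. A standalone sketch of that bookkeeping with made-up sizes (the real code uses calculate_layer_vram_bytes() and the values set earlier):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        const size_t MB       = 1024 * 1024;
        size_t vram_free      = 8000 * MB;                    // hypothetical free VRAM at load time
        const size_t headroom = (1200 + 512 + 768) * MB;      // overhead + scratch + reserved (7B-style numbers)
        const std::vector<size_t> layer_bytes(60, 180 * MB);  // hypothetical per-layer footprint

        size_t offloaded = 0;
        for (size_t i = 0; i < layer_bytes.size(); ++i) {
            vram_free = (layer_bytes[i] > vram_free) ? 0 : vram_free - layer_bytes[i]; // simulate loading
            ++offloaded;
            if (vram_free <= headroom) {
                std::printf("stopping offload at layer %zu, %zu MB would remain free\n", i, vram_free / MB);
                break;
            }
        }
        std::printf("layers offloaded in simulation: %zu\n", offloaded);
        return 0;
    }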
@@ -1251,25 +1326,17 @@ static void falcon_model_load_internal(
     fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
             mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-    (void) vram_scratch;
-    (void) n_batch;
-#ifdef GGML_USE_CUBLAS
-    vram_scratch = n_batch * MB;
-    ggml_cuda_set_scratch_size(vram_scratch);
-    if (n_gpu_layers > 0) {
-        fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
-            __func__, vram_scratch / MB);
-    }
-#endif // GGML_USE_CUBLAS
+    // moved scratch allocation of vram to top
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-    fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+    fprintf(stderr, "%s: offloading %d of %d layers to GPU, weights offloaded %7.2f MB\n",
+        __func__, n_gpu, hparams.n_layer, vram_weights / 1024.0 / 1024.0);
     if (n_gpu_layers > (int) hparams.n_layer) {
         fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
     }
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-        __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+        __func__, (vram_weights + vram_scratch + vram_overhead + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
 #endif
@@ -1293,13 +1360,22 @@ static void falcon_model_load_internal(
         progress_callback(1.0f, progress_callback_user_data);
     }
 
+#if defined(GGML_USE_CUBLAS)
+    //size_t vram_free_simulated = vram_free;
+    cudaMemGetInfo(&vram_free, &vram_total); // this should go in ggml-cuda.cu but I don't want to make Johannes life harder by modifying that yet
+    fprintf(stderr, "%s: VRAM free: %7.2f MB of %7.2f MB (used: %7.2f MB)\n", __func__, vram_free/MB*1.0, vram_total/MB*1.0, (vram_total-vram_free)/MB*1.0);
+
+#endif
+
+
     model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+
 }
-
+#include <windows.h>
 static bool falcon_model_load(
         const std::string & fname,
         falcon_context & lctx,
@@ -2591,7 +2667,7 @@ struct falcon_context * falcon_init_from_file(
     ggml_time_init();
 
     falcon_context * ctx = new falcon_context;
-
+
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
@@ -2625,6 +2701,7 @@ struct falcon_context * falcon_init_from_file(
         llama_free(ctx);
         return nullptr;
     }
+    params.n_gpu_layers = ctx->model.n_gpu_layers; // model_load_internal() may change this
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
