Commit 8e6735e

llama : initial ggml-backend integration
1 parent b1306c4 commit 8e6735e

8 files changed: +386 -546 lines changed

Makefile

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ test: $(TEST_TARGETS)
 			./$$test_target; \
 		fi; \
 		if [ $$? -ne 0 ]; then \
-			printf 'Test $$test_target FAILED!\n\n' $$test_target; \
+			printf 'Test %s FAILED!\n\n' $$test_target; \
 			failures=$$(( failures + 1 )); \
 		else \
 			printf 'Test %s passed.\n\n' $$test_target; \
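The one-line fix above is a classic printf format-string bug: in the old recipe line, $$test_target sits inside the single-quoted format string, so the shell never expands it and the trailing argument goes unused; the new line uses a %s conversion that actually consumes the argument. A minimal C sketch of the same mistake (illustrative only, not from the repo):

#include <stdio.h>

int main(void) {
    const char * test_target = "test-grad0"; // hypothetical test name

    // Bug: the variable name is baked into the format string,
    // so the literal text "$test_target" is printed.
    printf("Test $test_target FAILED!\n");

    // Fix: a %s conversion substitutes the argument.
    printf("Test %s FAILED!\n", test_target);

    return 0;
}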

ggml-alloc.c

Lines changed: 7 additions & 4 deletions
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-    view->buffer = view->view_src->buffer;
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer = alloc->buffer;
     view->data   = (char *)view->view_src->data + view->view_offs;
 
-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
     if (!alloc->measure) {

@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }
 
 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);

@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     }
 
     if (nbytes == 0) {
-        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        //fprintf(stderr, "%s: no tensors to allocate\n", __func__);
         return NULL;
     }
 

ggml-backend.c

Lines changed: 45 additions & 4 deletions
@@ -378,7 +378,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

@@ -456,7 +455,7 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
 }
 
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,

@@ -466,8 +465,50 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
         /* .context = */ NULL,
     };
 
-    return &ggml_backend_buffer_type_cpu;
+    return &ggml_backend_cpu_buffer_type;
+}
+
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+
+// HBM buffer type
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+    /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface   = */ cpu_backend_hbm_buffer_type_interface,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
 }
+#endif
 
 struct ggml_backend_cpu_context {
     int n_threads;

@@ -505,7 +546,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
     cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
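The "// FIXME: deep copy" note flags that cpu_plan->cgraph = *cgraph is a plain struct assignment: pointer members are copied, but not the data they point to, so the plan still aliases the caller's graph storage. A standalone illustration with hypothetical types (not ggml's actual layout):

#include <stdio.h>

struct graph {
    int   n_nodes;
    int * nodes; // pointer member: only the pointer is copied on assignment
};

int main(void) {
    int storage[3] = {1, 2, 3};
    struct graph a = { 3, storage };
    struct graph b = a;  // shallow copy: b.nodes aliases a.nodes

    b.nodes[0] = 42;     // also visible through a

    printf("%d\n", a.nodes[0]); // prints 42
    return 0;
}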

ggml-backend.h

Lines changed: 4 additions & 0 deletions
@@ -76,6 +76,10 @@ extern "C" {
 
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
     //
     // Backend registry
     //
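Together with the ggml-alloc.c hunk above, these declarations let callers choose where a context's tensors are placed. The sketch below is hedged: it assumes ggml_backend_alloc_ctx_tensors_from_buft (whose definition appears in this commit's ggml-alloc.c) is exposed by ggml-alloc.h, and that the context was created with no_alloc = true so its tensors have no data yet:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Allocate all tensors of a no_alloc context in a chosen CPU buffer type.
ggml_backend_buffer_t alloc_ctx_tensors_cpu(struct ggml_context * ctx) {
#ifdef GGML_USE_CPU_HBM
    // prefer high-bandwidth memory when built with hbwmalloc support
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_hbm_buffer_type();
#else
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
#endif
    // returns NULL when the context has no tensors to allocate
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
}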

ggml-cuda.cu

Lines changed: 31 additions & 26 deletions
@@ -7057,6 +7057,7 @@ inline void ggml_cuda_op_upscale(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_pad(

@@ -7073,6 +7074,7 @@ inline void ggml_cuda_op_pad(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_rms_norm(

@@ -8958,7 +8960,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
         char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        char * buf_host = (char*)data + offset_split;
+        char * buf_host = (char *)data + offset_split;
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {

@@ -9103,11 +9105,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
 
     ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+    const bool inplace = tensor->view_src != nullptr;
 
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {

@@ -9431,19 +9432,25 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-    UNUSED(buffer);
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
 }
 
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-    UNUSED(buffer);
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
 }
 
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {

@@ -9505,35 +9512,35 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
-    static bool ggml_backend_buffer_type_cuda_initialized = false;
-    if (!ggml_backend_buffer_type_cuda_initialized) {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
         for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_buffer_type_cuda[i] = {
-                /* .iface   = */ cuda_backend_buffer_type_interface,
+            ggml_backend_cuda_buffer_types[i] = {
+                /* .iface   = */ ggml_backend_cuda_buffer_type_interface,
                 /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
             };
         }
-        ggml_backend_buffer_type_cuda_initialized = true;
+        ggml_backend_cuda_buffer_type_initialized = true;
     }
 
-    return &ggml_backend_buffer_type_cuda[device];
+    return &ggml_backend_cuda_buffer_types[device];
 }
 
 // host buffer type
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
-    delete ctx;
+    CUDA_CHECK(cudaFreeHost(buffer->context));
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {

@@ -9546,24 +9553,22 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
-
-    UNUSED(buft);
 }
 
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = {
     /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
     /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
-        /* .iface   = */ cuda_backend_host_buffer_type_interface,
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface   = */ ggml_backend_cuda_host_buffer_type_interface,
         /* .context = */ nullptr,
     };
 
-    return &ggml_backend_buffer_type_cuda_host;
+    return &ggml_backend_cuda_buffer_type_host;
 }
 
 // backend
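The set_tensor/get_tensor changes stop assuming the right device is current and the stream is idle: they now switch to the buffer's device and synchronize before the blocking cudaMemcpy. A self-contained sketch of that pattern using only the plain CUDA runtime API (function and variable names are illustrative, not from the commit):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Copy host memory into a tensor that lives on a specific device.
static void copy_to_device(int device, void * dst, const void * src, size_t size) {
    cudaSetDevice(device);   // operate on the device that owns dst
    cudaDeviceSynchronize(); // drain pending kernels before the blocking copy

    cudaError_t err = cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        exit(1);
    }
}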

ggml.c

Lines changed: 8 additions & 16 deletions
@@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
 size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     size_t max_size = 0;
 
-    struct ggml_object * obj = ctx->objects_begin;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TENSOR) {
-            struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
-
-            const size_t size = ggml_nbytes(tensor);
-
-            if (max_size < size) {
-                max_size = size;
-            }
-        }
-
-        obj = obj->next;
+    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+        max_size = MAX(max_size, ggml_nbytes(tensor));
     }
 
     return max_size;

@@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
-struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
 
     char * const mem_buffer = ctx->mem_buffer;

@@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     return NULL;
 }
 
-struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
 

@@ -19179,6 +19167,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
 
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
+    return ctx->infos[i].type;
+}
+
 // returns the index
 static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int idx = gguf_find_key(ctx, key);
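The rewritten ggml_get_max_tensor_size shows the intended idiom for walking a context's tensors with the now const-correct enumeration API. A short example built from the same calls that appear in this diff:

#include "ggml.h"

// Sum the sizes of all tensors in a context, using the same
// ggml_get_first_tensor/ggml_get_next_tensor loop as the diff above.
static size_t total_tensor_bytes(const struct ggml_context * ctx) {
    size_t total = 0;
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx);
         t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        total += ggml_nbytes(t);
    }
    return total;
}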

ggml.h

Lines changed: 7 additions & 6 deletions
@@ -729,8 +729,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
     // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);

@@ -2123,10 +2123,11 @@ extern "C" {
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
 
-    GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
+    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
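The new gguf_get_tensor_type accessor rounds out the per-tensor GGUF queries. A hedged usage sketch; gguf_init_from_file, ggml_type_name, and gguf_free are existing ggml APIs, and "model.gguf" is an illustrative path:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct gguf_init_params params = {
        /* .no_alloc = */ true, // metadata only, do not load tensor data
        /* .ctx      = */ NULL,
    };

    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model.gguf\n");
        return 1;
    }

    for (int i = 0; i < gguf_get_n_tensors(ctx); i++) {
        printf("%-40s %-8s offset=%zu\n",
               gguf_get_tensor_name(ctx, i),
               ggml_type_name(gguf_get_tensor_type(ctx, i)),
               gguf_get_tensor_offset(ctx, i));
    }

    gguf_free(ctx);
    return 0;
}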
