Skip to content

Commit ac4b252

Browse files
committed
llama : refactor model loader with backend registry
1 parent 19d900a commit ac4b252

14 files changed

+1463
-1469
lines changed

ggml/include/ggml-backend.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,8 @@ extern "C" {
169169

170170

171171
// Functions that may be obtained using ggml_backend_reg_get_proc_address
172-
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
173-
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
172+
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
173+
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
174174

175175
//
176176
// Backend registry

ggml/include/ggml-cuda.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
2828
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
2929

3030
// split tensor buffer that splits matrices by rows across multiple devices
31-
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
31+
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
3232

3333
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
3434
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

ggml/src/ggml-amx.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,6 @@
1616
#if defined(__AMX_INT8__)
1717

1818
// AMX buffer interface
19-
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
20-
return "AMX";
21-
22-
GGML_UNUSED(buffer);
23-
}
24-
2519
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2620
free(buffer->context);
2721
}
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
7266
}
7367

7468
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
75-
/* .get_name = */ ggml_backend_amx_buffer_get_name,
7669
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
7770
/* .get_base = */ ggml_backend_amx_buffer_get_base,
7871
/* .init_tensor = */ NULL, // no initialization required

ggml/src/ggml-backend-impl.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ extern "C" {
2222
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
2323
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
2424
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
25-
// (optional) check if tensor data is in host memory (defaults to false)
25+
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
2626
bool (*is_host) (ggml_backend_buffer_type_t buft);
2727
};
2828

@@ -37,7 +37,6 @@ extern "C" {
3737
//
3838

3939
struct ggml_backend_buffer_i {
40-
const char * (*get_name) (ggml_backend_buffer_t buffer);
4140
// (optional) free the buffer
4241
void (*free_buffer) (ggml_backend_buffer_t buffer);
4342
// base address of the buffer

ggml/src/ggml-backend.cpp

Lines changed: 28 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
3434
}
3535

3636
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
37+
if (size == 0) {
38+
// return a dummy buffer for zero-sized allocations
39+
return ggml_backend_buffer_init(buft, {}, NULL, 0);
40+
}
41+
3742
return buft->iface.alloc_buffer(buft, size);
3843
}
3944

@@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
8994
}
9095

9196
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
92-
return buffer->iface.get_name(buffer);
97+
return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
9398
}
9499

95100
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
108113
}
109114

110115
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
116+
// get_base is optional if the buffer is zero-sized
117+
if (buffer->iface.get_base == NULL && buffer->size == 0) {
118+
return NULL;
119+
}
120+
111121
void * base = buffer->iface.get_base(buffer);
112122

113123
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -238,43 +248,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
238248
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
239249
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
240250

251+
if (size == 0) {
252+
return;
253+
}
254+
241255
GGML_ASSERT(buf != NULL && "tensor buffer not set");
242256
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
243257
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
244258

245-
if (!size) {
246-
return;
247-
}
248-
249259
buf->iface.set_tensor(buf, tensor, data, offset, size);
250260
}
251261

252262
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
253263
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
254264

265+
if (size == 0) {
266+
return;
267+
}
268+
255269
GGML_ASSERT(buf != NULL && "tensor buffer not set");
256270
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257271
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
258272

259-
if (!size) {
260-
return;
261-
}
262-
263273
buf->iface.get_tensor(buf, tensor, data, offset, size);
264274
}
265275

266276
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
267277
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
268278

269-
GGML_ASSERT(buf != NULL && "tensor buffer not set");
270-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
271-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
272-
273-
if (!size) {
279+
if (size == 0) {
274280
return;
275281
}
276282

277-
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
283+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
284+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
285+
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
286+
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
278287

279288
buf->iface.memset_tensor(buf, tensor, value, offset, size);
280289
}
@@ -713,12 +722,6 @@ ggml_backend_t ggml_backend_init_best(void) {
713722

714723
// backend CPU
715724

716-
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
717-
return "CPU";
718-
719-
GGML_UNUSED(buffer);
720-
}
721-
722725
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
723726
uintptr_t data = (uintptr_t)buffer->context;
724727

@@ -767,7 +770,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
767770
}
768771

769772
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
770-
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
771773
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
772774
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
773775
/* .init_tensor = */ NULL, // no initialization required
@@ -780,7 +782,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
780782
};
781783

782784
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
783-
/* .get_name = */ ggml_backend_cpu_buffer_get_name,
784785
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
785786
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
786787
/* .init_tensor = */ NULL, // no initialization required
@@ -799,19 +800,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
799800
}
800801

801802
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
802-
auto alloc_size = size;
803-
if (alloc_size == 0) {
804-
alloc_size = 1;
805-
}
806-
807-
void * data = ggml_aligned_malloc(alloc_size);
803+
void * data = ggml_aligned_malloc(size);
808804

809805
if (data == NULL) {
810-
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
806+
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
811807
return NULL;
812808
}
813809

814-
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
810+
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
815811
}
816812

817813
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -1315,12 +1311,6 @@ struct ggml_backend_multi_buffer_context {
13151311
size_t n_buffers;
13161312
};
13171313

1318-
static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
1319-
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
1320-
1321-
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
1322-
}
1323-
13241314
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
13251315
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
13261316
for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -1339,7 +1329,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
13391329
}
13401330

13411331
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
1342-
/* .get_name = */ ggml_backend_multi_buffer_get_name,
13431332
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
13441333
/* .get_base = */ NULL,
13451334
/* .init_tensor = */ NULL,
@@ -1368,7 +1357,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
13681357
}
13691358

13701359
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
1371-
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
1360+
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
13721361
}
13731362

13741363
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {

ggml/src/ggml-cann.cpp

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context {
489489
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
490490
};
491491

492-
/**
493-
* @brief Retrieve the name associated with a CANN buffer.
494-
*
495-
* This function returns the name of a CANN buffer, which is stored in the
496-
* context of the buffer.
497-
*
498-
* @param buffer The CANN buffer whose name is to be retrieved.
499-
* @return A pointer to a C-string containing the name of the buffer.
500-
*/
501-
502-
static const char* ggml_backend_cann_buffer_get_name(
503-
ggml_backend_buffer_t buffer) {
504-
return "CANN";
505-
506-
GGML_UNUSED(buffer);
507-
}
508-
509492
/**
510493
* @brief Check if a buffer is a CANN buffer.
511494
*
@@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name(
515498
* @param buffer The buffer to check.
516499
* @return true if the buffer is a CANN buffer, false otherwise.
517500
*/
501+
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
518502
static bool ggml_backend_buffer_is_cann(
519503
ggml_backend_buffer_t buffer) {
520-
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
504+
return ggml_backend_buft_is_cann(buffer->buft);
521505
}
522506

523507
/**
@@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear(
965949
* on a CANN buffer within the backend.
966950
*/
967951
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
968-
/* .get_name = */ ggml_backend_cann_buffer_get_name,
969952
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
970953
/* .get_base = */ ggml_backend_cann_buffer_get_base,
971954
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
@@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context {
999982
*/
1000983
static const char* ggml_backend_cann_buffer_type_name(
1001984
ggml_backend_buffer_type_t buft) {
1002-
return "CANN";
985+
ggml_backend_cann_buffer_type_context* buft_ctx =
986+
(ggml_backend_cann_buffer_type_context*)buft->context;
1003987

1004-
GGML_UNUSED(buft);
988+
return buft_ctx->name.c_str();
1005989
}
1006990

1007991
/**

0 commit comments

Comments (0)