Skip to content

Commit 24b79ca

Browse files
committed
llama : refactor model loader with backend registry
1 parent 958367b commit 24b79ca

16 files changed

+1626
-1737
lines changed

ggml/include/ggml-backend.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,8 @@ extern "C" {
169169

170170

171171
// Functions that may be obtained using ggml_backend_reg_get_proc_address
172-
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
173-
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
172+
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
173+
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
174174

175175
//
176176
// Backend registry

ggml/include/ggml-cuda.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
2828
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
2929

3030
// split tensor buffer that splits matrices by rows across multiple devices
31-
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
31+
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
3232

3333
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
3434
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

ggml/src/ggml-amx.cpp

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,6 @@
1616
#if defined(__AMX_INT8__)
1717

1818
// AMX buffer interface
19-
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
20-
return "AMX";
21-
22-
GGML_UNUSED(buffer);
23-
}
24-
2519
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2620
free(buffer->context);
2721
}
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
7266
}
7367

7468
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
75-
/* .get_name = */ ggml_backend_amx_buffer_get_name,
7669
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
7770
/* .get_base = */ ggml_backend_amx_buffer_get_base,
7871
/* .init_tensor = */ NULL, // no initialization required
@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
149142
delete backend;
150143
}
151144

152-
static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
153-
return ggml_backend_amx_buffer_type();
154-
155-
GGML_UNUSED(backend);
156-
}
157-
158145
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
159146
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
160147

@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
187174
static struct ggml_backend_i ggml_backend_amx_i = {
188175
/* .get_name = */ ggml_backend_amx_name,
189176
/* .free = */ ggml_backend_amx_free,
190-
/* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
191177
/* .set_tensor_async = */ NULL,
192178
/* .get_tensor_async = */ NULL,
193179
/* .cpy_tensor_async = */ NULL,
@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
197183
/* .graph_plan_update = */ NULL,
198184
/* .graph_plan_compute = */ NULL,
199185
/* .graph_compute = */ ggml_backend_amx_graph_compute,
200-
/* .supports_op = */ NULL,
201-
/* .supports_buft = */ NULL,
202-
/* .offload_op = */ NULL,
203186
/* .event_record = */ NULL,
204187
/* .event_wait = */ NULL,
205188
};

ggml/src/ggml-backend-impl.h

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ extern "C" {
2222
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
2323
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
2424
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
25-
// (optional) check if tensor data is in host memory (defaults to false)
25+
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
2626
bool (*is_host) (ggml_backend_buffer_type_t buft);
2727
};
2828

@@ -37,7 +37,6 @@ extern "C" {
3737
//
3838

3939
struct ggml_backend_buffer_i {
40-
const char * (*get_name) (ggml_backend_buffer_t buffer);
4140
// (optional) free the buffer
4241
void (*free_buffer) (ggml_backend_buffer_t buffer);
4342
// base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
8887

8988
void (*free)(ggml_backend_t backend);
9089

91-
// Will be moved to the device interface
92-
// buffer allocation
93-
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
94-
9590
// (optional) asynchronous tensor data access
9691
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
9792
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
9893
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
9994

100-
// (optional) complete all pending operations
95+
// (optional) complete all pending operations (required if the backend supports async operations)
10196
void (*synchronize)(ggml_backend_t backend);
10297

103-
// (optional) compute graph with a plan (not used currently)
98+
// (optional) graph plans
99+
// compute graph with a plan (not used currently)
104100
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
105101
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
106102
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
111107
// compute graph (always async if supported by the backend)
112108
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
113109

114-
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
115-
// new backends should implement the device interface instead
116-
// These functions are being moved to the device interface
117-
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
118-
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
119-
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
120-
121110
// (optional) event synchronization
122111
// record an event on this stream
123112
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);

0 commit comments

Comments
 (0)