Skip to content

Commit 24b79ca

Browse files
committed
llama : refactor model loader with backend registry
1 parent 958367b commit 24b79ca

16 files changed

+1626
-1737
lines changed

ggml/include/ggml-backend.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,8 @@ extern "C" {
169169

170170

171171
// Functions that may be obtained using ggml_backend_reg_get_proc_address
172-
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
173-
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
172+
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
173+
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
174174

175175
//
176176
// Backend registry

ggml/include/ggml-cuda.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
2828
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
2929

3030
// split tensor buffer that splits matrices by rows across multiple devices
31-
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
31+
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
3232

3333
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
3434
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

ggml/src/ggml-amx.cpp

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,6 @@
1616
#if defined(__AMX_INT8__)
1717

1818
// AMX buffer interface
19-
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
20-
return "AMX";
21-
22-
GGML_UNUSED(buffer);
23-
}
24-
2519
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2620
free(buffer->context);
2721
}
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
7266
}
7367

7468
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
75-
/* .get_name = */ ggml_backend_amx_buffer_get_name,
7669
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
7770
/* .get_base = */ ggml_backend_amx_buffer_get_base,
7871
/* .init_tensor = */ NULL, // no initialization required
@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
149142
delete backend;
150143
}
151144

152-
static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
153-
return ggml_backend_amx_buffer_type();
154-
155-
GGML_UNUSED(backend);
156-
}
157-
158145
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
159146
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
160147

@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
187174
static struct ggml_backend_i ggml_backend_amx_i = {
188175
/* .get_name = */ ggml_backend_amx_name,
189176
/* .free = */ ggml_backend_amx_free,
190-
/* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
191177
/* .set_tensor_async = */ NULL,
192178
/* .get_tensor_async = */ NULL,
193179
/* .cpy_tensor_async = */ NULL,
@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
197183
/* .graph_plan_update = */ NULL,
198184
/* .graph_plan_compute = */ NULL,
199185
/* .graph_compute = */ ggml_backend_amx_graph_compute,
200-
/* .supports_op = */ NULL,
201-
/* .supports_buft = */ NULL,
202-
/* .offload_op = */ NULL,
203186
/* .event_record = */ NULL,
204187
/* .event_wait = */ NULL,
205188
};

ggml/src/ggml-backend-impl.h

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ extern "C" {
2222
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
2323
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
2424
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
25-
// (optional) check if tensor data is in host memory (defaults to false)
25+
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
2626
bool (*is_host) (ggml_backend_buffer_type_t buft);
2727
};
2828

@@ -37,7 +37,6 @@ extern "C" {
3737
//
3838

3939
struct ggml_backend_buffer_i {
40-
const char * (*get_name) (ggml_backend_buffer_t buffer);
4140
// (optional) free the buffer
4241
void (*free_buffer) (ggml_backend_buffer_t buffer);
4342
// base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
8887

8988
void (*free)(ggml_backend_t backend);
9089

91-
// Will be moved to the device interface
92-
// buffer allocation
93-
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
94-
9590
// (optional) asynchronous tensor data access
9691
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
9792
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
9893
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
9994

100-
// (optional) complete all pending operations
95+
// (optional) complete all pending operations (required if the backend supports async operations)
10196
void (*synchronize)(ggml_backend_t backend);
10297

103-
// (optional) compute graph with a plan (not used currently)
98+
// (optional) graph plans
99+
// compute graph with a plan (not used currently)
104100
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
105101
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
106102
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
111107
// compute graph (always async if supported by the backend)
112108
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
113109

114-
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
115-
// new backends should implement the device interface instead
116-
// These functions are being moved to the device interface
117-
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
118-
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
119-
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
120-
121110
// (optional) event synchronization
122111
// record an event on this stream
123112
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);

0 commit comments

Comments
 (0)