From 2674f02e4fc107e8ea98f383accfa649e958eeb3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Jan 2025 12:33:16 +0200 Subject: [PATCH 1/5] metal : use residency sets ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 104 +++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a85502ee0893b..7bf777cae228c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -19,6 +19,11 @@ // max number of MTLCommandBuffer used to submit a graph for processing #define GGML_METAL_MAX_COMMAND_BUFFERS 8 +// create residency sets only on macOS >= 15.0 +#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 +#define GGML_METAL_HAS_RESIDENCY_SETS 1 +#endif + #define UNUSED(x) (void)(x) // globals @@ -39,6 +44,7 @@ bool has_simdgroup_reduction; bool has_simdgroup_mm; + bool has_residency_sets; bool has_bfloat; bool use_bfloat; @@ -48,6 +54,7 @@ /*.mtl_device_ref_count =*/ 0, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, + /*.has_residency_sets =*/ false, /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, /*.name =*/ "", @@ -65,6 +72,10 @@ ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; +#if defined(GGML_METAL_HAS_RESIDENCY_SETS) + ctx->has_residency_sets = true; +#endif + ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; @@ -483,6 +494,11 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->queue = [device newCommandQueue]; + if (ctx->queue == nil) { + GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); + return NULL; + } + ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); id metal_library; @@ -649,6 +665,7 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false"); GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false"); + GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false"); GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); @@ -1035,8 +1052,60 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap int n_buffers; struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; + + // optional MTLResidencySet + id rset; }; +// rset init +static bool ggml_backend_metal_buffer_rset_init(struct ggml_backend_metal_buffer_context * ctx, id device) { +#if defined(GGML_METAL_HAS_RESIDENCY_SETS) + if (@available(macOS 15.0, *)) { + MTLResidencySetDescriptor * desc; + desc = [[MTLResidencySetDescriptor alloc] init]; + desc.label = @"ggml_backend_metal"; + desc.initialCapacity = ctx->n_buffers; + + NSError * error; + ctx->rset = [device newResidencySetWithDescriptor:desc error:&error]; + if (error) { + GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return false; + } + + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->rset addAllocation:ctx->buffers[i].metal]; + } + + [ctx->rset commit]; + [ctx->rset requestResidency]; + + return true; + } +#else + GGML_UNUSED(device); +#endif + + ctx->rset = nil; + + return true; +} + +// rset free +static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) { +#if defined(GGML_METAL_HAS_RESIDENCY_SETS) + if (@available(macOS 15.0, *)) { + if (ctx->rset) { + [ctx->rset endResidency]; + [ctx->rset removeAllAllocations]; + [ctx->rset release]; + } + } +#else + GGML_UNUSED(ctx); +#endif +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -4086,7 +4155,7 @@ static enum ggml_status ggml_metal_graph_compute( // the main thread commits the first few commands immediately // command_buffer[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; + id command_buffer = [ctx->queue commandBuffer]; ctx->command_buffers[n_cb] = command_buffer; [command_buffer enqueue]; @@ -4096,7 +4165,7 @@ static enum ggml_status ggml_metal_graph_compute( // prepare the rest of the command buffers asynchronously // command_buffer[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; + id command_buffer = [ctx->queue commandBuffer]; ctx->command_buffers[cb_idx] = command_buffer; // always enqueue the first two command buffers @@ -4176,6 +4245,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) for (int i = 0; i < ctx->n_buffers; i++) { [ctx->buffers[i].metal release]; } + + ggml_backend_metal_buffer_rset_free(ctx); ggml_backend_metal_device_rel(buffer->buft->device->context); if (ctx->owned) { @@ -4284,7 +4355,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba size_aligned += (size_page - (size_aligned % size_page)); } - id device = ggml_backend_metal_device_acq(buft->device->context); + struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context; + id device = ggml_backend_metal_device_acq(ctx_dev); ctx->all_data = ggml_metal_host_malloc(size_aligned); ctx->all_size = size_aligned; @@ -4307,7 +4379,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); - ggml_backend_metal_device_rel(buft->device->context); + ggml_backend_metal_device_rel(ctx_dev); + return NULL; + } + + if (!ggml_backend_metal_buffer_rset_init(ctx, device)) { + GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); + free(ctx); + ggml_backend_metal_device_rel(ctx_dev); return NULL; } @@ -4400,7 +4479,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz size_aligned += (size_page - (size_aligned % size_page)); } - id device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main); + struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main; + id device = ggml_backend_metal_device_acq(ctx_dev); // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { @@ -4453,6 +4533,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz } } + if (!ggml_backend_metal_buffer_rset_init(ctx, device)) { + GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); + free(ctx); + ggml_backend_metal_device_rel(ctx_dev); + return NULL; + } + return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); } @@ -4766,6 +4853,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back } } + if (!ggml_backend_metal_buffer_rset_init(ctx, device)) { + GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); + free(ctx); + ggml_backend_metal_device_rel(ctx_dev); + return NULL; + } + return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); } From 7fb39e39e9b5e94436ff8b35792fabfd45a2e391 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Jan 2025 16:32:41 +0200 Subject: [PATCH 2/5] metal : restore commandBufferWithUnretainedReferences calls [no ci] --- ggml/src/ggml-metal/ggml-metal.m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 7bf777cae228c..103c151e5760b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4155,7 +4155,7 @@ static enum ggml_status ggml_metal_graph_compute( // the main thread commits the first few commands immediately // command_buffer[n_cb] { - id command_buffer = [ctx->queue commandBuffer]; + id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; ctx->command_buffers[n_cb] = command_buffer; [command_buffer enqueue]; @@ -4165,7 +4165,7 @@ static enum ggml_status ggml_metal_graph_compute( // prepare the rest of the command buffers asynchronously // command_buffer[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBuffer]; + id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; ctx->command_buffers[cb_idx] = command_buffer; // always enqueue the first two command buffers From b9126fe364cd13e62c1b10c7f845b9ae021870b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Jan 2025 16:38:54 +0200 Subject: [PATCH 3/5] metal : release descriptors ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 103c151e5760b..e76d369134d04 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1070,9 +1070,12 @@ static bool ggml_backend_metal_buffer_rset_init(struct ggml_backend_metal_buffer ctx->rset = [device newResidencySetWithDescriptor:desc error:&error]; if (error) { GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + [desc release]; return false; } + [desc release]; + for (int i = 0; i < ctx->n_buffers; i++) { [ctx->rset addAllocation:ctx->buffers[i].metal]; } From 9dc5ef45d8fffc0a41e672aa768cb581af5a82c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Jan 2025 19:31:08 +0200 Subject: [PATCH 4/5] metal : check env GGML_METAL_NO_RESIDENCY ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e76d369134d04..d2314da2f5764 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -73,7 +73,7 @@ ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; #if defined(GGML_METAL_HAS_RESIDENCY_SETS) - ctx->has_residency_sets = true; + ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL; #endif ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -1058,11 +1058,19 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { }; // rset init -static bool ggml_backend_metal_buffer_rset_init(struct ggml_backend_metal_buffer_context * ctx, id device) { +static bool ggml_backend_metal_buffer_rset_init( + struct ggml_backend_metal_buffer_context * ctx, + struct ggml_backend_metal_device_context * ctx_dev, + id device) { + ctx->rset = nil; + #if defined(GGML_METAL_HAS_RESIDENCY_SETS) if (@available(macOS 15.0, *)) { - MTLResidencySetDescriptor * desc; - desc = [[MTLResidencySetDescriptor alloc] init]; + if (!ctx_dev->has_residency_sets) { + return true; + } + + MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init]; desc.label = @"ggml_backend_metal"; desc.initialCapacity = ctx->n_buffers; @@ -1089,8 +1097,6 @@ static bool ggml_backend_metal_buffer_rset_init(struct ggml_backend_metal_buffer GGML_UNUSED(device); #endif - ctx->rset = nil; - return true; } @@ -4386,7 +4392,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba return NULL; } - if (!ggml_backend_metal_buffer_rset_init(ctx, device)) { + if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); free(ctx); ggml_backend_metal_device_rel(ctx_dev); @@ -4536,7 +4542,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz } } - if (!ggml_backend_metal_buffer_rset_init(ctx, device)) { + if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); free(ctx); ggml_backend_metal_device_rel(ctx_dev); @@ -4856,7 +4862,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back } } - if (!ggml_backend_metal_buffer_rset_init(ctx, device)) { + if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); free(ctx); ggml_backend_metal_device_rel(ctx_dev); From 225d2e0ca1d7a7e627f2cea4a43dd77a83b9f078 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 Jan 2025 19:35:02 +0200 Subject: [PATCH 5/5] metal : fix build + clean-up ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 37 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index d2314da2f5764..c9474345da971 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -24,8 +24,6 @@ #define GGML_METAL_HAS_RESIDENCY_SETS 1 #endif -#define UNUSED(x) (void)(x) - // globals // overload of MTLGPUFamilyMetal3 (not available in some environments) @@ -1064,12 +1062,12 @@ static bool ggml_backend_metal_buffer_rset_init( id device) { ctx->rset = nil; + if (!ctx_dev->has_residency_sets) { + return true; + } + #if defined(GGML_METAL_HAS_RESIDENCY_SETS) if (@available(macOS 15.0, *)) { - if (!ctx_dev->has_residency_sets) { - return true; - } - MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init]; desc.label = @"ggml_backend_metal"; desc.initialCapacity = ctx->n_buffers; @@ -1094,6 +1092,7 @@ static bool ggml_backend_metal_buffer_rset_init( return true; } #else + GGML_UNUSED(ctx_dev); GGML_UNUSED(device); #endif @@ -4278,19 +4277,19 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { memset((char *)tensor->data + offset, value, size); - UNUSED(buffer); + GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); - UNUSED(buffer); + GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); - UNUSED(buffer); + GGML_UNUSED(buffer); } static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { @@ -4300,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c } return false; - UNUSED(buffer); + GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -4326,7 +4325,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal"; - UNUSED(buft); + GGML_UNUSED(buft); } static void ggml_backend_metal_log_allocated_size(id device, size_t size_aligned) { @@ -4350,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s } #endif #endif - UNUSED(device); - UNUSED(size_aligned); + GGML_UNUSED(device); + GGML_UNUSED(size_aligned); } static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -4406,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; - UNUSED(buft); + GGML_UNUSED(buft); } static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { @@ -4416,13 +4415,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty return max_size; - UNUSED(buft); + GGML_UNUSED(buft); } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; - UNUSED(buft); + GGML_UNUSED(buft); } ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { @@ -4445,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal_Mapped"; - UNUSED(buft); + GGML_UNUSED(buft); } static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) { @@ -4557,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; - UNUSED(backend); + GGML_UNUSED(backend); } static void ggml_backend_metal_free(ggml_backend_t backend) { @@ -4882,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; - UNUSED(dev); + GGML_UNUSED(dev); } static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {