19
19
// max number of MTLCommandBuffer used to submit a graph for processing
20
20
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
21
21
22
+ // create residency sets only on macOS >= 15.0
23
+ #if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24
+ #define GGML_METAL_HAS_RESIDENCY_SETS 1
25
+ #endif
26
+
22
27
#define UNUSED(x) (void)(x)
23
28
24
29
// globals
39
44
40
45
bool has_simdgroup_reduction;
41
46
bool has_simdgroup_mm;
47
+ bool has_residency_sets;
42
48
bool has_bfloat;
43
49
bool use_bfloat;
44
50
48
54
/*.mtl_device_ref_count =*/ 0,
49
55
/*.has_simdgroup_reduction =*/ false,
50
56
/*.has_simdgroup_mm =*/ false,
57
+ /*.has_residency_sets =*/ false,
51
58
/*.has_bfloat =*/ false,
52
59
/*.use_bfloat =*/ false,
53
60
/*.name =*/ "",
65
72
66
73
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
67
74
75
+ #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
76
+ ctx->has_residency_sets = true;
77
+ #endif
78
+
68
79
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
69
80
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
70
81
@@ -483,6 +494,11 @@ @implementation GGMLMetalClass
483
494
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
484
495
485
496
ctx->queue = [device newCommandQueue];
497
+ if (ctx->queue == nil) {
498
+ GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
499
+ return NULL;
500
+ }
501
+
486
502
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
487
503
488
504
id<MTLLibrary> metal_library;
@@ -649,6 +665,7 @@ @implementation GGMLMetalClass
649
665
650
666
GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
651
667
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
668
+ GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
652
669
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
653
670
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
654
671
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -1035,8 +1052,60 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
1035
1052
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
1036
1053
int n_buffers;
1037
1054
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1055
+
1056
+ // optional MTLResidencySet
1057
+ id rset;
1038
1058
};
1039
1059
1060
// rset init
//
// Creates an optional MTLResidencySet for the buffer context, adds every Metal buffer in
// ctx->buffers[0 .. n_buffers) to it, commits the set and requests residency.
//
//   ctx    - buffer context; ctx->rset is written (nil when no residency set is created)
//   device - Metal device used to create the residency set
//
// Returns false only if creating the residency set fails. Returns true when the set was
// created, and also when residency sets are compiled out or not available at runtime
// (in which case ctx->rset stays nil).
static bool ggml_backend_metal_buffer_rset_init(struct ggml_backend_metal_buffer_context * ctx, id<MTLDevice> device) {
    // start from a well-defined state so that rset_free is safe after any return path
    ctx->rset = nil;

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label           = @"ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // must be initialized: the callee is only guaranteed to write it on failure
        NSError * error = nil;
        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is only needed for creation; this file uses manual retain/release
        [desc release];

        // Cocoa convention: the returned object signals success, not the error pointer
        if (ctx->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            [ctx->rset addAllocation:ctx->buffers[i].metal];
        }

        // pending additions only take effect after commit
        [ctx->rset commit];
        [ctx->rset requestResidency];

        return true;
    }
#else
    GGML_UNUSED(device);
#endif

    return true;
}
1093
+
1094
// rset free
//
// Counterpart of ggml_backend_metal_buffer_rset_init: if the context holds a residency set,
// ends residency, removes all of its allocations and releases the set.
// Does nothing when ctx->rset is nil or when residency sets are compiled out.
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        id residency_set = ctx->rset;

        // nothing was created for this context
        if (!residency_set) {
            return;
        }

        [residency_set endResidency];
        [residency_set removeAllAllocations];
        [residency_set release];
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1108
+
1040
1109
// finds the Metal buffer that contains the tensor data on the GPU device
1041
1110
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
1042
1111
// Metal buffer based on the host memory pointer
@@ -4086,7 +4155,7 @@ static enum ggml_status ggml_metal_graph_compute(
4086
4155
// the main thread commits the first few commands immediately
4087
4156
// command_buffer[n_cb]
4088
4157
{
4089
- id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4158
+ id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer ];
4090
4159
ctx->command_buffers[n_cb] = command_buffer;
4091
4160
4092
4161
[command_buffer enqueue];
@@ -4096,7 +4165,7 @@ static enum ggml_status ggml_metal_graph_compute(
4096
4165
// prepare the rest of the command buffers asynchronously
4097
4166
// command_buffer[0.. n_cb)
4098
4167
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
4099
- id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4168
+ id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer ];
4100
4169
ctx->command_buffers[cb_idx] = command_buffer;
4101
4170
4102
4171
// always enqueue the first two command buffers
@@ -4176,6 +4245,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4176
4245
for (int i = 0; i < ctx->n_buffers; i++) {
4177
4246
[ctx->buffers[i].metal release];
4178
4247
}
4248
+
4249
+ ggml_backend_metal_buffer_rset_free(ctx);
4179
4250
ggml_backend_metal_device_rel(buffer->buft->device->context);
4180
4251
4181
4252
if (ctx->owned) {
@@ -4284,7 +4355,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4284
4355
size_aligned += (size_page - (size_aligned % size_page));
4285
4356
}
4286
4357
4287
- id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
4358
+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
4359
+ id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
4288
4360
4289
4361
ctx->all_data = ggml_metal_host_malloc(size_aligned);
4290
4362
ctx->all_size = size_aligned;
@@ -4307,7 +4379,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4307
4379
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
4308
4380
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
4309
4381
free(ctx);
4310
- ggml_backend_metal_device_rel(buft->device->context);
4382
+ ggml_backend_metal_device_rel(ctx_dev);
4383
+ return NULL;
4384
+ }
4385
+
4386
+ if (!ggml_backend_metal_buffer_rset_init(ctx, device)) {
4387
+ GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4388
+ free(ctx);
4389
+ ggml_backend_metal_device_rel(ctx_dev);
4311
4390
return NULL;
4312
4391
}
4313
4392
@@ -4400,7 +4479,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4400
4479
size_aligned += (size_page - (size_aligned % size_page));
4401
4480
}
4402
4481
4403
- id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
4482
+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4483
+ id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
4404
4484
4405
4485
// the buffer fits into the max buffer size allowed by the device
4406
4486
if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4533,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4453
4533
}
4454
4534
}
4455
4535
4536
+ if (!ggml_backend_metal_buffer_rset_init(ctx, device)) {
4537
+ GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4538
+ free(ctx);
4539
+ ggml_backend_metal_device_rel(ctx_dev);
4540
+ return NULL;
4541
+ }
4542
+
4456
4543
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
4457
4544
}
4458
4545
@@ -4766,6 +4853,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
4766
4853
}
4767
4854
}
4768
4855
4856
+ if (!ggml_backend_metal_buffer_rset_init(ctx, device)) {
4857
+ GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4858
+ free(ctx);
4859
+ ggml_backend_metal_device_rel(ctx_dev);
4860
+ return NULL;
4861
+ }
4862
+
4769
4863
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
4770
4864
}
4771
4865
0 commit comments