19
19
// max number of MTLCommandBuffer used to submit a graph for processing
20
20
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
21
21
22
+ #define GGML_METAL_MAX_RESIDENCY_SETS 128
23
+
22
24
#define UNUSED(x) (void)(x)
23
25
24
26
// globals
37
39
id<MTLDevice> mtl_device;
38
40
int mtl_device_ref_count;
39
41
42
+ id<MTLResidencySet> mtl_residency_set[GGML_METAL_MAX_RESIDENCY_SETS];
43
+ int mtl_residency_set_n;
44
+
40
45
bool has_simdgroup_reduction;
41
46
bool has_simdgroup_mm;
42
47
bool has_bfloat;
46
51
} g_ggml_ctx_dev_main = {
47
52
/*.mtl_device =*/ nil,
48
53
/*.mtl_device_ref_count =*/ 0,
54
+ /*.mtl_residency_set =*/ { nil },
55
+ /*.mtl_residency_set_n =*/ 0,
49
56
/*.has_simdgroup_reduction =*/ false,
50
57
/*.has_simdgroup_mm =*/ false,
51
58
/*.has_bfloat =*/ false,
@@ -95,6 +102,42 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
95
102
}
96
103
}
97
104
105
// add residency set
//
// Appends `residency_set` to the list of residency sets tracked in the device context `ctx`.
//
// Returns true when the set was stored at the end of the list.
// Returns false, after logging an error, when the list already holds
// GGML_METAL_MAX_RESIDENCY_SETS entries; the list is left unchanged in that case.
static bool ggml_backend_metal_device_add_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    // check the argument this function actually receives
    assert(residency_set != nil);

    // the list is a fixed-size array - refuse to write past its end
    if (ctx->mtl_residency_set_n >= GGML_METAL_MAX_RESIDENCY_SETS) {
        GGML_LOG_ERROR("%s: warning: maximum number of residency sets reached\n", __func__);
        return false;
    }

    // store in the next free slot and bump the count
    ctx->mtl_residency_set[ctx->mtl_residency_set_n++] = residency_set;

    return true;
}
119
+
120
// remove residency set
//
// Looks for `residency_set` in the list tracked by the device context `ctx`. When the
// first matching entry is found, every later entry is moved one slot toward the front to
// close the gap and the count is decremented.
//
// Returns true if an entry was removed, false if the set is not in the list.
static bool ggml_backend_metal_device_remove_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    assert(residency_set != nil);

    // locate the first entry that matches (pointer comparison)
    int pos = 0;
    while (pos < ctx->mtl_residency_set_n && ctx->mtl_residency_set[pos] != residency_set) {
        pos++;
    }

    // ran off the end of the used part of the list - nothing to remove
    if (pos == ctx->mtl_residency_set_n) {
        return false;
    }

    // number of entries that remain once the match is dropped
    const int n_left = ctx->mtl_residency_set_n - 1;

    // slide the tail down by one slot, overwriting the removed entry
    for (int k = pos + 1; k <= n_left; ++k) {
        ctx->mtl_residency_set[k - 1] = ctx->mtl_residency_set[k];
    }

    ctx->mtl_residency_set_n = n_left;

    return true;
}
140
+
98
141
// kernels
99
142
100
143
struct ggml_metal_kernel {
@@ -483,6 +526,11 @@ @implementation GGMLMetalClass
483
526
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
484
527
485
528
ctx->queue = [device newCommandQueue];
529
+ if (ctx->queue == nil) {
530
+ GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
531
+ return NULL;
532
+ }
533
+
486
534
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
487
535
488
536
id<MTLLibrary> metal_library;
@@ -1035,6 +1083,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
1035
1083
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
1036
1084
int n_buffers;
1037
1085
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1086
+
1087
+ id<MTLResidencySet> residency_set;
1038
1088
};
1039
1089
1040
1090
// finds the Metal buffer that contains the tensor data on the GPU device
@@ -4039,6 +4089,20 @@ static enum ggml_status ggml_metal_graph_compute(
4039
4089
struct ggml_backend_metal_context * ctx = backend->context;
4040
4090
struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
4041
4091
4092
+ static bool is_first = true;
4093
+ if (is_first) {
4094
+ is_first = false;
4095
+ GGML_LOG_INFO("%s: adding %d residency sets\n", __func__, ctx_dev->mtl_residency_set_n);
4096
+ [ctx->queue addResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
4097
+ }
4098
+
4099
+ //for (int i = 0; i < ctx_dev->mtl_residency_set_n; ++i) {
4100
+ // GGML_LOG_INFO("%s: residency set %d allocations size = %zu\n", __func__, i, [ctx_dev->mtl_residency_set[i] allocatedSize]);
4101
+ // [ctx_dev->mtl_residency_set[i] requestResidency];
4102
+ //}
4103
+
4104
+ int64_t t_start_us = ggml_time_us();
4105
+
4042
4106
// number of nodes encoded by the main thread (empirically determined)
4043
4107
const int n_main = 128;
4044
4108
@@ -4086,19 +4150,23 @@ static enum ggml_status ggml_metal_graph_compute(
4086
4150
// the main thread commits the first few commands immediately
4087
4151
// command_buffer[n_cb]
4088
4152
{
4089
- id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4153
+ id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer ];
4090
4154
ctx->command_buffers[n_cb] = command_buffer;
4091
4155
4156
+ [command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
4157
+
4092
4158
[command_buffer enqueue];
4093
4159
ctx->encode_async(n_cb);
4094
4160
}
4095
4161
4096
4162
// prepare the rest of the command buffers asynchronously
4097
4163
// command_buffer[0.. n_cb)
4098
4164
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
4099
- id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4165
+ id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer ];
4100
4166
ctx->command_buffers[cb_idx] = command_buffer;
4101
4167
4168
+ [command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
4169
+
4102
4170
// always enqueue the first two command buffers
4103
4171
// enqueue all of the command buffers if we don't need to abort
4104
4172
if (cb_idx < 2 || ctx->abort_callback == NULL) {
@@ -4163,6 +4231,10 @@ static enum ggml_status ggml_metal_graph_compute(
4163
4231
}
4164
4232
}
4165
4233
4234
+ int64_t t_end_us = ggml_time_us();
4235
+
4236
+ GGML_LOG_DEBUG("%s: compute graph took %8.2f ms\n", __func__, (t_end_us - t_start_us) / 1000.0);
4237
+
4166
4238
return GGML_STATUS_SUCCESS;
4167
4239
}
4168
4240
@@ -4176,6 +4248,13 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4176
4248
for (int i = 0; i < ctx->n_buffers; i++) {
4177
4249
[ctx->buffers[i].metal release];
4178
4250
}
4251
+
4252
+ ggml_backend_metal_device_remove_residency_set(buffer->buft->device->context, ctx->residency_set);
4253
+
4254
+ [ctx->residency_set endResidency];
4255
+ [ctx->residency_set removeAllAllocations];
4256
+ [ctx->residency_set release];
4257
+
4179
4258
ggml_backend_metal_device_rel(buffer->buft->device->context);
4180
4259
4181
4260
if (ctx->owned) {
@@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4284
4363
size_aligned += (size_page - (size_aligned % size_page));
4285
4364
}
4286
4365
4287
- id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
4366
+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
4367
+ id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
4288
4368
4289
4369
ctx->all_data = ggml_metal_host_malloc(size_aligned);
4290
4370
ctx->all_size = size_aligned;
@@ -4307,10 +4387,34 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4307
4387
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
4308
4388
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
4309
4389
free(ctx);
4310
- ggml_backend_metal_device_rel(buft->device->context );
4390
+ ggml_backend_metal_device_rel(ctx_dev );
4311
4391
return NULL;
4312
4392
}
4313
4393
4394
+ {
4395
+ MTLResidencySetDescriptor * desc;
4396
+ desc = [[MTLResidencySetDescriptor alloc] init];
4397
+ desc.label = @"Primary residency set";
4398
+ desc.initialCapacity = ctx->n_buffers;
4399
+
4400
+ NSError *error;
4401
+ ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
4402
+ if (error) {
4403
+ GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
4404
+ return NULL;
4405
+ }
4406
+
4407
+ for (int i = 0; i < ctx->n_buffers; i++) {
4408
+ [ctx->residency_set addAllocation:ctx->buffers[i].metal];
4409
+ }
4410
+
4411
+ [ctx->residency_set commit];
4412
+ [ctx->residency_set requestResidency];
4413
+
4414
+ // track the residency set in the device context
4415
+ ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
4416
+ }
4417
+
4314
4418
//ggml_backend_metal_log_allocated_size(device, size_aligned);
4315
4419
4316
4420
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
@@ -4400,7 +4504,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4400
4504
size_aligned += (size_page - (size_aligned % size_page));
4401
4505
}
4402
4506
4403
- id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
4507
+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4508
+ id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
4404
4509
4405
4510
// the buffer fits into the max buffer size allowed by the device
4406
4511
if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4558,30 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4453
4558
}
4454
4559
}
4455
4560
4561
+ {
4562
+ MTLResidencySetDescriptor * desc;
4563
+ desc = [[MTLResidencySetDescriptor alloc] init];
4564
+ desc.label = @"Primary residency set";
4565
+ desc.initialCapacity = ctx->n_buffers;
4566
+
4567
+ NSError *error;
4568
+ ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
4569
+ if (error) {
4570
+ GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
4571
+ return NULL;
4572
+ }
4573
+
4574
+ for (int i = 0; i < ctx->n_buffers; i++) {
4575
+ [ctx->residency_set addAllocation:ctx->buffers[i].metal];
4576
+ }
4577
+
4578
+ [ctx->residency_set commit];
4579
+ [ctx->residency_set requestResidency];
4580
+
4581
+ // track the residency set in the device context
4582
+ ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
4583
+ }
4584
+
4456
4585
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
4457
4586
}
4458
4587
@@ -4766,6 +4895,30 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
4766
4895
}
4767
4896
}
4768
4897
4898
+ {
4899
+ MTLResidencySetDescriptor * desc;
4900
+ desc = [[MTLResidencySetDescriptor alloc] init];
4901
+ desc.label = @"Primary residency set";
4902
+ desc.initialCapacity = ctx->n_buffers;
4903
+
4904
+ NSError *error;
4905
+ ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
4906
+ if (error) {
4907
+ GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
4908
+ return NULL;
4909
+ }
4910
+
4911
+ for (int i = 0; i < ctx->n_buffers; i++) {
4912
+ [ctx->residency_set addAllocation:ctx->buffers[i].metal];
4913
+ }
4914
+
4915
+ [ctx->residency_set commit];
4916
+ [ctx->residency_set requestResidency];
4917
+
4918
+ // track the residency set in the device context
4919
+ ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
4920
+ }
4921
+
4769
4922
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
4770
4923
}
4771
4924
0 commit comments