@@ -409,6 +409,21 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    current_device = device;
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -5151,7 +5166,7 @@ void ggml_init_cublas() {
         }
 
         for (int64_t id = 0; id < g_device_count; ++id) {
-            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(ggml_cuda_set_device(id));
 
             // create cuda streams
             for (int64_t is = 0; is < MAX_STREAMS; ++is) {
@@ -5795,7 +5810,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     size_t src1_asf = 0;
     size_t dst_asf = 0;
 
-    cudaSetDevice(g_main_device);
+    ggml_cuda_set_device(g_main_device);
     const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     if (src0_on_device) {
@@ -5940,7 +5955,7 @@ static void ggml_cuda_op_mul_mat(
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
-        cudaSetDevice(id);
+        ggml_cuda_set_device(id);
         const cudaStream_t stream = g_cudaStreams[id][0];
 
         if (src0_on_device && src0_is_contiguous) {
@@ -5976,7 +5991,7 @@ static void ggml_cuda_op_mul_mat(
     // if multiple devices are used they need to wait for the main device
    // here an event is recorded that signals that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
-        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
@@ -5994,7 +6009,7 @@ static void ggml_cuda_op_mul_mat(
             const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
             const int64_t row_diff = row_high[id] - row_low[id];
 
-            cudaSetDevice(id);
+            ggml_cuda_set_device(id);
             const cudaStream_t stream = g_cudaStreams[id][is];
 
             // wait for main GPU data if necessary
@@ -6096,7 +6111,7 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(cudaSetDevice(id));
+        CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
         if (src0_as[id] > 0) {
@@ -6118,7 +6133,7 @@ static void ggml_cuda_op_mul_mat(
         int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
         is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
 
-        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
@@ -6127,7 +6142,7 @@ static void ggml_cuda_op_mul_mat(
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaDeviceSynchronize());
     }
 }
@@ -6187,7 +6202,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne12 = src1->ne[2];
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6218,7 +6233,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6310,7 +6325,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6376,7 +6391,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             continue;
         }
 
-        cudaSetDevice(id);
+        ggml_cuda_set_device(id);
 
         int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
@@ -6446,13 +6461,13 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
             if (extra->events[id][is] != nullptr) {
-                CUDA_CHECK(cudaSetDevice(id));
+                CUDA_CHECK(ggml_cuda_set_device(id));
                 CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
             }
         }
@@ -6506,7 +6521,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
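The core of this commit is the early-return check in `ggml_cuda_set_device`: query the current device first and only call `cudaSetDevice` when it actually changes. Below is a minimal, self-contained sketch of the same pattern outside of ggml, useful for trying the behaviour in isolation. The `CHECK` macro, `set_device_if_needed`, and `main` are illustrative stand-ins and are not part of this commit.

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Simple error-checking helper (stand-in for ggml's CUDA_CHECK).
#define CHECK(call)                                                      \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                  \
                    cudaGetErrorString(err_), __FILE__, __LINE__);       \
            exit(1);                                                     \
        }                                                                \
    } while (0)

// Same logic as ggml_cuda_set_device in the diff: read the current device
// and return early if it already matches, so cudaSetDevice is only invoked
// on an actual device switch.
static cudaError_t set_device_if_needed(int device) {
    int current_device;
    CHECK(cudaGetDevice(&current_device));

    if (device == current_device) {
        return cudaSuccess;
    }

    return cudaSetDevice(device);
}

int main() {
    int device_count = 0;
    CHECK(cudaGetDeviceCount(&device_count));

    // Repeatedly "switch" to device 0; after the first call every call
    // returns early without going through cudaSetDevice.
    for (int i = 0; i < 1000; ++i) {
        CHECK(set_device_if_needed(0));
    }

    // A real switch still goes through cudaSetDevice.
    if (device_count > 1) {
        CHECK(set_device_if_needed(1));
    }

    printf("devices: %d\n", device_count);
    return 0;
}
```

`cudaGetDevice` only reads runtime state, so the extra query is cheap relative to a redundant `cudaSetDevice`, which, per the comment in the diff, may drop into the driver on Windows even when the device does not change.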