@@ -409,6 +409,19 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
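The helper only forwards to cudaSetDevice when the requested device differs from the one already current, so the many redundant calls in the hot paths below collapse into a cheap cudaGetDevice query. A minimal standalone sketch of how the claimed Windows speedup could be checked; the harness, the iteration count, and the locally re-declared guarded_set_device helper are illustrative assumptions, not part of this patch:

// Hypothetical micro-benchmark, not part of the patch: compares a redundant
// cudaSetDevice call against a guarded variant that early-returns when the
// requested device is already current. Iteration count is an arbitrary assumption.
#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>

static cudaError_t guarded_set_device(int device) {
    int current_device;
    cudaGetDevice(&current_device);
    if (device == current_device) {
        return cudaSuccess; // skip the driver call when nothing would change
    }
    return cudaSetDevice(device);
}

int main() {
    cudaSetDevice(0); // make device 0 current once up front

    const int n_iters = 1000000;

    const auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < n_iters; ++i) {
        cudaSetDevice(0);          // redundant call every iteration
    }
    const auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < n_iters; ++i) {
        guarded_set_device(0);     // early-returns after cudaGetDevice
    }
    const auto t2 = std::chrono::steady_clock::now();

    const auto us = [](auto a, auto b) {
        return (long long) std::chrono::duration_cast<std::chrono::microseconds>(b - a).count();
    };
    printf("cudaSetDevice        x %d: %lld us\n", n_iters, us(t0, t1));
    printf("guarded_set_device   x %d: %lld us\n", n_iters, us(t1, t2));
    return 0;
}

The guarded path still pays for one cudaGetDevice per call, which the patch comment implies is much cheaper on Windows than letting cudaSetDevice go through to the driver unconditionally.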
@@ -5151,7 +5164,7 @@ void ggml_init_cublas() {
 }
 
 for (int64_t id = 0; id < g_device_count; ++id) {
-    CUDA_CHECK(cudaSetDevice(id));
+    CUDA_CHECK(ggml_cuda_set_device(id));
 
     // create cuda streams
     for (int64_t is = 0; is < MAX_STREAMS; ++is) {
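The body of that inner loop is not shown in the hunk. As a rough, assumed sketch of the kind of per-device stream setup it performs (the cudaStreamNonBlocking flag, array sizes, and helper name are illustrative guesses, not code from this patch):

// Assumed sketch: create several non-blocking streams on a given device.
#include <cuda_runtime.h>

#define N_DEVICES   16           // assumed upper bound for illustration
#define N_STREAMS    8           // assumed streams per device

static cudaStream_t streams[N_DEVICES][N_STREAMS]; // [device][stream]

void create_streams_for_device(int device, int n_streams) {
    cudaSetDevice(device);       // streams are created on the current device
    for (int is = 0; is < n_streams; ++is) {
        // non-blocking streams do not implicitly synchronize with the legacy default stream
        cudaStreamCreateWithFlags(&streams[device][is], cudaStreamNonBlocking);
    }
}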
@@ -5795,7 +5808,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
 size_t src1_asf = 0;
 size_t dst_asf = 0;
 
-cudaSetDevice(g_main_device);
+ggml_cuda_set_device(g_main_device);
 const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
 if (src0_on_device) {
@@ -5940,7 +5953,7 @@ static void ggml_cuda_op_mul_mat(
 const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
-cudaSetDevice(id);
+ggml_cuda_set_device(id);
 const cudaStream_t stream = g_cudaStreams[id][0];
 
 if (src0_on_device && src0_is_contiguous) {
@@ -5976,7 +5989,7 @@ static void ggml_cuda_op_mul_mat(
 // if multiple devices are used they need to wait for the main device
 // here an event is recorded that signals that the main device has finished calculating the input data
 if (split && g_device_count > 1) {
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
 }
 
@@ -5994,7 +6007,7 @@ static void ggml_cuda_op_mul_mat(
 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 const int64_t row_diff = row_high[id] - row_low[id];
 
-cudaSetDevice(id);
+ggml_cuda_set_device(id);
 const cudaStream_t stream = g_cudaStreams[id][is];
 
 // wait for main GPU data if necessary
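The two hunks above form a record/wait handshake: the main device records an event on its stream once the shared input is ready, and each secondary device makes its own stream wait on that event before consuming the data (further down, the same pattern runs in reverse when the main stream waits on the secondary devices' events). A standalone sketch of that handshake under assumed device and stream arguments; the function name and the flag choice are illustrative, not the patch's code:

// Hypothetical sketch of the cross-device event handshake used above.
#include <cuda_runtime.h>

// Producer (main device) signals on `ready`; consumer (other device) waits on it
// before enqueueing work that reads the producer's buffers.
void wait_for_main_device(int main_device, cudaStream_t main_stream,
                          int other_device, cudaStream_t other_stream,
                          cudaEvent_t ready /* created once, e.g. with cudaEventDisableTiming */) {
    cudaSetDevice(main_device);
    cudaEventRecord(ready, main_stream);         // marker: input data is ready on the main device

    cudaSetDevice(other_device);
    cudaStreamWaitEvent(other_stream, ready, 0); // consumer stream blocks until the marker completes
    // kernels launched on other_stream after this point see the main device's data
}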
@@ -6096,7 +6109,7 @@ static void ggml_cuda_op_mul_mat(
 }
 
 for (int64_t id = 0; id < g_device_count; ++id) {
-    CUDA_CHECK(cudaSetDevice(id));
+    CUDA_CHECK(ggml_cuda_set_device(id));
 
     // free buffers again when done
     if (src0_as[id] > 0) {
@@ -6118,7 +6131,7 @@ static void ggml_cuda_op_mul_mat(
 int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
 is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
 
-CUDA_CHECK(cudaSetDevice(g_main_device));
+CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 for (int64_t id = 0; id < g_device_count; ++id) {
     for (int64_t is = 0; is < is_max; ++is) {
         CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
@@ -6127,7 +6140,7 @@ static void ggml_cuda_op_mul_mat(
 }
 
 if (dst->backend == GGML_BACKEND_CPU) {
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 }
@@ -6187,7 +6200,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 
 const int64_t ne12 = src1->ne[2];
 
-CUDA_CHECK(cudaSetDevice(g_main_device));
+CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6218,7 +6231,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
 const int64_t nb01 = src0->nb[1];
 const int64_t nb02 = src0->nb[2];
 
-CUDA_CHECK(cudaSetDevice(g_main_device));
+CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6310,7 +6323,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 const int64_t nb11 = src1->nb[1];
 const int64_t nb12 = src1->nb[2];
 
-CUDA_CHECK(cudaSetDevice(g_main_device));
+CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
 const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6376,7 +6389,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     continue;
 }
 
-cudaSetDevice(id);
+ggml_cuda_set_device(id);
 
 int64_t row_low, row_high;
 if (backend == GGML_BACKEND_GPU) {
@@ -6446,13 +6459,13 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
 for (int64_t id = 0; id < g_device_count; ++id) {
     if (extra->data_device[id] != nullptr) {
-        CUDA_CHECK(cudaSetDevice(id));
+        CUDA_CHECK(ggml_cuda_set_device(id));
         CUDA_CHECK(cudaFree(extra->data_device[id]));
     }
 
     for (int64_t is = 0; is < MAX_STREAMS; ++is) {
         if (extra->events[id][is] != nullptr) {
-            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
         }
     }
@@ -6506,7 +6519,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     force_inplace;
 const size_t size = ggml_nbytes(tensor);
 
-CUDA_CHECK(cudaSetDevice(g_main_device));
+CUDA_CHECK(ggml_cuda_set_device(g_main_device));
 if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) tensor->src[0]->extra;
     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];