@@ -409,6 +409,16 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
+cudaError_t ggml_cuda_set_device(int device) {
+    static int current_device = -1;
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -5151,7 +5161,7 @@ void ggml_init_cublas() {
         }
 
         for (int64_t id = 0; id < g_device_count; ++id) {
-            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(ggml_cuda_set_device(id));
 
             // create cuda streams
             for (int64_t is = 0; is < MAX_STREAMS; ++is) {
@@ -5795,7 +5805,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     size_t src1_asf = 0;
     size_t dst_asf = 0;
 
-    cudaSetDevice(g_main_device);
+    ggml_cuda_set_device(g_main_device);
     const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     if (src0_on_device) {
@@ -5940,7 +5950,7 @@ static void ggml_cuda_op_mul_mat(
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
-        cudaSetDevice(id);
+        ggml_cuda_set_device(id);
         const cudaStream_t stream = g_cudaStreams[id][0];
 
         if (src0_on_device && src0_is_contiguous) {
@@ -5976,7 +5986,7 @@ static void ggml_cuda_op_mul_mat(
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
-        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
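The comments in the hunk above describe the multi-GPU handshake this file relies on: the main device records an event on its stream once the shared input data is ready, and the other devices make their streams wait on that event before consuming it. Below is a minimal sketch of that record/wait pattern; it is not ggml code, the function and variable names are illustrative, and error checking is omitted.

```cpp
#include <cuda_runtime.h>

// Sketch only: one producer stream on the main device, one consumer stream on
// another device, ordered through a CUDA event (names are illustrative).
void wait_for_main_device(cudaStream_t main_stream, cudaStream_t other_stream,
                          int main_device, int other_device) {
    cudaEvent_t input_ready;

    cudaSetDevice(main_device);
    cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);
    // ... kernels that produce the shared input are enqueued on main_stream ...
    cudaEventRecord(input_ready, main_stream);           // "input data is finished"

    cudaSetDevice(other_device);
    cudaStreamWaitEvent(other_stream, input_ready, 0);   // stalls other_stream, not the CPU
    // ... kernels that read the shared input are enqueued on other_stream ...

    cudaEventDestroy(input_ready);   // resources are released once the pending event completes
}
```

Creating the event with cudaEventDisableTiming keeps the synchronization cheap, since no timestamps are taken; the event is used purely for ordering.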
@@ -5994,7 +6004,7 @@ static void ggml_cuda_op_mul_mat(
             const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
             const int64_t row_diff = row_high[id] - row_low[id];
 
-            cudaSetDevice(id);
+            ggml_cuda_set_device(id);
             const cudaStream_t stream = g_cudaStreams[id][is];
 
             // wait for main GPU data if necessary
@@ -6096,7 +6106,7 @@ static void ggml_cuda_op_mul_mat(
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(cudaSetDevice(id));
+        CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
         if (src0_as[id] > 0) {
@@ -6118,7 +6128,7 @@ static void ggml_cuda_op_mul_mat(
         int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
         is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
 
-        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
@@ -6127,7 +6137,7 @@ static void ggml_cuda_op_mul_mat(
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaDeviceSynchronize());
     }
 }
@@ -6187,7 +6197,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne12 = src1->ne[2];
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6218,7 +6228,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6310,7 +6320,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6376,7 +6386,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             continue;
         }
 
-        cudaSetDevice(id);
+        ggml_cuda_set_device(id);
 
        int64_t row_low, row_high;
        if (backend == GGML_BACKEND_GPU) {
@@ -6446,13 +6456,13 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
             if (extra->events[id][is] != nullptr) {
-                CUDA_CHECK(cudaSetDevice(id));
+                CUDA_CHECK(ggml_cuda_set_device(id));
                 CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
             }
         }
@@ -6506,7 +6516,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(cudaSetDevice(g_main_device));
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
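Taken together, the patch routes every device switch through ggml_cuda_set_device so that selecting the GPU that is already active can be skipped. Below is a minimal standalone sketch of that caching idea; it is not the patch itself, the names set_device_cached and the main driver are illustrative, and note that for the cache to ever hit, the helper has to remember which device it last selected (the current_device = device assignment below), since the static starts at -1 and real device indices are non-negative.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Sketch of the device-caching idea (illustrative, not the ggml helper itself):
// remember the last device selected through this helper and skip the runtime
// call when the requested device is already active. Assumes single-threaded use
// and that nothing else calls cudaSetDevice() behind the helper's back.
static cudaError_t set_device_cached(int device) {
    static int current_device = -1;   // -1 = no device selected through this helper yet

    if (device == current_device) {
        return cudaSuccess;           // already active, no runtime round trip
    }

    current_device = device;          // remember the selection so the check above can hit
    return cudaSetDevice(device);
}

int main() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
        std::fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }

    set_device_cached(0);   // first call reaches cudaSetDevice
    set_device_cached(0);   // second call returns immediately from the cache
    std::printf("%d CUDA device(s), device 0 active\n", device_count);
    return 0;
}
```

The per-tensor entry points in the hunks above (ggml_cuda_op_flatten, ggml_cuda_cpy, the mul_mat paths) select g_main_device on every call, so repeated selections of the same device are the common case this helper is meant to short-circuit.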