Skip to content

Commit c923de7

Browse files
ggml_cuda_set_device
1 parent bd79c94 commit c923de7

File tree

1 file changed

+30
-15
lines changed

1 file changed

+30
-15
lines changed

ggml-cuda.cu

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,21 @@ struct ggml_tensor_extra_gpu {
409409
cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
410410
};
411411

412+
// Set the active CUDA device, skipping the runtime call when the requested
// device is already current.
//
// this is faster on Windows
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
//
// Returns cudaSuccess when no switch was needed, otherwise the result of
// cudaSetDevice(device). If querying the current device fails, CUDA_CHECK
// handles the error (it does not return here normally in that case).
inline cudaError_t ggml_cuda_set_device(int device) {
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));

    // already on the requested device -> avoid the (expensive) driver round-trip
    if (device == current_device) {
        return cudaSuccess;
    }

    // NOTE(review): the original also did `current_device = device;` here,
    // a dead store to a local that is never read again — removed.
    return cudaSetDevice(device);
}
426+
412427
static int g_device_count = -1;
413428
static int g_main_device = 0;
414429
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -5151,7 +5166,7 @@ void ggml_init_cublas() {
51515166
}
51525167

51535168
for (int64_t id = 0; id < g_device_count; ++id) {
5154-
CUDA_CHECK(cudaSetDevice(id));
5169+
CUDA_CHECK(ggml_cuda_set_device(id));
51555170

51565171
// create cuda streams
51575172
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
@@ -5795,7 +5810,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
57955810
size_t src1_asf = 0;
57965811
size_t dst_asf = 0;
57975812

5798-
cudaSetDevice(g_main_device);
5813+
ggml_cuda_set_device(g_main_device);
57995814
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
58005815

58015816
if (src0_on_device) {
@@ -5940,7 +5955,7 @@ static void ggml_cuda_op_mul_mat(
59405955
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
59415956
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59425957

5943-
cudaSetDevice(id);
5958+
ggml_cuda_set_device(id);
59445959
const cudaStream_t stream = g_cudaStreams[id][0];
59455960

59465961
if (src0_on_device && src0_is_contiguous) {
@@ -5976,7 +5991,7 @@ static void ggml_cuda_op_mul_mat(
59765991
// if multiple devices are used they need to wait for the main device
59775992
// here an event is recorded that signals that the main device has finished calculating the input data
59785993
if (split && g_device_count > 1) {
5979-
CUDA_CHECK(cudaSetDevice(g_main_device));
5994+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
59805995
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
59815996
}
59825997

@@ -5994,7 +6009,7 @@ static void ggml_cuda_op_mul_mat(
59946009
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59956010
const int64_t row_diff = row_high[id] - row_low[id];
59966011

5997-
cudaSetDevice(id);
6012+
ggml_cuda_set_device(id);
59986013
const cudaStream_t stream = g_cudaStreams[id][is];
59996014

60006015
// wait for main GPU data if necessary
@@ -6096,7 +6111,7 @@ static void ggml_cuda_op_mul_mat(
60966111
}
60976112

60986113
for (int64_t id = 0; id < g_device_count; ++id) {
6099-
CUDA_CHECK(cudaSetDevice(id));
6114+
CUDA_CHECK(ggml_cuda_set_device(id));
61006115

61016116
// free buffers again when done
61026117
if (src0_as[id] > 0) {
@@ -6118,7 +6133,7 @@ static void ggml_cuda_op_mul_mat(
61186133
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
61196134
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
61206135

6121-
CUDA_CHECK(cudaSetDevice(g_main_device));
6136+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
61226137
for (int64_t id = 0; id < g_device_count; ++id) {
61236138
for (int64_t is = 0; is < is_max; ++is) {
61246139
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
@@ -6127,7 +6142,7 @@ static void ggml_cuda_op_mul_mat(
61276142
}
61286143

61296144
if (dst->backend == GGML_BACKEND_CPU) {
6130-
CUDA_CHECK(cudaSetDevice(g_main_device));
6145+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
61316146
CUDA_CHECK(cudaDeviceSynchronize());
61326147
}
61336148
}
@@ -6187,7 +6202,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
61876202

61886203
const int64_t ne12 = src1->ne[2];
61896204

6190-
CUDA_CHECK(cudaSetDevice(g_main_device));
6205+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
61916206
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
61926207

61936208
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6218,7 +6233,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
62186233
const int64_t nb01 = src0->nb[1];
62196234
const int64_t nb02 = src0->nb[2];
62206235

6221-
CUDA_CHECK(cudaSetDevice(g_main_device));
6236+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
62226237
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
62236238

62246239
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6310,7 +6325,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
63106325
const int64_t nb11 = src1->nb[1];
63116326
const int64_t nb12 = src1->nb[2];
63126327

6313-
CUDA_CHECK(cudaSetDevice(g_main_device));
6328+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
63146329
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
63156330

63166331
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6376,7 +6391,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
63766391
continue;
63776392
}
63786393

6379-
cudaSetDevice(id);
6394+
ggml_cuda_set_device(id);
63806395

63816396
int64_t row_low, row_high;
63826397
if (backend == GGML_BACKEND_GPU) {
@@ -6446,13 +6461,13 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
64466461

64476462
for (int64_t id = 0; id < g_device_count; ++id) {
64486463
if (extra->data_device[id] != nullptr) {
6449-
CUDA_CHECK(cudaSetDevice(id));
6464+
CUDA_CHECK(ggml_cuda_set_device(id));
64506465
CUDA_CHECK(cudaFree(extra->data_device[id]));
64516466
}
64526467

64536468
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
64546469
if (extra->events[id][is] != nullptr) {
6455-
CUDA_CHECK(cudaSetDevice(id));
6470+
CUDA_CHECK(ggml_cuda_set_device(id));
64566471
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
64576472
}
64586473
}
@@ -6506,7 +6521,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
65066521
force_inplace;
65076522
const size_t size = ggml_nbytes(tensor);
65086523

6509-
CUDA_CHECK(cudaSetDevice(g_main_device));
6524+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
65106525
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
65116526
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
65126527
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];

0 commit comments

Comments
 (0)