Skip to content

Commit a599006

Browse files
ggml_cuda_set_device
1 parent bd79c94 commit a599006

File tree

1 file changed

+28
-15
lines changed

1 file changed

+28
-15
lines changed

ggml-cuda.cu

Lines changed: 28 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -409,6 +409,19 @@ struct ggml_tensor_extra_gpu {
409409
cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
410410
};
411411

412+
// this is faster on Windows
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
//
// Set the active CUDA device, skipping the (comparatively expensive) driver
// call when the requested device is already current.
//
// device: index of the device to activate (0 <= device < g_device_count — TODO confirm callers guarantee this)
// returns: cudaSuccess, or the error from cudaGetDevice/cudaSetDevice.
//          Errors are propagated rather than aborted on, since this helper's
//          contract is to return a cudaError_t; call sites that require hard
//          failure wrap the call in CUDA_CHECK.
inline cudaError_t ggml_cuda_set_device(const int device) {
    int current_device;

    // Propagate query failures to the caller instead of aborting inside the
    // helper — keeps error handling at the call site, consistent with how the
    // previous bare cudaSetDevice() calls were (or were not) checked.
    const cudaError_t err = cudaGetDevice(&current_device);
    if (err != cudaSuccess) {
        return err;
    }

    // Already on the requested device: avoid the redundant driver round-trip.
    if (device == current_device) {
        return cudaSuccess;
    }

    return cudaSetDevice(device);
}
424+
412425
static int g_device_count = -1;
413426
static int g_main_device = 0;
414427
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -5151,7 +5164,7 @@ void ggml_init_cublas() {
51515164
}
51525165

51535166
for (int64_t id = 0; id < g_device_count; ++id) {
5154-
CUDA_CHECK(cudaSetDevice(id));
5167+
CUDA_CHECK(ggml_cuda_set_device(id));
51555168

51565169
// create cuda streams
51575170
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
@@ -5795,7 +5808,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
57955808
size_t src1_asf = 0;
57965809
size_t dst_asf = 0;
57975810

5798-
cudaSetDevice(g_main_device);
5811+
ggml_cuda_set_device(g_main_device);
57995812
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
58005813

58015814
if (src0_on_device) {
@@ -5940,7 +5953,7 @@ static void ggml_cuda_op_mul_mat(
59405953
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
59415954
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59425955

5943-
cudaSetDevice(id);
5956+
ggml_cuda_set_device(id);
59445957
const cudaStream_t stream = g_cudaStreams[id][0];
59455958

59465959
if (src0_on_device && src0_is_contiguous) {
@@ -5976,7 +5989,7 @@ static void ggml_cuda_op_mul_mat(
59765989
// if multiple devices are used they need to wait for the main device
59775990
// here an event is recorded that signals that the main device has finished calculating the input data
59785991
if (split && g_device_count > 1) {
5979-
CUDA_CHECK(cudaSetDevice(g_main_device));
5992+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
59805993
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
59815994
}
59825995

@@ -5994,7 +6007,7 @@ static void ggml_cuda_op_mul_mat(
59946007
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59956008
const int64_t row_diff = row_high[id] - row_low[id];
59966009

5997-
cudaSetDevice(id);
6010+
ggml_cuda_set_device(id);
59986011
const cudaStream_t stream = g_cudaStreams[id][is];
59996012

60006013
// wait for main GPU data if necessary
@@ -6096,7 +6109,7 @@ static void ggml_cuda_op_mul_mat(
60966109
}
60976110

60986111
for (int64_t id = 0; id < g_device_count; ++id) {
6099-
CUDA_CHECK(cudaSetDevice(id));
6112+
CUDA_CHECK(ggml_cuda_set_device(id));
61006113

61016114
// free buffers again when done
61026115
if (src0_as[id] > 0) {
@@ -6118,7 +6131,7 @@ static void ggml_cuda_op_mul_mat(
61186131
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
61196132
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
61206133

6121-
CUDA_CHECK(cudaSetDevice(g_main_device));
6134+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
61226135
for (int64_t id = 0; id < g_device_count; ++id) {
61236136
for (int64_t is = 0; is < is_max; ++is) {
61246137
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
@@ -6127,7 +6140,7 @@ static void ggml_cuda_op_mul_mat(
61276140
}
61286141

61296142
if (dst->backend == GGML_BACKEND_CPU) {
6130-
CUDA_CHECK(cudaSetDevice(g_main_device));
6143+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
61316144
CUDA_CHECK(cudaDeviceSynchronize());
61326145
}
61336146
}
@@ -6187,7 +6200,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
61876200

61886201
const int64_t ne12 = src1->ne[2];
61896202

6190-
CUDA_CHECK(cudaSetDevice(g_main_device));
6203+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
61916204
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
61926205

61936206
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6218,7 +6231,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
62186231
const int64_t nb01 = src0->nb[1];
62196232
const int64_t nb02 = src0->nb[2];
62206233

6221-
CUDA_CHECK(cudaSetDevice(g_main_device));
6234+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
62226235
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
62236236

62246237
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6310,7 +6323,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
63106323
const int64_t nb11 = src1->nb[1];
63116324
const int64_t nb12 = src1->nb[2];
63126325

6313-
CUDA_CHECK(cudaSetDevice(g_main_device));
6326+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
63146327
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
63156328

63166329
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -6376,7 +6389,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
63766389
continue;
63776390
}
63786391

6379-
cudaSetDevice(id);
6392+
ggml_cuda_set_device(id);
63806393

63816394
int64_t row_low, row_high;
63826395
if (backend == GGML_BACKEND_GPU) {
@@ -6446,13 +6459,13 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
64466459

64476460
for (int64_t id = 0; id < g_device_count; ++id) {
64486461
if (extra->data_device[id] != nullptr) {
6449-
CUDA_CHECK(cudaSetDevice(id));
6462+
CUDA_CHECK(ggml_cuda_set_device(id));
64506463
CUDA_CHECK(cudaFree(extra->data_device[id]));
64516464
}
64526465

64536466
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
64546467
if (extra->events[id][is] != nullptr) {
6455-
CUDA_CHECK(cudaSetDevice(id));
6468+
CUDA_CHECK(ggml_cuda_set_device(id));
64566469
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
64576470
}
64586471
}
@@ -6506,7 +6519,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
65066519
force_inplace;
65076520
const size_t size = ggml_nbytes(tensor);
65086521

6509-
CUDA_CHECK(cudaSetDevice(g_main_device));
6522+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
65106523
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
65116524
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
65126525
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];

0 commit comments

Comments
 (0)