@@ -699,6 +699,15 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
     ctx->tensor_extras.push_back(extra);

     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        ggml_cuda_set_device(id);
+
+        // Create events on all devices unconditionally even if they don't actually hold any data.
+        // This is because for very small matrices it's possible for the active device to not hold any data.
+        // But in this case the events are still needed to synchronize the other devices.
+        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+        }
+
         int64_t row_low, row_high;
         get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

@@ -717,7 +726,6 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf

         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_cuda_set_device(id);
         char * buf;
         CUDA_CHECK(ggml_cuda_device_malloc((void **)&buf, size, id));

@@ -727,10 +735,6 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
         }

         extra->data_device[id] = buf;
-
-        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
-        }
     }

     tensor->extra = extra;
 }
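
For context on why every device needs its own events: with cudaStreamWaitEvent() a stream on one GPU can be made to wait on an event recorded on another GPU, so even a device that ends up holding no rows of the split tensor can still participate in synchronization. Below is a minimal standalone sketch of that pattern, not llama.cpp code; the local CUDA_CHECK macro and the fixed device indices 0/1 are illustrative assumptions.

// Sketch: cross-device synchronization via a timing-free CUDA event,
// the pattern the unconditionally created events above enable.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                           \
    do {                                                           \
        cudaError_t err_ = (call);                                 \
        if (err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",           \
                    cudaGetErrorString(err_), __FILE__, __LINE__); \
            exit(1);                                               \
        }                                                          \
    } while (0)

int main() {
    int n_devices = 0;
    CUDA_CHECK(cudaGetDeviceCount(&n_devices));
    if (n_devices < 2) {
        printf("needs at least 2 GPUs, skipping\n");
        return 0;
    }

    // Create a stream and an event on device 0. cudaEventDisableTiming
    // (the same flag used in the diff) skips the timestamp, so the event
    // serves purely as a cheap synchronization point.
    cudaStream_t stream0, stream1;
    cudaEvent_t  event;
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaStreamCreate(&stream0));
    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

    // ... enqueue work on stream0 here, then mark its completion point ...
    CUDA_CHECK(cudaEventRecord(event, stream0));

    // A stream on device 1 can wait on the event recorded on device 0,
    // even if device 1 holds no data itself; this is why the diff creates
    // events on every device unconditionally.
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaStreamCreate(&stream1));
    CUDA_CHECK(cudaStreamWaitEvent(stream1, event, 0));
    CUDA_CHECK(cudaStreamSynchronize(stream1));

    // Cleanup, destroying each stream with its own device current.
    CUDA_CHECK(cudaStreamDestroy(stream1));
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventDestroy(event));
    CUDA_CHECK(cudaStreamDestroy(stream0));
    return 0;
}

Note that moving ggml_cuda_set_device(id) to the top of the loop, as the diff does, also keeps the events associated with the device they belong to, since events are created on whichever device is current.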