@@ -699,6 +699,15 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
     ctx->tensor_extras.push_back(extra);

     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        ggml_cuda_set_device(id);
+
+        // Create events on all devices unconditionally even if they don't actually hold any data.
+        // This is because for very small matrices it's possible for the active device to not hold any data.
+        // But in this case the events are still needed to synchronize the other devices.
+        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+        }
+
         int64_t row_low, row_high;
         get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);

@@ -717,7 +726,6 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf

         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_cuda_set_device(id);
         char * buf;
         CUDA_CHECK(ggml_cuda_device_malloc((void **)&buf, size, id));

@@ -727,10 +735,6 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
         }

         extra->data_device[id] = buf;
-
-        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
-        }
     }

     tensor->extra = extra;
 }
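
For context on why every device needs its own events: with cudaStreamWaitEvent() a stream on one GPU can be made to wait on an event recorded on another GPU, so even a device that ends up holding no rows of the split tensor can still participate in synchronization. Below is a minimal standalone sketch of that pattern, not llama.cpp code; the local CUDA_CHECK macro and the fixed device indices 0/1 are illustrative assumptions.

// Sketch: cross-device synchronization via a timing-free CUDA event,
// the pattern the unconditionally created events above enable.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                           \
    do {                                                           \
        cudaError_t err_ = (call);                                 \
        if (err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",           \
                    cudaGetErrorString(err_), __FILE__, __LINE__); \
            exit(1);                                               \
        }                                                          \
    } while (0)

int main() {
    int n_devices = 0;
    CUDA_CHECK(cudaGetDeviceCount(&n_devices));
    if (n_devices < 2) {
        printf("needs at least 2 GPUs, skipping\n");
        return 0;
    }

    // Create a stream and an event on device 0. cudaEventDisableTiming
    // (the same flag used in the diff) skips the timestamp, so the event
    // serves purely as a cheap synchronization point.
    cudaStream_t stream0, stream1;
    cudaEvent_t  event;
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaStreamCreate(&stream0));
    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

    // ... enqueue work on stream0 here, then mark its completion point ...
    CUDA_CHECK(cudaEventRecord(event, stream0));

    // A stream on device 1 can wait on the event recorded on device 0,
    // even if device 1 holds no data itself; this is why the diff creates
    // events on every device unconditionally.
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaStreamCreate(&stream1));
    CUDA_CHECK(cudaStreamWaitEvent(stream1, event, 0));
    CUDA_CHECK(cudaStreamSynchronize(stream1));

    // Cleanup, destroying each stream with its own device current.
    CUDA_CHECK(cudaStreamDestroy(stream1));
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaEventDestroy(event));
    CUDA_CHECK(cudaStreamDestroy(stream0));
    return 0;
}

Note that moving ggml_cuda_set_device(id) to the top of the loop, as the diff does, also keeps the events associated with the device they belong to, since events are created on whichever device is current.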