@@ -1467,19 +1467,13 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
-#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
-#define GGML_CUDA_MAX_EVENTS 64
-
 static int g_device_count = -1;
 static int g_main_device = 0;
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-
-static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -1503,15 +1497,8 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(cudaSetDevice(id));
 
-        // create streams
-        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
-        }
-        // create events
-        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
-        }
+        // create main stream
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
 
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -1978,6 +1965,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
+    // if multiple GPUs are used they need to wait for the main GPU to finish
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
             continue;
@@ -2076,9 +2069,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             }
             const int64_t i11 = i13*ne12 + i12;
 
-            cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
-            cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-            cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+            cudaStream_t cudaStream_main = g_cudaStreams_main[id];
 
             // for split tensors the data begins at i0 == i0_offset_low
             char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -2106,14 +2097,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (src1->backend == GGML_BACKEND_CPU) {
                     GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                     int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
                 } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                     if (id != g_main_device) {
                         GGML_ASSERT(!flatten_rows);
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                         src1_ddf_i_source += i11*src1_stride;
                         CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                                   cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
+                                                   cudaMemcpyDeviceToDevice, cudaStream_main));
                     }
                 } else if (src1_on_device && !src1_is_contiguous) {
                     GGML_ASSERT(!split);
@@ -2122,7 +2113,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     GGML_ASSERT(false);
                 }
             }
-            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
 
             if (!src0_on_device || !src0_is_contiguous) {
                 if (src0_is_f32) {
@@ -2138,9 +2128,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 CUDA_CHECK(cudaGetLastError());
             }
 
-            // wait with main stream until src1 memcpy is done
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
-
             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
@@ -2178,8 +2165,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
     // wait until each device is finished, then free their buffers
     for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
         CUDA_CHECK(cudaSetDevice(id));
         CUDA_CHECK(cudaDeviceSynchronize());
+
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
         }
@@ -2245,7 +2237,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2257,8 +2249,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2276,7 +2266,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2291,8 +2281,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);
 
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2348,7 +2336,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2366,8 +2354,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }
 
-    CUDA_CHECK(cudaDeviceSynchronize());
-
     (void) dst;
 }