@@ -1306,19 +1306,13 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
-#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
-#define GGML_CUDA_MAX_EVENTS 64
-
 static int g_device_count = -1;
 static int g_main_device = 0;
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-
-static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -1342,15 +1336,8 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(cudaSetDevice(id));
 
-        // create streams
-        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
-        }
-        // create events
-        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
-        }
+        // create main stream
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
 
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -1817,6 +1804,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
+    // if multiple GPUs are used they need to wait for the main GPU to finish
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
             continue;
@@ -1915,9 +1908,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             }
             const int64_t i11 = i13*ne12 + i12;
 
-            cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
-            cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-            cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+            cudaStream_t cudaStream_main = g_cudaStreams_main[id];
 
             // for split tensors the data begins at i0 == i0_offset_low
             char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -1945,14 +1936,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 if (src1->backend == GGML_BACKEND_CPU) {
                     GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                     int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
                 } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                     if (id != g_main_device) {
                         GGML_ASSERT(!flatten_rows);
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                         src1_ddf_i_source += i11*src1_stride;
                         CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                                   cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
+                                                   cudaMemcpyDeviceToDevice, cudaStream_main));
                     }
                 } else if (src1_on_device && !src1_is_contiguous) {
                     GGML_ASSERT(!split);
@@ -1961,7 +1952,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     GGML_ASSERT(false);
                 }
             }
-            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
 
             if (!src0_on_device || !src0_is_contiguous) {
                 if (src0_is_f32) {
@@ -1977,9 +1967,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 CUDA_CHECK(cudaGetLastError());
             }
 
-            // wait with main stream until src1 memcpy is done
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
-
             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
             CUDA_CHECK(cudaGetLastError());
@@ -2017,8 +2004,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
     // wait until each device is finished, then free their buffers
     for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
         CUDA_CHECK(cudaSetDevice(id));
         CUDA_CHECK(cudaDeviceSynchronize());
+
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
         }
@@ -2084,7 +2076,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2096,8 +2088,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2115,7 +2105,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2130,8 +2120,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);
 
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2187,7 +2175,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2205,8 +2193,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }
 
-    CUDA_CHECK(cudaDeviceSynchronize());
-
     (void) dst;
 }
 
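Taken together, these hunks drop the GGML_CUDA_MAX_STREAMS/GGML_CUDA_MAX_EVENTS machinery (per-device arrays of compute streams, src1 memcpy streams, and memcpy events) in favor of a single non-blocking main stream per device. The src1 copies are now enqueued on the same stream as the compute ops, so the cudaEventRecord/cudaStreamWaitEvent handshake and the trailing cudaDeviceSynchronize calls in the mul_mat_vec/cpy helpers are no longer needed; split multi-GPU runs instead synchronize once with the main device before work is distributed, and devices that allocated nothing skip the final sync/free pass. Below is a minimal, self-contained sketch of that single-stream-per-device pattern, not part of the patch; the names CUDA_CHECK_SKETCH, streams_main, scale_kernel, and MAX_DEVICES are illustrative stand-ins rather than ggml identifiers.

// sketch.cu -- illustrative only; shows why one stream per device removes the
// need for a separate memcpy stream plus an event: operations enqueued on the
// same stream execute in order, so the kernel implicitly waits for the copy.
#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK_SKETCH(call)                                              \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                      \
                    cudaGetErrorString(err_), __FILE__, __LINE__);           \
            return 1;                                                        \
        }                                                                    \
    } while (0)

#define MAX_DEVICES 16

// one stream per device, analogous to g_cudaStreams_main[GGML_CUDA_MAX_DEVICES]
static cudaStream_t streams_main[MAX_DEVICES] = { nullptr };

__global__ void scale_kernel(float * x, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0f;
    }
}

int main() {
    int device_count = 0;
    CUDA_CHECK_SKETCH(cudaGetDeviceCount(&device_count));
    if (device_count > MAX_DEVICES) device_count = MAX_DEVICES;

    for (int id = 0; id < device_count; ++id) {
        CUDA_CHECK_SKETCH(cudaSetDevice(id));
        // create main stream (the one call the patch keeps in ggml_init_cublas)
        CUDA_CHECK_SKETCH(cudaStreamCreateWithFlags(&streams_main[id], cudaStreamNonBlocking));
    }

    const int n = 1 << 20;
    float * host = nullptr;
    CUDA_CHECK_SKETCH(cudaMallocHost(&host, n*sizeof(float))); // pinned, so async copies overlap
    for (int i = 0; i < n; ++i) host[i] = 1.0f;

    // enqueue copy and kernel on the SAME stream: the kernel cannot start before
    // the copy finishes, so no memcpy stream, event record, or stream wait is needed
    CUDA_CHECK_SKETCH(cudaSetDevice(0));
    float * dev = nullptr;
    CUDA_CHECK_SKETCH(cudaMalloc(&dev, n*sizeof(float)));
    CUDA_CHECK_SKETCH(cudaMemcpyAsync(dev, host, n*sizeof(float), cudaMemcpyHostToDevice, streams_main[0]));
    scale_kernel<<<(n + 255)/256, 256, 0, streams_main[0]>>>(dev, n);
    CUDA_CHECK_SKETCH(cudaGetLastError());
    CUDA_CHECK_SKETCH(cudaMemcpyAsync(host, dev, n*sizeof(float), cudaMemcpyDeviceToHost, streams_main[0]));

    // one synchronization at the end replaces per-op cudaDeviceSynchronize calls
    CUDA_CHECK_SKETCH(cudaStreamSynchronize(streams_main[0]));
    printf("host[0] = %f\n", host[0]); // expected 2.0

    CUDA_CHECK_SKETCH(cudaFree(dev));
    CUDA_CHECK_SKETCH(cudaFreeHost(host));
    return 0;
}

The trade-off the removed comment hinted at ("Set this to 1 for reproducible matrix multiplication") disappears with the multi-stream pool itself: with a single in-order stream per device there is no cross-stream scheduling to vary between runs, at the cost of giving up copy/compute overlap within one device.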