|
61 | 61 | #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
62 | 62 | #define cudaStreamNonBlocking hipStreamNonBlocking
|
63 | 63 | #define cudaStreamSynchronize hipStreamSynchronize
|
64 | | -#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
| 64 | +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
65 | 65 | #define cudaStream_t hipStream_t
|
66 | 66 | #define cudaSuccess hipSuccess
|
67 | 67 | #else
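Side note on the hunk above: the old HIP shim accepted only two arguments and hard-coded the wait flags to 0, so the three-argument CUDA-style calls further down could not expand under the HIP build; the new shim simply forwards all three arguments. The stand-alone sketch below is illustrative only (not code from this patch) and shows the three-argument call that the shim now maps one-to-one onto hipStreamWaitEvent: an event marks a point in stream s0, and s1 is ordered after that point, with 0 as the default wait flags.

    // wait_sketch.cu -- hypothetical stand-alone example, compile with nvcc
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        cudaStream_t s0, s1;
        cudaEvent_t  ev;
        cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);

        // ... enqueue work on s0 here ...
        cudaEventRecord(ev, s0);          // mark the current tail of s0
        cudaStreamWaitEvent(s1, ev, 0);   // s1 does not run past this point until s0 reaches ev; 0 = default flags
        // ... work enqueued on s1 from here on is ordered after the work captured by ev ...

        cudaStreamSynchronize(s1);
        cudaEventDestroy(ev);
        cudaStreamDestroy(s0);
        cudaStreamDestroy(s1);
        printf("done\n");
        return 0;
    }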
|
@@ -190,6 +190,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
190 | 190 | } while (0)
|
191 | 191 | #endif // CUDART_VERSION >= 11
|
192 | 192 |
|
| 193 | +#if CUDART_VERSION >= 11100
| 194 | +#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
| 195 | +#else
| 196 | +#define GGML_CUDA_ASSUME(x)
| 197 | +#endif // CUDART_VERSION >= 11100
| 198 | +
193 | 199 | #ifdef GGML_CUDA_F16
|
194 | 200 | typedef half dfloat; // dequantize float
|
195 | 201 | typedef half2 dfloat2;
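For context on the block added above: on toolkits where CUDART_VERSION >= 11100, GGML_CUDA_ASSUME(x) expands to the __builtin_assume(x) optimisation hint, and on anything older it expands to nothing, so the hints in the hunks below are dropped instead of breaking the build. The sketch that follows is a hypothetical, self-contained illustration (not from the repository) of what such a hint can buy: once the compiler may assume 0 <= k < 32, the division and modulo can be lowered to a shift and a mask.

    // assume_sketch.cu -- hypothetical example, compile with nvcc
    #include <cuda_runtime.h>
    #include <cstdio>

    // same shape as the macro added above: a hint on new toolkits, a no-op otherwise
    #if CUDART_VERSION >= 11100
    #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
    #else
    #define GGML_CUDA_ASSUME(x)
    #endif

    __global__ void demo(const int * in, int * out) {
        const int k = threadIdx.x;      // launched with blockDim.x == 32 below
        GGML_CUDA_ASSUME(k >= 0);
        GGML_CUDA_ASSUME(k < 32);
        const int kbx  = k / 8;         // with the hints: a plain shift, no sign fix-up
        const int kqsx = k % 8;         // with the hints: a plain mask
        out[k] = in[kbx * 8 + kqsx];
    }

    int main() {
        int *in, *out;
        cudaMallocManaged(&in,  32 * sizeof(int));
        cudaMallocManaged(&out, 32 * sizeof(int));
        for (int i = 0; i < 32; ++i) in[i] = i;
        demo<<<1, 32>>>(in, out);
        cudaDeviceSynchronize();
        printf("out[31] = %d\n", out[31]);   // prints 31
        cudaFree(in);
        cudaFree(out);
        return 0;
    }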
|
@@ -2145,10 +2151,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2145 | 2151 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2146 | 2152 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2147 | 2153 |
|
2148 | | - __builtin_assume(i_offset >= 0);
2149 | | - __builtin_assume(i_offset < nwarps);
2150 | | - __builtin_assume(k >= 0);
2151 | | - __builtin_assume(k < WARP_SIZE);
| 2154 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2155 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2156 | + GGML_CUDA_ASSUME(k >= 0);
| 2157 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2152 | 2158 |
|
2153 | 2159 | const int kbx = k / QI4_0;
|
2154 | 2160 | const int kqsx = k % QI4_0;
|
@@ -2239,10 +2245,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2239 | 2245 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2240 | 2246 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2241 | 2247 |
|
2242 | | - __builtin_assume(i_offset >= 0);
2243 | | - __builtin_assume(i_offset < nwarps);
2244 | | - __builtin_assume(k >= 0);
2245 | | - __builtin_assume(k < WARP_SIZE);
| 2248 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2249 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2250 | + GGML_CUDA_ASSUME(k >= 0);
| 2251 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2246 | 2252 |
|
2247 | 2253 | const int kbx = k / QI4_1;
|
2248 | 2254 | const int kqsx = k % QI4_1;
|
@@ -2331,10 +2337,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2331 | 2337 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2332 | 2338 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2333 | 2339 |
|
2334 | | - __builtin_assume(i_offset >= 0);
2335 | | - __builtin_assume(i_offset < nwarps);
2336 | | - __builtin_assume(k >= 0);
2337 | | - __builtin_assume(k < WARP_SIZE);
| 2340 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2341 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2342 | + GGML_CUDA_ASSUME(k >= 0);
| 2343 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2338 | 2344 |
|
2339 | 2345 | const int kbx = k / QI5_0;
|
2340 | 2346 | const int kqsx = k % QI5_0;
|
@@ -2445,10 +2451,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2445 | 2451 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2446 | 2452 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2447 | 2453 |
|
2448 | | - __builtin_assume(i_offset >= 0);
2449 | | - __builtin_assume(i_offset < nwarps);
2450 | | - __builtin_assume(k >= 0);
2451 | | - __builtin_assume(k < WARP_SIZE);
| 2454 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2455 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2456 | + GGML_CUDA_ASSUME(k >= 0);
| 2457 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2452 | 2458 |
|
2453 | 2459 | const int kbx = k / QI5_1;
|
2454 | 2460 | const int kqsx = k % QI5_1;
|
@@ -2551,10 +2557,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2551 | 2557 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2552 | 2558 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2553 | 2559 |
|
2554 | | - __builtin_assume(i_offset >= 0);
2555 | | - __builtin_assume(i_offset < nwarps);
2556 | | - __builtin_assume(k >= 0);
2557 | | - __builtin_assume(k < WARP_SIZE);
| 2560 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2561 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2562 | + GGML_CUDA_ASSUME(k >= 0);
| 2563 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2558 | 2564 |
|
2559 | 2565 | const int kbx = k / QI8_0;
|
2560 | 2566 | const int kqsx = k % QI8_0;
|
@@ -2642,10 +2648,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2642 | 2648 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2643 | 2649 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2644 | 2650 |
|
2645 | | - __builtin_assume(i_offset >= 0);
2646 | | - __builtin_assume(i_offset < nwarps);
2647 | | - __builtin_assume(k >= 0);
2648 | | - __builtin_assume(k < WARP_SIZE);
| 2651 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2652 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2653 | + GGML_CUDA_ASSUME(k >= 0);
| 2654 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2649 | 2655 |
|
2650 | 2656 | const int kbx = k / QI2_K;
|
2651 | 2657 | const int kqsx = k % QI2_K;
|
@@ -2763,10 +2769,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2763 | 2769 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2764 | 2770 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2765 | 2771 |
|
2766 | | - __builtin_assume(i_offset >= 0);
2767 | | - __builtin_assume(i_offset < nwarps);
2768 | | - __builtin_assume(k >= 0);
2769 | | - __builtin_assume(k < WARP_SIZE);
| 2772 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2773 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2774 | + GGML_CUDA_ASSUME(k >= 0);
| 2775 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2770 | 2776 |
|
2771 | 2777 | const int kbx = k / QI3_K;
|
2772 | 2778 | const int kqsx = k % QI3_K;
|
@@ -2981,10 +2987,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
2981 | 2987 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2982 | 2988 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2983 | 2989 |
|
2984 | | - __builtin_assume(i_offset >= 0);
2985 | | - __builtin_assume(i_offset < nwarps);
2986 | | - __builtin_assume(k >= 0);
2987 | | - __builtin_assume(k < WARP_SIZE);
| 2990 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 2991 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 2992 | + GGML_CUDA_ASSUME(k >= 0);
| 2993 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
2988 | 2994 |
|
2989 | 2995 | const int kbx = k / QI4_K; // == 0 if QK_K == 256
|
2990 | 2996 | const int kqsx = k % QI4_K; // == k if QK_K == 256
|
@@ -3162,10 +3168,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
3162 | 3168 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
3163 | 3169 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
3164 | 3170 |
|
3165 | | - __builtin_assume(i_offset >= 0);
3166 | | - __builtin_assume(i_offset < nwarps);
3167 | | - __builtin_assume(k >= 0);
3168 | | - __builtin_assume(k < WARP_SIZE);
| 3171 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 3172 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 3173 | + GGML_CUDA_ASSUME(k >= 0);
| 3174 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
3169 | 3175 |
|
3170 | 3176 | const int kbx = k / QI5_K; // == 0 if QK_K == 256
|
3171 | 3177 | const int kqsx = k % QI5_K; // == k if QK_K == 256
|
@@ -3291,10 +3297,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
3291 | 3297 | const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
3292 | 3298 | int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
3293 | 3299 |
|
3294 | | - __builtin_assume(i_offset >= 0);
3295 | | - __builtin_assume(i_offset < nwarps);
3296 | | - __builtin_assume(k >= 0);
3297 | | - __builtin_assume(k < WARP_SIZE);
| 3300 | + GGML_CUDA_ASSUME(i_offset >= 0);
| 3301 | + GGML_CUDA_ASSUME(i_offset < nwarps);
| 3302 | + GGML_CUDA_ASSUME(k >= 0);
| 3303 | + GGML_CUDA_ASSUME(k < WARP_SIZE);
3298 | 3304 |
|
3299 | 3305 | const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
3300 | 3306 | const int kqsx = k % QI6_K; // == k if QK_K == 256
|
@@ -6408,7 +6414,7 @@ static void ggml_cuda_op_mul_mat(
|
6408 | 6414 |
|
6409 | 6415 | // wait for main GPU data if necessary
|
6410 | 6416 | if (split && (id != g_main_device || is != 0)) {
|
6411 | | - CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
| 6417 | + CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
6412 | 6418 | }
|
6413 | 6419 |
|
6414 | 6420 | for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
|
@@ -6530,7 +6536,7 @@ static void ggml_cuda_op_mul_mat(
|
6530 | 6536 | CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6531 | 6537 | for (int64_t id = 0; id < g_device_count; ++id) {
|
6532 | 6538 | for (int64_t is = 0; is < is_max; ++is) {
|
6533 | | - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
| 6539 | + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
6534 | 6540 | }
|
6535 | 6541 | }
|
6536 | 6542 | }
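The two call-site hunks above only switch to the explicit three-argument form, passing 0 so the behaviour matches the default that the old two-argument shim hard-coded. The calls themselves order work across streams, including streams on different GPUs: an event recorded on one device's stream is waited on by another device's stream. Below is a hypothetical two-GPU sketch of that pattern, illustrative only; ggml_cuda_op_mul_mat's own bookkeeping is more involved.

    // cross_device_sketch.cu -- hypothetical example, needs two visible GPUs
    #include <cuda_runtime.h>

    int main() {
        int n = 0;
        cudaGetDeviceCount(&n);
        if (n < 2) return 0;                      // nothing to demonstrate on one GPU

        cudaStream_t s0, s1;
        cudaEvent_t  ev;

        cudaSetDevice(0);
        cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking);
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
        // ... enqueue GPU 0's part of the work on s0 ...
        cudaEventRecord(ev, s0);                  // publish "GPU 0 has reached this point"

        cudaSetDevice(1);
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cudaStreamWaitEvent(s1, ev, 0);           // GPU 1's stream waits on GPU 0's event
        // ... enqueue GPU 1 work that depends on GPU 0's results ...

        cudaStreamSynchronize(s1);
        cudaStreamDestroy(s1);
        cudaSetDevice(0);
        cudaEventDestroy(ev);
        cudaStreamDestroy(s0);
        return 0;
    }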
|
|