
Commit 64fcbe0

wrap __cvta_generic_to_shared for HIP
1 parent d19838e commit 64fcbe0

2 files changed: +13 −2 lines changed

ggml/src/ggml-cuda/cp-async.cuh

Lines changed: 11 additions & 0 deletions
@@ -2,6 +2,17 @@
 
 #include "common.cuh"
 
+
+static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
+#ifdef CP_ASYNC_AVAILABLE
+    return __cvta_generic_to_shared(generic_ptr);
+#else
+    GGML_UNUSED(generic_ptr);
+    NO_DEVICE_CODE;
+    return -1;
+#endif // CP_ASYNC_AVAILABLE
+}
+
 // Copies data from global to shared memory, cg == cache global.
 // Both the src and dst pointers must be aligned to 16 bytes.
 // Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
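For context, a minimal sketch of how a wrapper like this is typically consumed (the example_copy_16 helper below is an illustrative assumption, not code from this commit): the 32-bit shared-memory address returned by ggml_cuda_cvta_generic_to_shared() is fed to a 16-byte cp.async.cg copy, and the whole path compiles away when CP_ASYNC_AVAILABLE is not defined, which is what lets builds without cp.async (such as HIP) avoid the intrinsic.

// Illustrative sketch only, not part of this commit.
// Assumes "common.cuh" is included for CP_ASYNC_AVAILABLE, GGML_UNUSED and NO_DEVICE_CODE.
static __device__ __forceinline__ void example_copy_16(const unsigned int dst_shared, const void * src_global) {
#ifdef CP_ASYNC_AVAILABLE
    // cp.async.cg: asynchronous 16-byte global -> shared copy, cached at L2 only.
    asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
        : : "r"(dst_shared), "l"(src_global));
#else
    GGML_UNUSED(dst_shared);
    GGML_UNUSED(src_global);
    NO_DEVICE_CODE;
#endif // CP_ASYNC_AVAILABLE
}

// Hypothetical call site, mirroring the pattern used in fattn-mma-f16.cuh:
//   const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
//   example_copy_16(tile_KV_32, KV_pointer_in_global_memory);
//   asm volatile("cp.async.wait_all;" ::: "memory");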

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 2 additions & 2 deletions
@@ -112,7 +112,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
     // The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
 
     if (use_cp_async) {
-        const unsigned int tile_KV_32 = __cvta_generic_to_shared(tile_KV);
+        const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
 
         constexpr int preload = 64;
         constexpr int h2_per_chunk = 16/sizeof(half2);
@@ -186,7 +186,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
     constexpr int cols_per_warp = 8*WARP_SIZE/nbatch_fa;
     constexpr int stride_j = nwarps * cols_per_warp;
 
-    const unsigned int tile_mask_32 = __cvta_generic_to_shared(tile_mask);
+    const unsigned int tile_mask_32 = ggml_cuda_cvta_generic_to_shared(tile_mask);
 
     #pragma unroll
     for (int j0 = 0; j0 < ncols1; j0 += stride_j) {
