File tree 2 files changed +13
-2
lines changed
2 files changed +13
-2
lines changed Original file line number Diff line number Diff line change 2
2
3
3
#include " common.cuh"
4
4
5
+
6
// Converts a generic pointer to the 32-bit shared-memory address form required
// by cp.async instructions. Thin wrapper around the __cvta_generic_to_shared
// intrinsic so that callers still compile on targets without cp.async support.
// NOTE: renamed from "ctva" to "cvta" to match the call sites
// (ggml_cuda_cvta_generic_to_shared) and the underlying intrinsic spelling.
//
// generic_ptr: pointer into shared memory in generic address space.
// returns: the shared-memory window address as an unsigned int; on targets
//          without cp.async the function traps via NO_DEVICE_CODE and the
//          return value (-1, i.e. UINT_MAX) is never meaningfully used.
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
#ifdef CP_ASYNC_AVAILABLE
    return __cvta_generic_to_shared(generic_ptr);
#else
    GGML_UNUSED(generic_ptr);
    NO_DEVICE_CODE;
    return -1;
#endif // CP_ASYNC_AVAILABLE
}
15
+
5
16
// Copies data from global to shared memory, cg == cache global.
6
17
// Both the src and dst pointers must be aligned to 16 bytes.
7
18
// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
Original file line number Diff line number Diff line change @@ -112,7 +112,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
112
112
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
113
113
114
114
if (use_cp_async) {
115
- const unsigned int tile_KV_32 = __cvta_generic_to_shared (tile_KV);
115
+ const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared (tile_KV);
116
116
117
117
constexpr int preload = 64 ;
118
118
constexpr int h2_per_chunk = 16 /sizeof (half2);
@@ -186,7 +186,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
186
186
constexpr int cols_per_warp = 8 *WARP_SIZE/nbatch_fa;
187
187
constexpr int stride_j = nwarps * cols_per_warp;
188
188
189
- const unsigned int tile_mask_32 = __cvta_generic_to_shared (tile_mask);
189
+ const unsigned int tile_mask_32 = ggml_cuda_cvta_generic_to_shared (tile_mask);
190
190
191
191
#pragma unroll
192
192
for (int j0 = 0 ; j0 < ncols1; j0 += stride_j) {
You can’t perform that action at this time.
0 commit comments