File tree 2 files changed +13
-2
lines changed
2 files changed +13
-2
lines changed Original file line number Diff line number Diff line change 2
2
3
3
#include " common.cuh"
4
4
5
+
6
// Converts a generic pointer to the 32-bit shared-memory address form required
// by cp.async instructions. Thin wrapper around the __cvta_generic_to_shared
// intrinsic so that callers still compile on targets without cp.async support.
// NOTE: renamed from "ctva" to "cvta" to match the call sites
// (ggml_cuda_cvta_generic_to_shared) and the underlying intrinsic spelling.
//
// generic_ptr: pointer into shared memory in generic address space.
// returns: the shared-memory window address as an unsigned int; on targets
//          without cp.async the function traps via NO_DEVICE_CODE and the
//          return value (-1, i.e. UINT_MAX) is never meaningfully used.
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
#ifdef CP_ASYNC_AVAILABLE
    return __cvta_generic_to_shared(generic_ptr);
#else
    GGML_UNUSED(generic_ptr);
    NO_DEVICE_CODE;
    return -1;
#endif // CP_ASYNC_AVAILABLE
}
15
+
5
16
// Copies data from global to shared memory, cg == cache global.
6
17
// Both the src and dst pointers must be aligned to 16 bytes.
7
18
// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
Original file line number Diff line number Diff line change @@ -112,7 +112,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
112
112
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
113
113
114
114
if (use_cp_async) {
115
- const unsigned int tile_KV_32 = __cvta_generic_to_shared (tile_KV);
115
+ const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared (tile_KV);
116
116
117
117
constexpr int preload = 64 ;
118
118
constexpr int h2_per_chunk = 16 /sizeof (half2);
@@ -186,7 +186,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
186
186
constexpr int cols_per_warp = 8 *WARP_SIZE/nbatch_fa;
187
187
constexpr int stride_j = nwarps * cols_per_warp;
188
188
189
- const unsigned int tile_mask_32 = __cvta_generic_to_shared (tile_mask);
189
+ const unsigned int tile_mask_32 = ggml_cuda_cvta_generic_to_shared (tile_mask);
190
190
191
191
#pragma unroll
192
192
for (int j0 = 0 ; j0 < ncols1; j0 += stride_j) {
You can’t perform that action at this time.
0 commit comments