@@ -49,7 +49,6 @@ extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_
4949extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin[];
5050extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90_cu_cubin[];
5151extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin[];
52- extern unsigned char cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin[];
5352extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin[];
5453extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin[];
5554extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin[];
@@ -261,7 +260,7 @@ extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_tma_ws_sm90
261260extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
262261extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
263262extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
264- extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90 (Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
263+ extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_tma_ws_sm90 (Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
265264extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
266265extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
267266extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
@@ -282,7 +281,8 @@ extern void run_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_104_alibi_tma_w
282281extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
283282extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
284283extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
285- extern void run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
284+ extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
285+ extern void run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_output_bf16_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
286286extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
287287extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_40_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
288288extern void run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_48_tma_ws_sm90(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);
@@ -1354,7 +1354,6 @@ extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_w
13541354extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90_cu_cubin_len;
13551355extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90_cu_cubin_len;
13561356extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90_cu_cubin_len;
1357- extern uint32_t cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin_len;
13581357extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90_cu_cubin_len;
13591358extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90_cu_cubin_len;
13601359extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90_cu_cubin_len;
@@ -1976,8 +1975,8 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2
19761975{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_sliding_or_chunked_causal_softcapping_tma_ws_sm90_kernel", 196864, 384, 64, 2, 2, false, true, true, true, false, false, true, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_softcapping_tma_ws_sm90},
19771976{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90},
19781977{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_tma_ws_sm90},
1979- { DATA_TYPE_E4M3, DATA_TYPE_BF16 , 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90_kernel ", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90 },
1980- { DATA_TYPE_E4M3, DATA_TYPE_BF16 , 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_output_bf16_tma_ws_sm90_kernel ", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90 },
1978+ { DATA_TYPE_E4M3, DATA_TYPE_E4M3 , 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_tma_ws_sm90_kernel ", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_tma_ws_sm90 },
1979+ { DATA_TYPE_E4M3, DATA_TYPE_E4M3 , 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_softmax_tma_ws_sm90_kernel ", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_tma_ws_sm90 },
19811980{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_causal_alibi_tma_ws_sm90_kernel", 82304, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_32_alibi_tma_ws_sm90},
19821981{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 40, 40, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_40_alibi_tma_ws_sm90},
19831982{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 256, 48, 48, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_causal_alibi_tma_ws_sm90_kernel", 164224, 384, 64, 1, 0, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_48_alibi_tma_ws_sm90},
@@ -2000,8 +1999,10 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2
20001999{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 160, 160, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_160_alibi_tma_ws_sm90},
20012000{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 192, 192, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_192_alibi_tma_ws_sm90},
20022001{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 0, 64, 128, 256, 256, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_causal_alibi_tma_ws_sm90_kernel", 229632, 384, 64, 1, 2, false, true, true, true, true, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_paged_kv_256_alibi_tma_ws_sm90},
2003- { DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 256, 80, 80, 64, 64, 256, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_256_S_qkv_80_sage_64_64_256_output_bf16_tma_ws_sm90},
2004- { DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 256, 128, 128, 64, 64, 256, kSM_90, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90_kernel", 196864, 384, 64, 0, 0, false, true, true, true, false, false, false, false, nullptr},
2002+ { DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90},
2003+ { DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_output_bf16_tma_ws_sm90},
2004+ { DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 0, 3, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_output_bf16_tma_ws_sm90},
2005+ { DATA_TYPE_E4M3, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_causal_softmax_output_bf16_tma_ws_sm90_kernel", 164096, 384, 64, 1, 3, false, true, true, true, false, false, false, true, run_fmha_v2_flash_attention_e4m3_64_128_S_q_k_v_192x128_softmax_output_bf16_tma_ws_sm90},
20052006{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90_kernel", 73984, 384, 64, 0, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90},
20062007{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_causal_tma_ws_sm90_kernel", 73984, 384, 64, 1, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90},
20072008{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 256, 32, 32, 0, 0, 0, kSM_90, nullptr, 0, "fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_sliding_or_chunked_causal_tma_ws_sm90_kernel", 73984, 384, 64, 2, 0, false, true, true, true, false, false, false, false, run_fmha_v2_flash_attention_fp16_fp32_64_256_S_qkv_32_tma_ws_sm90},
0 commit comments