@@ -31,9 +31,9 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const {
   const auto& present_key = shader.AddOutput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
   const auto& present_value = shader.AddOutput("present_value", ShaderUsage::UseUniform);
   const auto& copy_kv_shape = shader.AddIndices("copy_kv_shape");
-  shader.AddInput("seqlen_k", ShaderUsage::None);
   // If prepare_indirect_dispatch is enabled, add seqlen_k input and indirect_buffer output
   if (prepare_indirect_dispatch_) {
+    shader.AddInput("seqlen_k", ShaderUsage::None);
     shader.AddOutput("indirect_buffer", ShaderUsage::None);
   }
 
@@ -42,8 +42,12 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const {
                              << "  let head_size_id = output_indices[3];\n"
                                 "  let sequence_id = output_indices[2];\n"
                                 "  let num_head_id = output_indices[1];\n"
-                                "  let batch = output_indices[0];\n"
-                                "  let total_seq_length = u32(seqlen_k[0u]) + 1u;\n";
+                                "  let batch = output_indices[0];\n";
+  if (prepare_indirect_dispatch_) {
+    shader.MainFunctionBody() << "  let total_seq_length = u32(seqlen_k[0u]) + 1u;\n";
+  } else {
+    shader.MainFunctionBody() << "  let total_seq_length = uniforms.total_sequence_length;\n";
+  }
 
   // Add indirect dispatch logic for thread 0
   if (prepare_indirect_dispatch_) {
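The hunk above is truncated before the thread-0 body that actually fills `indirect_buffer`. As a rough sketch of what that preparation typically looks like (an assumption, not this PR's exact code: it mirrors the CPU-side `SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile)` used on the non-indirect path, and relies on the `tile_size` and `num_heads` uniforms added further down):

```cpp
// Hypothetical sketch: thread 0 derives the tile count from the runtime total
// sequence length and writes the three u32 workgroup counts (x, y, z) that an
// indirect dispatch consumes.
if (prepare_indirect_dispatch_) {
  shader.MainFunctionBody() << "  if (global_idx == 0u) {\n"
                               "    let num_total_seq_length_tile = (total_seq_length + uniforms.tile_size - 1u) / uniforms.tile_size;\n"
                               "    indirect_buffer[0] = num_total_seq_length_tile * uniforms.num_heads;\n"
                               "    indirect_buffer[1] = 1u;\n"
                               "    indirect_buffer[2] = 1u;\n"
                               "  }\n";
}
```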
@@ -89,7 +93,7 @@ Status CopyKVCacheProgram::GenerateShaderCode(ShaderHelper& shader) const {
 Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters,
                    const Tensor* K, const Tensor* past_key, Tensor* present_key,
                    const Tensor* V, const Tensor* past_value, Tensor* present_value,
-                   const Tensor* seqlen_k, Tensor* indirect_buffer) {
+                   uint32_t tile_size, const Tensor* seqlen_k, Tensor* indirect_buffer) {
   // CopyKVCache takes past key/value and current key/value and copies them to present key and value.
   // This makes it so that FlashAttention only needs to look at present key and value, and saves
   // number of input buffers in the shader, which we run out of (<=8) without this optimization.
@@ -106,10 +110,9 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt
 
   // Determine if we need to prepare indirect dispatch
   bool prepare_indirect_dispatch = (indirect_buffer != nullptr);
-  constexpr uint32_t tile_size = 64;
 
   CopyKVCacheProgram program{"CopyKVCache", has_past, parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH, parameters.past_present_share_buffer_,
-                             prepare_indirect_dispatch, tile_size, static_cast<uint32_t>(parameters.num_heads_)};
+                             prepare_indirect_dispatch};
   if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) {
     program.AddInputs({{K, ProgramTensorMetadataDependency::TypeAndRank, components},
                        {V, ProgramTensorMetadataDependency::TypeAndRank, components}});
@@ -121,7 +124,7 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt
                        {V, ProgramTensorMetadataDependency::TypeAndRank, reshaped_KV_shape, components}});
   }
 
-  if (seqlen_k != nullptr) {
+  if (prepare_indirect_dispatch) {
     program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None});
   }
 
@@ -132,7 +135,6 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt
   program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components},
                       {present_value, ProgramTensorMetadataDependency::Rank, components}});
 
-  // Add indirect_buffer output if preparing indirect dispatch
   if (prepare_indirect_dispatch) {
     program.AddOutput({indirect_buffer, ProgramTensorMetadataDependency::None});
   }
@@ -142,6 +144,7 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt
       .SetWorkgroupSize(64)
       .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_, prepare_indirect_dispatch)
       .AddUniformVariables({{static_cast<uint32_t>(copy_size)},
+                            {static_cast<uint32_t>(parameters.total_sequence_length_)},
                             {static_cast<uint32_t>(parameters.kv_sequence_length_)},
                             {tile_size},
                             {static_cast<uint32_t>(parameters.num_heads_)}});
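Because the new `total_sequence_length` uniform is inserted between `copy_size` and `kv_sequence_length`, the program's uniform declarations need a matching entry in the same position. A minimal sketch of what the corresponding declaration in the header presumably looks like (the macro usage and the neighbouring names are assumptions based on the usual ORT WebGPU pattern, not taken from this PR):

```cpp
// Hypothetical sketch: the uniform declaration order must line up with the
// AddUniformVariables() order used above.
WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"copy_size", ProgramUniformVariableDataType::Uint32},
                                        {"total_sequence_length", ProgramUniformVariableDataType::Uint32},
                                        {"kv_sequence_length", ProgramUniformVariableDataType::Uint32},
                                        {"tile_size", ProgramUniformVariableDataType::Uint32},
                                        {"num_heads", ProgramUniformVariableDataType::Uint32});
```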
@@ -184,7 +187,9 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
 Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const {
   shader.AddInput("q", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
   shader.AddInput("present_key", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
-  shader.AddInput("seqlens_k", ShaderUsage::None);
+  if (use_indirect_dispatch_) {
+    shader.AddInput("seqlens_k", ShaderUsage::None);
+  }
   if (has_attention_bias_) {
     shader.AddInput("attention_bias", ShaderUsage::UseUniform);
   }
@@ -197,7 +202,8 @@ Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader)
                              WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_),
                              WGSL_TEMPLATE_PARAMETER(sub_tile_count, sub_tile_count),
                              WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_),
-                             WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec));
+                             WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec),
+                             WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_));
 }
 
 Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& context, const Tensor* Q,
@@ -209,10 +215,12 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte
   const bool has_attention_bias = attention_bias != nullptr;
   const int components = 4;
 
-  FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size};
+  FlashAttentionDecodeQKTProgram program{"FlashAttentionDecodeQKT", has_attention_bias, tile_size, use_indirect_dispatch};
   program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components},
-                     {present_key, ProgramTensorMetadataDependency::TypeAndRank, components},
-                     {seqlen_k, ProgramTensorMetadataDependency::None}});
+                     {present_key, ProgramTensorMetadataDependency::TypeAndRank, components}});
+  if (use_indirect_dispatch) {
+    program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None});
+  }
   if (has_attention_bias) {
     program.AddInput({attention_bias, ProgramTensorMetadataDependency::TypeAndRank});
   }
@@ -226,8 +234,9 @@ Status ComputeFlashAttentionDecodeQKT(onnxruntime::webgpu::ComputeContext& conte
     program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile);
   }
   program.SetWorkgroupSize(64)
-      .CacheHint(tile_size, has_attention_bias)
+      .CacheHint(tile_size, has_attention_bias, use_indirect_dispatch)
       .AddUniformVariables({{static_cast<uint32_t>(vectorized_head_size)},
+                            {static_cast<uint32_t>(parameters.total_sequence_length_)},
                             {static_cast<float>(alpha)},
                             {static_cast<uint32_t>(present_sequence_length)},
                             {static_cast<uint32_t>(parameters.n_reps)},
@@ -241,7 +250,9 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad
   shader.AddInput("metadata", ShaderUsage::UseUniform);
   shader.AddInput("qk", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
   shader.AddInput("present_value", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
-  shader.AddInput("seqlens_k", ShaderUsage::None);
+  if (use_indirect_dispatch_) {
+    shader.AddInput("seqlens_k", ShaderUsage::None);
+  }
   shader.AddOutput("out_split_vx", ShaderUsage::UseUniform);
 
   const uint32_t tile_size_k_vec = 8u;
@@ -250,7 +261,8 @@ Status FlashAttentionDecodeSplitVxProgram::GenerateShaderCode(ShaderHelper& shad
                              WGSL_TEMPLATE_PARAMETER(head_size_vec, head_size_vec_),
                              WGSL_TEMPLATE_PARAMETER(sub_tile_count, WorkgroupSizeX() / tile_size_k_vec),
                              WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_),
-                             WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec));
+                             WGSL_TEMPLATE_PARAMETER(tile_size_k_vec, tile_size_k_vec),
+                             WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_));
 }
 
 Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeContext& context,
@@ -268,20 +280,21 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte
                                                uint32_t present_sequence_length) {
   const int components = 4;
   int head_size_vec = parameters.v_head_size_ / components;
-  FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec};
+  FlashAttentionDecodeSplitVxProgram program{"FlashAttentionDecodeSplitVx", tile_size, head_size_vec, use_indirect_dispatch};
   program.AddInputs({{metadata, ProgramTensorMetadataDependency::TypeAndRank, 2},
                      {qk, ProgramTensorMetadataDependency::TypeAndRank},
-                     {present_value, ProgramTensorMetadataDependency::TypeAndRank, components},
-                     {seqlen_k, ProgramTensorMetadataDependency::None}});
+                     {present_value, ProgramTensorMetadataDependency::TypeAndRank, components}});
   program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}});  // [B, N, split_k, head_size]
   if (use_indirect_dispatch) {
-    program.SetIndirectDispatchTensor(indirect_buffer);
+    program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None})
+        .SetIndirectDispatchTensor(indirect_buffer);
   } else {
     program.SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile);
   }
-  program.CacheHint(tile_size, head_size_vec)
+  program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch)
       .SetWorkgroupSize(64)
-      .AddUniformVariables({{static_cast<uint32_t>(head_size_vec)},
+      .AddUniformVariables({{static_cast<uint32_t>(parameters.total_sequence_length_)},
+                            {static_cast<uint32_t>(head_size_vec)},
                             {static_cast<uint32_t>(present_sequence_length)},
                             {static_cast<uint32_t>(parameters.n_reps)},
                             num_present_sequence_length_tile,
@@ -292,31 +305,39 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte
 
 Status FlashAttentionDecodeVxReduceProgram::GenerateShaderCode(ShaderHelper& shader) const {
   shader.AddInput("input", ShaderUsage::UseUniform);
-  shader.AddInput("seqlens_k", ShaderUsage::None);
+  if (use_indirect_dispatch_) {
+    shader.AddInput("seqlens_k", ShaderUsage::None);
+  }
   shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
 
   return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention_decode_vx_reduce.wgsl.template",
-                             WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_));
+                             WGSL_TEMPLATE_PARAMETER(tile_size, tile_size_),
+                             WGSL_TEMPLATE_PARAMETER(use_indirect_dispatch, use_indirect_dispatch_));
 }
 
 Status ComputeFlashAttentionDecodeVxReduce(onnxruntime::webgpu::ComputeContext& context,
                                            const Tensor* out_split_vx,
                                            Tensor* output,
                                            const Tensor* seqlen_k,
                                            const WebgpuAttentionParameters& parameters,
-                                           uint32_t num_present_sequence_length_tile) {
+                                           uint32_t num_total_seq_length_tile,
+                                           uint32_t num_present_sequence_length_tile,
+                                           bool use_indirect_dispatch) {
   const int components = 4;
   constexpr int tile_size = 8;
   int tile_head_size = tile_size * components;
-  FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size};
-  program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components},
-                     {seqlen_k, ProgramTensorMetadataDependency::None}});
+  FlashAttentionDecodeVxReduceProgram program{"FlashAttentionDecodeVxReduce", tile_size, use_indirect_dispatch};
+  program.AddInputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}});
+  if (use_indirect_dispatch) {
+    program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None});
+  }
   program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, components}});
   const uint32_t num_head_size_tile = static_cast<uint32_t>((parameters.v_head_size_ + tile_head_size - 1) / tile_head_size);
   program.SetDispatchGroupSize(parameters.num_heads_ * num_head_size_tile)
-      .CacheHint(tile_size)
+      .CacheHint(tile_size, use_indirect_dispatch)
       .SetWorkgroupSize(tile_size * tile_size)
       .AddUniformVariables({{static_cast<uint32_t>(parameters.v_head_size_ / components)},
+                            num_total_seq_length_tile,
                             num_present_sequence_length_tile,
                             {num_head_size_tile},
                             {static_cast<uint32_t>(parameters.num_heads_)}});
@@ -332,10 +353,9 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
   const int present_sequence_length = static_cast<int>(present_key->Shape()[2]);
 
   if (parameters.sequence_length_ > 1) {
-    // For encode path, use the original CopyKVCache without indirect dispatch preparation
-    ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, seqlen_k, nullptr));
-
     const uint32_t tile_size = 64;
+    // For encode path, use the original CopyKVCache without indirect dispatch preparation
+    ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr));
     bool has_attention_bias = attention_bias != nullptr;
     bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"};
     bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"};
@@ -394,10 +414,10 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
     indirect_buffer = context.CreateGPUTensor(DataTypeImpl::GetType<uint32_t>(), indirect_buffer_shape);
     indirect_buffer_ptr = &indirect_buffer;
     // Use the fused CopyKVCache that also prepares the indirect dispatch buffer
-    ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, seqlen_k, indirect_buffer_ptr));
+    ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, indirect_buffer_ptr));
   } else {
     // Use the original CopyKVCache without indirect dispatch preparation
-    ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, seqlen_k, nullptr));
+    ORT_RETURN_IF_ERROR(CopyKVCache(context, parameters, K, past_key, present_key, V, past_value, present_value, tile_size, seqlen_k, nullptr));
   }
 
   // The metadata is used to store the max and sum of each tile.
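For a sense of the numbers involved: the decode kernels split `total_sequence_length` into tiles of `tile_size` rows, and the `indirect_buffer` created above is a u32 tensor holding the three workgroup counts an indirect dispatch consumes. A small self-contained sketch of that arithmetic (example values only; the exact buffer layout is an assumption kept consistent with the 1-D `SetDispatchGroupSize` used on the non-indirect path):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t tile_size = 64;               // same tile size passed to CopyKVCache above
  const uint32_t total_sequence_length = 300;  // example runtime value (past + new token)
  const uint32_t num_heads = 32;               // example value

  // Ceil division, mirroring SetDispatchGroupSize(parameters.num_heads_ * num_total_seq_length_tile).
  const uint32_t num_total_seq_length_tile = (total_sequence_length + tile_size - 1) / tile_size;

  // Assumed contents of the 3 x u32 indirect buffer written on the GPU by CopyKVCache's thread 0.
  const uint32_t indirect[3] = {num_total_seq_length_tile * num_heads, 1u, 1u};
  std::printf("tiles=%u dispatch=(%u, %u, %u)\n",
              num_total_seq_length_tile, indirect[0], indirect[1], indirect[2]);
  return 0;
}
```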
@@ -420,7 +440,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
                                                        num_present_sequence_length_tile, tile_size,
                                                        use_indirect_dispatch, present_sequence_length));
   ORT_RETURN_IF_ERROR(ComputeFlashAttentionDecodeVxReduce(context, &out_split_vx, output, seqlen_k, parameters,
-                                                          num_present_sequence_length_tile));
+                                                          num_total_seq_length_tile,
+                                                          num_present_sequence_length_tile, use_indirect_dispatch));
 
   return Status::OK();
 }