neural_compressor/torch/algorithms/fp8_quant/_quant_common (1 file changed: +10, -0 lines)

@@ -725,6 +725,9 @@ def forward(
         is_causal=False,
         scale=None,
         softmax_mode="None",
+        recompute=None,
+        valid_seq_len=None,
+        seq_padding_type="None",
     ):
         qinput = self.quant_q(q).detach()
         kinput = self.quant_k(k).detach()
@@ -746,6 +749,8 @@ def forward(
             q_scale_o=self.scale_output,
             d_scale_s=self.descale_amax,
             is_amax_s=False,
+            valid_seq_len=valid_seq_len,
+            seq_padding_type=seq_padding_type
         )
         output = results[0]
         d_out = self.dequant_output(output)
@@ -761,6 +766,9 @@ def forward_measure(
         is_causal=False,
         scale=None,
         softmax_mode="fast",
+        recompute=None,
+        valid_seq_len=None,
+        seq_padding_type="None",
     ):
         dq = q.detach()
         dk = k.detach()
@@ -777,6 +785,8 @@ def forward_measure(
             # fp8_fused_sdpa in bf16 can use either FastSoftmax or regular
             softmax_mode="fast",
             is_amax_s=True,
+            valid_seq_len=valid_seq_len,
+            seq_padding_type=seq_padding_type
         )
         output = results[0]
         amax = results[1]
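
For context, here is a minimal self-contained sketch of the pattern this diff introduces: both `forward` and `forward_measure` gain `recompute`, `valid_seq_len`, and `seq_padding_type` parameters, and the latter two are threaded through to the fused-SDPA call. The class and the kernel stand-in below are invented for illustration and are not the actual neural_compressor implementation; the quantize/dequantize steps from the diff are replaced by identity ops, and the stub uses `torch.nn.functional.scaled_dot_product_attention` (PyTorch >= 2.1 for the `scale` kwarg) in place of the real fused FP8 kernel.

```python
import torch
import torch.nn.functional as F


def _fused_sdpa_stub(q, k, v, *, is_causal=False, scale=None, softmax_mode="None",
                     is_amax_s=False, valid_seq_len=None, seq_padding_type="None"):
    """Stand-in for the fused FP8 SDPA kernel; returns (output, amax) like the
    real call site expects. valid_seq_len / seq_padding_type are accepted but
    ignored in this sketch."""
    out = F.scaled_dot_product_attention(q, k, v, is_causal=is_causal, scale=scale)
    return out, out.abs().amax()


class SketchPatchedFusedSDPA(torch.nn.Module):
    """Illustrative wrapper mirroring the diff's signature changes only."""

    def forward(self, q, k, v, is_causal=False, scale=None, softmax_mode="None",
                recompute=None, valid_seq_len=None, seq_padding_type="None"):
        results = _fused_sdpa_stub(
            q.detach(), k.detach(), v.detach(),
            is_causal=is_causal,
            scale=scale,
            softmax_mode=softmax_mode,
            is_amax_s=False,
            valid_seq_len=valid_seq_len,        # new argument threaded through
            seq_padding_type=seq_padding_type,  # new argument threaded through
        )
        return results[0]

    def forward_measure(self, q, k, v, is_causal=False, scale=None, softmax_mode="fast",
                        recompute=None, valid_seq_len=None, seq_padding_type="None"):
        results = _fused_sdpa_stub(
            q.detach(), k.detach(), v.detach(),
            is_causal=is_causal,
            scale=scale,
            softmax_mode="fast",
            is_amax_s=True,
            valid_seq_len=valid_seq_len,
            seq_padding_type=seq_padding_type,
        )
        output, amax = results[0], results[1]
        return output, amax


if __name__ == "__main__":
    q = k = v = torch.randn(1, 2, 16, 8)
    print(SketchPatchedFusedSDPA()(q, k, v).shape)  # torch.Size([1, 2, 16, 8])
```

Note that the new parameters default to `None` / `"None"`, presumably so call sites that do not pass them keep their previous behavior unchanged.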