
Commit 4c5e920

cyyever authored and Silv3S committed
[4/N] Use Python 3.10 typing (pytorch#167458)
This PR applies new Union and Optional typing syntax to some files.

Pull Request resolved: pytorch#167458
Approved by: https://github.com/albanD
1 parent 80b541b commit 4c5e920

30 files changed (+145, -162 lines)
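For context, the change this PR applies is the PEP 604 spelling of unions: starting with Python 3.10, X | Y is native syntax, so Optional[T] can be written T | None with no typing import. A minimal before/after sketch (illustrative only, not code from this PR; the function names are made up):

# Before: pre-3.10 spelling, needs imports from typing.
from typing import Optional, Union

def head_scale(scale: Optional[float] = None) -> Union[float, int]:
    return scale if scale is not None else 1

# After: PEP 604 spelling, native on Python 3.10+.
def head_scale_new(scale: float | None = None) -> float | int:
    return scale if scale is not None else 1

On older interpreters the new spelling still works in annotations under from __future__ import annotations (annotations are then stored as strings, not evaluated), but evaluating them, e.g. via typing.get_type_hints, requires 3.10+.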

torch/nativert/backends/_lowered_aoti_module.py

Lines changed: 2 additions & 4 deletions
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import torch
 from torch.export import ExportedProgram
 
@@ -10,7 +8,7 @@ def __init__(
         original_exported_program: ExportedProgram,
         backend_id: str,
         *,
-        module_name: Optional[str] = None,
+        module_name: str | None = None,
     ) -> None:
         super().__init__()
         self._backend_id = backend_id
@@ -22,7 +20,7 @@ def backend_id(self) -> str:
         return self._backend_id
 
     @property
-    def module_name(self) -> Optional[str]:
+    def module_name(self) -> str | None:
         return self._module_name
 
     @property

torch/nested/__init__.py

Lines changed: 7 additions & 7 deletions
@@ -26,8 +26,8 @@
 
 def as_nested_tensor(
     ts: Union[Tensor, list[Tensor], tuple[Tensor, ...]],
-    dtype: Optional[DType] = None,
-    device: Optional[Device] = None,
+    dtype: DType | None = None,
+    device: Device | None = None,
     layout=None,
 ) -> Tensor:
     r"""
@@ -358,11 +358,11 @@ def narrow(
 
 def nested_tensor_from_jagged(
     values: Tensor,
-    offsets: Optional[Tensor] = None,
-    lengths: Optional[Tensor] = None,
-    jagged_dim: Optional[int] = None,
-    min_seqlen: Optional[int] = None,
-    max_seqlen: Optional[int] = None,
+    offsets: Tensor | None = None,
+    lengths: Tensor | None = None,
+    jagged_dim: int | None = None,
+    min_seqlen: int | None = None,
+    max_seqlen: int | None = None,
 ) -> Tensor:
     r"""
     Constructs a jagged layout nested tensor from the given jagged components. The jagged layout

torch/nested/_internal/ops.py

Lines changed: 2 additions & 3 deletions
@@ -3,7 +3,6 @@
 import math
 import operator
 from typing import *  # noqa: F403
-from typing import Optional
 
 import torch
 import torch.nn.functional as F
@@ -249,7 +248,7 @@ def inner(*args, **kwargs):
 register_jagged_func = functools.partial(register_func, JAGGED_OPS_TABLE)
 
 
-def lookup_jagged(func, *args, **kwargs) -> Optional[Callable]:
+def lookup_jagged(func, *args, **kwargs) -> Callable | None:
     dispatch_func = JAGGED_OPS_TABLE.get(func, None)
     if dispatch_func is not None:
         return dispatch_func
@@ -1138,7 +1137,7 @@ def unbind_int(func, *args, **kwargs):
     lengths = inp.lengths()
     ragged_idx = inp._ragged_idx
 
-    def _torch_check(_lengths: list[int], _offsets: Optional[list[int]] = None) -> None:
+    def _torch_check(_lengths: list[int], _offsets: list[int] | None = None) -> None:
         # This torch._check are needed for torch.compile
         # symbolic shapes processing.
         # offsets and lengths are symbolic variables during compilation,

torch/nested/_internal/sdpa.py

Lines changed: 4 additions & 5 deletions
@@ -1,6 +1,5 @@
 # mypy: allow-untyped-defs
 import logging
-from typing import Optional
 
 import torch
 import torch.nn
@@ -27,7 +26,7 @@ def _validate_sdpa_input(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    attn_mask: Optional[torch.Tensor] = None,
+    attn_mask: torch.Tensor | None = None,
     dropout_p=0.0,
     is_causal=False,
     scale=None,
@@ -668,8 +667,8 @@ def _autocast(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    attn_mask: Optional[torch.Tensor],
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    attn_mask: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor | None]:
     """
     [Autocasting SDPA for NJT]
 
@@ -714,7 +713,7 @@ def jagged_scaled_dot_product_attention(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    attn_mask: Optional[torch.Tensor] = None,
+    attn_mask: torch.Tensor | None = None,
     dropout_p=0.0,
     is_causal=False,
     scale=None,

torch/nn/attention/_utils.py

Lines changed: 2 additions & 3 deletions
@@ -2,7 +2,6 @@
 """Defines utilities for interacting with scaled_dot_product_attention"""
 
 import math
-from typing import Optional
 
 import torch
 
@@ -22,7 +21,7 @@ def _postprocess_flash_output(inpt_tensor: torch.Tensor, og_size: int) -> torch.
     return inpt_tensor
 
 
-def _calculate_scale(head_dim_size: int, scale: Optional[float]) -> float:
+def _calculate_scale(head_dim_size: int, scale: float | None) -> float:
     """
     For FlashAttention we pad the head dimension to be a multiple of 8 so we need to scale the output
     by the original head size and not the padded.
@@ -36,7 +35,7 @@ def _validate_sdpa_input(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    attn_mask: Optional[torch.Tensor] = None,
+    attn_mask: torch.Tensor | None = None,
     dropout_p=0.0,
     is_causal=False,
     scale=None,

torch/nn/attention/bias.py

Lines changed: 2 additions & 3 deletions
@@ -2,7 +2,6 @@
 """Defines bias subclasses that work with scaled_dot_product_attention"""
 
 from enum import auto, IntEnum
-from typing import Optional
 from warnings import warn
 
 import torch
@@ -155,7 +154,7 @@ def _lower_right(self, device: torch.device) -> torch.Tensor:
         )
 
     # pyrefly: ignore [bad-return]
-    def _materialize(self, device: Optional[torch.device] = None) -> torch.Tensor:
+    def _materialize(self, device: torch.device | None = None) -> torch.Tensor:
         """
         Materializes the causal bias into a tensor form.
 
@@ -183,7 +182,7 @@ def _dispatch(
         attn_mask: "CausalBias",
         dropout_p: float = 0.0,
         is_causal: bool = False,
-        scale: Optional[float] = None,
+        scale: float | None = None,
         enable_gqa: bool = False,
     ) -> torch.Tensor:
         r"""

torch/nn/attention/experimental/_paged_attention.py

Lines changed: 7 additions & 7 deletions
@@ -4,7 +4,7 @@
 This module is experimental and subject to change.
 """
 
-from typing import Optional, Union
+from typing import Union
 
 import torch
 from torch.nn.attention.flex_attention import (
@@ -197,8 +197,8 @@ def assign(
     def convert_logical_block_mask(
         self,
         block_mask: BlockMask,
-        batch_idx: Optional[torch.Tensor] = None,
-        kv_len: Optional[torch.Tensor] = None,
+        batch_idx: torch.Tensor | None = None,
+        kv_len: torch.Tensor | None = None,
     ) -> BlockMask:
         """
         Converts a logical block mask by mapping its logical kv indices to the corresponding
@@ -279,8 +279,8 @@ def convert_logical_block_mask(
 
     def get_mask_mod(
         self,
-        mask_mod: Optional[_mask_mod_signature],
-        kv_len: Optional[torch.Tensor] = None,
+        mask_mod: _mask_mod_signature | None,
+        kv_len: torch.Tensor | None = None,
     ) -> _mask_mod_signature:
         """
         Converts a mask_mod based on mapping from the physical block index to the logical
@@ -316,8 +316,8 @@ def new_mask_mod(
 
     def get_score_mod(
         self,
-        score_mod: Optional[_score_mod_signature],
-        kv_len: Optional[torch.Tensor] = None,
+        score_mod: _score_mod_signature | None,
+        kv_len: torch.Tensor | None = None,
     ) -> _score_mod_signature:
         """
         Converts a score_mod based on mapping from the physical block index to the logical
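A quick way to convince yourself this kind of mechanical rewrite is behavior-preserving for code that introspects annotations: on Python 3.10+ the two spellings compare equal at runtime. A small sanity-check sketch (hypothetical, not part of the PR):

from typing import Optional, Union, get_type_hints

# str | None builds a types.UnionType; Optional[str] builds typing.Union[str, None].
# On 3.10+ the two compare equal, so annotation introspection sees no difference.
assert (str | None) == Optional[str]
assert (int | str) == Union[int, str]

def f(x: str | None = None) -> None: ...
def g(x: Optional[str] = None) -> None: ...
assert get_type_hints(f) == get_type_hints(g)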
