pytorch
diff --git a/‎.github/workflows/_link_check.yml
Lines changed: 30 additions & 14 deletions b/‎.github/workflows/_link_check.yml
Lines changed: 30 additions & 14 deletions
diff --git a/‎.github/workflows/build-presets.yml
Lines changed: 13 additions & 0 deletions b/‎.github/workflows/build-presets.yml
Lines changed: 13 additions & 0 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/TARGETS
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/TARGETS
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 9 additions & 5 deletions b/‎backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 9 additions & 5 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 4 additions & 0 deletions b/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/arm/_passes/convert_split_to_slice.py
Lines changed: 6 additions & 5 deletions b/‎backends/arm/_passes/convert_split_to_slice.py
Lines changed: 6 additions & 5 deletions
diff --git a/‎backends/arm/_passes/decompose_cosine_similarity_pass.py
Lines changed: 75 additions & 0 deletions b/‎backends/arm/_passes/decompose_cosine_similarity_pass.py
Lines changed: 75 additions & 0 deletions
diff --git a/‎backends/arm/_passes/replace_inf_values_pass.py
Lines changed: 45 additions & 0 deletions b/‎backends/arm/_passes/replace_inf_values_pass.py
Lines changed: 45 additions & 0 deletions
@@ -7,35 +7,51 @@ on:
 
 jobs:
   lint-urls:
+    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-linter
-      submodules: 'none'
+      submodules: false
       fetch-depth: 0
       ref: ${{ inputs.ref }}
-      timeout: 90
+      timeout: 120
       script: |
         ./scripts/lint_urls.sh $(
-          [ "${{ github.event_name }}" = "pull_request" ] \
-            && git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
-          || [ "${{ github.event_name }}" = "push" ] \
-            && git diff --name-only ${{ github.event.before }} ${{ github.sha }}
-        )
+          { [ "${{ github.event_name }}" = "pull_request" ] \
+              && git diff --name-only "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"; } \
+          || \
+          { [ "${{ github.event_name }}" = "push" ] \
+              && git diff --name-only "${{ github.event.before }}...${{ github.sha }}"; }
+        ) || {
+          echo
+          echo "URL lint failed."
+          echo "If this is a transient outage, you can bypass it by adding the \`skip-url-lint\` label to your PR."
+          echo "Or add \`@lint-ignore\` somewhere on the same line as the URL you want to skip checking."
+          exit 1
+        }
 
   lint-xrefs:
+    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-linter
-      submodules: 'none'
+      submodules: false
       fetch-depth: 0
       ref: ${{ inputs.ref }}
-      timeout: 90
+      timeout: 60
       script: |
         ./scripts/lint_xrefs.sh $(
-          [ "${{ github.event_name }}" = "pull_request" ] \
-            && git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
-          || [ "${{ github.event_name }}" = "push" ] \
-            && git diff --name-only ${{ github.event.before }} ${{ github.sha }}
-        )
+          { [ "${{ github.event_name }}" = "pull_request" ] \
+              && git diff --name-only "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"; } \
+          || \
+          { [ "${{ github.event_name }}" = "push" ] \
+              && git diff --name-only "${{ github.event.before }}...${{ github.sha }}"; }
+        ) || {
+          echo
+          echo "Xref lint failed."
+          echo "If this is a transient outage, you can bypass it by adding the \`skip-xref-lint\` label to your PR."
+          echo "Or add \`@lint-ignore\` somewhere on the same line as the reference you want to skip checking."
+          exit 1
+        }
@@ -0,0 +1,13 @@
+name: Build Presets
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
@@ -608,7 +608,7 @@ endif()
 # any backends.
 #
 add_library(executorch ${_executorch__srcs})
-target_link_libraries(executorch PUBLIC executorch_core)
+target_link_libraries(executorch PRIVATE executorch_core)
 target_include_directories(executorch PUBLIC ${_common_include_directories})
 target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
 target_compile_options(executorch PUBLIC ${_common_compile_options})
 
@@ -11,5 +11,6 @@ python_library(
         "//executorch/backends/xnnpack/_passes:xnnpack_passes",
         "//executorch/exir:lib",
         "//executorch/backends/transforms:utils",
+        "//executorch/backends/transforms:decompose_sdpa",
     ],
 )
@@ -19,6 +19,7 @@
 from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_batchnorm_pass import DecomposeBatchNormPass  # noqa
+from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
@@ -57,4 +58,5 @@
 from .size_adjust_conv2d_pass import SizeAdjustConv2DPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
 from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass  # noqa
+from .replace_inf_values_pass import ReplaceInfValues  # noqa  # usort: skip
 from .arm_pass_manager import ArmPassManager  # noqa  # usort: skip
@@ -70,17 +70,14 @@ def call(self, graph_module: GraphModule) -> PassResult:
             if quantized_input:
                 matmul_args = matmul_node.all_input_nodes
                 for node in matmul_args:
+                    # Find the dq-node connected to this mm/bmm arg
                     input_node = self._match_partition_to_node(
                         node, partition.input_nodes
                     )
-
-                    # Remove partition input dq-node
-                    input_node.replace_all_uses_with(input_node.all_input_nodes[0])
-                    graph_module.graph.erase_node(input_node)
                     input_node_qargs = QuantArgs.from_operator(
                         input_node.target, input_node.args
                     )
-
+                    # Insert new dq-node just before the mm/bmm with input_node's qparams
                     with graph_module.graph.inserting_before(matmul_node):
                         # Create new dq-node before matmul
                         dq_node = create_node(
@@ -90,6 +87,13 @@ def call(self, graph_module: GraphModule) -> PassResult:
                         dq_node.args = (node, *input_node_qargs)
                         matmul_node.replace_input_with(node, dq_node)
 
+                for partition_input in partition.input_nodes:
+                    # Remove partition input dq-node
+                    partition_input.replace_all_uses_with(
+                        partition_input.all_input_nodes[0]
+                    )
+                    graph_module.graph.erase_node(partition_input)
+
             partition_output = list(partition.output_nodes[0].users)[0]
             quantized_output = partition_output.target == q_op
             if quantized_output:
 
@@ -24,6 +24,7 @@
     ConvertSqueezesToViewPass,
     ConvertToClampPass,
     DecomposeBatchNormPass,
+    DecomposeCosineSimilarityPass,
     DecomposeDivPass,
     DecomposeGeluPass,
     DecomposeLayerNormPass,
@@ -49,6 +50,7 @@
     MatchWhereSelfDtypePass,
     QuantizeOperatorArguments,
     RemoveClonePass,
+    ReplaceInfValues,
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
@@ -204,6 +206,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(DecomposeNotEqualPass())
+        self.add_pass(DecomposeCosineSimilarityPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeSqrtPass())
@@ -216,4 +219,5 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
             self.add_pass(DecomposeSoftmaxPass())
 
         self.add_pass(ConvertMinMaxPass())
+        self.add_pass(ReplaceInfValues())
         return self._transform(graph_module)
@@ -1,14 +1,15 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
-# All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
 import torch.fx
-from executorch.backends.arm._passes.arm_pass_utils import create_node
-from executorch.backends.arm.tosa_mapping import extract_tensor_meta
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 
@@ -34,7 +35,7 @@ def call(self, graph_module: torch.fx.GraphModule):
             split_node = node
             input_node = split_node.all_input_nodes[0]
             output_nodes = split_node.users.copy()
-            _, shape, _ = extract_tensor_meta(input_node.meta)
+            shape = get_first_fake_tensor(input_node).shape
             rank = len(shape)
             split_lengths = split_node.args[1]
             dim = split_node.args[2] if len(split_node.args) > 2 else 0
 
@@ -0,0 +1,75 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass
+
+torch_cosine_similarity = (torch.ops.aten.cosine_similarity.default,)
+
+
+class DecomposeCosineSimilarityPass(ExportPass):
+    """
+    Decomposition of aten.cosine_similarity:
+
+      dot    = sum(mul(x1, x2), dims, keepdim=False)
+      norm   = pow( sum(mul(x, x), dims, keepdim=False), 0.5 )
+      eps    = full_like( norm1, eps_scalar )
+      n1c    = max(norm1, eps)
+      n2c    = max(norm2, eps)
+      denom  = mul(n1c, n2c)
+      out    = div(dot, denom)
+
+    ``dim`` and ``eps`` are taken from the positional arguments when they are
+    present and from the keyword arguments otherwise, falling back to the
+    defaults 1 and 1e-8.
+    """
+
+    def _l2_norm(self, x, dims, meta):
+        # norm = pow(sum(x * x, dims, keepdim=False), 0.5)
+        squared = super().call_operator(torch.ops.aten.mul.Tensor, (x, x), {}, meta)
+        summed = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (squared, dims, False), {}, meta
+        )
+        return super().call_operator(
+            torch.ops.aten.pow.Tensor_Scalar, (summed, 0.5), {}, meta
+        )
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in torch_cosine_similarity:
+            return super().call_operator(op, args, kwargs, meta)
+
+        x1, x2 = args[0], args[1]
+        # dim and eps are not keyword-only in the aten schema, so they may
+        # arrive positionally; only fall back to kwargs/defaults when absent.
+        dim = args[2] if len(args) > 2 else kwargs.get("dim", 1)
+        eps = args[3] if len(args) > 3 else kwargs.get("eps", 1e-8)
+        dims = [dim] if isinstance(dim, int) else list(dim)
+
+        # 1) dot
+        prod = super().call_operator(torch.ops.aten.mul.Tensor, (x1, x2), {}, meta)
+        dot = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (prod, dims, False), {}, meta
+        )
+
+        # 2) L2 norms of both inputs over the same dims
+        norm1 = self._l2_norm(x1, dims, meta)
+        norm2 = self._l2_norm(x2, dims, meta)
+
+        # 3) eps tensor - we need to broadcast ourselves as TOSA does not do
+        #    this for scalars, so eps is materialized with norm1's shape
+        eps_t = super().call_operator(
+            torch.ops.aten.full_like.default, (norm1, eps), {}, meta
+        )
+
+        # 4) clamp to avoid zero division
+        n1c = super().call_operator(
+            torch.ops.aten.maximum.default, (norm1, eps_t), {}, meta
+        )
+        n2c = super().call_operator(
+            torch.ops.aten.maximum.default, (norm2, eps_t), {}, meta
+        )
+
+        # 5) denom and divide
+        denom = super().call_operator(torch.ops.aten.mul.Tensor, (n1c, n2c), {}, meta)
+        out = super().call_operator(torch.ops.aten.div.Tensor, (dot, denom), {}, meta)
+
+        return out
@@ -0,0 +1,45 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This pass is based on backends/qualcomm/_passes/replace_inf_values.py
+# with some modifications to the values used to replace inf.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class ReplaceInfValues(ExportPass):
+    """
+    Due to limitations in the quantizer, inf/-inf are changed to more
+    quantizable values.
+
+    Floating-point buffers and node arguments equal to +inf / -inf are
+    rewritten to +255 / -255. The pass reports a modification (and recompiles
+    the graph module) only when at least one value was actually replaced.
+    """
+
+    def __init__(self):
+        super(ReplaceInfValues, self).__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        modified = False
+        for buf_name, tensor in graph_module.named_buffers():
+            if not tensor.is_floating_point():
+                continue
+            pos_inf = tensor == float("inf")
+            neg_inf = tensor == float("-inf")
+            if not (bool(pos_inf.any()) or bool(neg_inf.any())):
+                # No inf in this buffer: leave it alone and do not flag a change.
+                continue
+            modified = True
+            # 255 here is mainly for attention_mask in Llama for reasonable quant scale
+            tensor[pos_inf] = 255
+            tensor[neg_inf] = -255
+            setattr(graph_module, buf_name, tensor)
+
+        for node in graph_module.graph.nodes:
+            arg_list = list(node.args)
+            changed = False
+            for index, arg in enumerate(arg_list):
+                if arg == float("-inf"):
+                    arg_list[index] = -255
+                    changed = True
+                elif arg == float("inf"):
+                    arg_list[index] = 255
+                    changed = True
+            if changed:
+                # Only touch the node when one of its arguments was replaced.
+                modified = True
+                node.args = tuple(arg_list)
+
+        if modified:
+            graph_module.recompile()
+        return PassResult(graph_module, modified)
Original file line number	Diff line number	Diff line change
`@@ -608,7 +608,7 @@ endif()`
`608`	`608`	`# any backends.`
`609`	`609`	`#`
`610`	`610`	`add_library(executorch ${_executorch__srcs})`
`611`		`-target_link_libraries(executorch PUBLIC executorch_core)`
	`611`	`+target_link_libraries(executorch PRIVATE executorch_core)`
`612`	`612`	`target_include_directories(executorch PUBLIC ${_common_include_directories})`
`613`	`613`	`target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)`
`614`	`614`	`target_compile_options(executorch PUBLIC ${_common_compile_options})`
Original file line number	Diff line number	Diff line change
`@@ -11,5 +11,6 @@ python_library(`
`11`	`11`	`"//executorch/backends/xnnpack/_passes:xnnpack_passes",`
`12`	`12`	`"//executorch/exir:lib",`
`13`	`13`	`"//executorch/backends/transforms:utils",`
	`14`	`+ "//executorch/backends/transforms:decompose_sdpa",`
`14`	`15`	`],`
`15`	`16`	`)`