1414import operator
1515from collections import deque
1616from numbers import Number
17- from typing import cast , Sequence
17+ from typing import Any , Callable , cast
1818
1919# Import these for the cadence function signatures.
2020import executorch .backends .cadence .aot .ops_registrations # noqa: F401
@@ -881,9 +881,10 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
881881
882882
883883@register_cadence_pass (CadencePassAttribute (opt_level = 1 ))
884- class FuseTransposeOpPairsPass (FuseOpPairsAcrossBranchesPass ):
884+ class FuseTransposeOrPermuteOpPairsPass (FuseOpPairsAcrossBranchesPass ):
885885 """
886- Fuse transpose op pairs to a single view op.
886+ Fuse transpose or permute op pairs to a single view op.
887+ (transpose or permutation) -> (quant or dequant) -> (transpose or permutation)
887888 """
888889
889890 # A list of ops that can be bypassed when looking for a
@@ -907,42 +908,17 @@ def can_fuse_for_chain(
907908 if not super ().can_fuse_for_chain (producer , consumer , consumer_op_packets ):
908909 return False
909910
910- def get_dims (node : torch .fx .Node ) -> tuple [int , int ]:
911- def canonicalize (dim : int ) -> int :
912- if dim < 0 :
913- dim += len (node .meta ["val" ].shape )
914- return dim
915-
916- return tuple (canonicalize (cast (int , d )) for d in node .args [1 :3 ])
917-
918- def is_equivalent (
919- shape : Sequence [int ],
920- transpose0 : tuple [int , int ],
921- transpose1 : tuple [int , int ],
922- ) -> bool :
923- def permute_order (
924- order : Sequence [int ], dims : tuple [int , int ]
925- ) -> Sequence [int ]:
926- new_order = list (order )
927- new_order [dims [0 ]], new_order [dims [1 ]] = (
928- new_order [dims [1 ]],
929- new_order [dims [0 ]],
930- )
931- return new_order
932-
933- order = permute_order (range (len (shape )), transpose0 )
934- order = permute_order (order , transpose1 )
935-
936- non_unit_dims = [dim for dim in range (len (shape )) if shape [dim ] != 1 ]
937- non_unit_dims_permuted = [dim for dim in order if shape [dim ] != 1 ]
938-
939- return non_unit_dims == non_unit_dims_permuted
940-
941- return is_equivalent (
942- cast (torch .fx .Node , producer .args [0 ]).meta ["val" ].shape ,
943- get_dims (producer ),
944- get_dims (consumer ),
945- )
911+ # check that permute2(permute1(identity)) == identity
912+ input_shape = cast (torch .fx .Node , producer .args [0 ]).meta ["val" ].shape
913+ ident_dims = list (range (len (input_shape )))
914+ # this mapping lets us handle both transposes and permutations uniformly
915+ f : dict [Any , Callable ] = {
916+ exir_ops .edge .aten .transpose_copy .int : get_transposed_dims ,
917+ exir_ops .edge .aten .permute_copy .default : get_permuted_dims ,
918+ }
919+ in_dims = f [producer .target ](producer , ident_dims )
920+ out_dims = f [consumer .target ](consumer , in_dims )
921+ return out_dims == ident_dims
946922
947923 def get_fused_node (
948924 self ,
@@ -960,11 +936,17 @@ def get_fused_node(
960936 return view
961937
962938 def call (self , graph_module : torch .fx .GraphModule ) -> PassResult :
963- # Remove any dequantize op that has only quantize ops as its users .
939+ # Remove transpose/permutation op pairs that cancel each other.
964940 self .find_and_fuse (
965941 graph_module ,
966- producer_op_packets = {exir_ops .edge .aten .transpose_copy },
967- consumer_op_packets = {exir_ops .edge .aten .transpose_copy },
942+ producer_op_packets = {
943+ exir_ops .edge .aten .transpose_copy ,
944+ exir_ops .edge .aten .permute_copy ,
945+ },
946+ consumer_op_packets = {
947+ exir_ops .edge .aten .transpose_copy ,
948+ exir_ops .edge .aten .permute_copy ,
949+ },
968950 bypass_ops = self .bypass_ops ,
969951 )
970952 result = super ().call (graph_module )
@@ -1028,5 +1010,5 @@ class CadenceFuseOpsInGraph:
10281010 FuseQuantDequantToRequantizePass ,
10291011 FuseMulIntoDequantPass ,
10301012 FuseFullThenReshapePass ,
1031- FuseTransposeOpPairsPass ,
1013+ FuseTransposeOrPermuteOpPairsPass ,
10321014 ]
0 commit comments