pytorch
diff --git a/.ci/scripts/build_android_instrumentation.sh
Lines changed: 0 additions & 21 deletions b/.ci/scripts/build_android_instrumentation.sh
Lines changed: 0 additions & 21 deletions
diff --git a/.ci/scripts/test_llava.sh
Lines changed: 1 addition & 1 deletion b/.ci/scripts/test_llava.sh
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/_android.yml
Lines changed: 3 additions & 2 deletions b/.github/workflows/_android.yml
Lines changed: 3 additions & 2 deletions
diff --git a/.github/workflows/android-perf.yml
Lines changed: 1 addition & 1 deletion b/.github/workflows/android-perf.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/android-release-artifacts.yml
Lines changed: 12 additions & 4 deletions b/.github/workflows/android-release-artifacts.yml
Lines changed: 12 additions & 4 deletions
diff --git a/.mypy.ini
Lines changed: 3 additions & 0 deletions b/.mypy.ini
Lines changed: 3 additions & 0 deletions
diff --git a/CONTRIBUTING.md
Lines changed: 6 additions & 13 deletions b/CONTRIBUTING.md
Lines changed: 6 additions & 13 deletions
diff --git a/backends/apple/coreml/README.md
Lines changed: 1 addition & 106 deletions b/backends/apple/coreml/README.md
Lines changed: 1 addition & 106 deletions
diff --git a/backends/arm/_passes/__init__.py
Lines changed: 2 additions & 0 deletions b/backends/arm/_passes/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py
Lines changed: 15 additions & 0 deletions b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 15 additions & 0 deletions
diff --git a/backends/arm/_passes/cast_int64_pass.py
Lines changed: 0 additions & 1 deletion b/backends/arm/_passes/cast_int64_pass.py
Lines changed: 0 additions & 1 deletion
@@ -154,7 +154,7 @@ run_and_verify() {
         EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various"
     else
         # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
-        EXPECTED_PREFIX="ASSISTANT:"
+        EXPECTED_PREFIX="ASSISTANT: image"
     fi
     if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
         echo "Expected result prefix: ${EXPECTED_PREFIX}"
 
@@ -14,7 +14,7 @@ jobs:
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-clang12-android
-      submodules: 'true'
+      submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       upload-artifact: android-apps
@@ -36,7 +36,8 @@ jobs:
         cp ${BUILD_AAR_DIR}/executorch.aar $ARTIFACTS_DIR_NAME
 
         mkdir -p ${ARTIFACTS_DIR_NAME}/library_test_dir
-        bash .ci/scripts/build_android_instrumentation.sh
+        bash extension/android/executorch_android/android_test_setup.sh
+        (cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest)
         cp extension/android/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir"
 
         mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
 
@@ -345,7 +345,7 @@ jobs:
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-clang12-android
-      submodules: 'true'
+      submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       upload-artifact: android-apps
 
@@ -11,6 +11,8 @@ on:
         description: Upload the AAR to maven staging repository
         required: false
         type: boolean
+  schedule:
+    - cron: 0 10 * * *
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -26,6 +28,10 @@ jobs:
         shell: bash
         run: |
           VERSION="${{ inputs.version }}"
+          if [ -z "$VERSION" ]; then
+            echo "No version name specified. Will create a snapshot AAR"
+            exit 0
+          fi
           if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" | grep "200 OK"; then
             echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar"
             echo "Will skip build/upload"
@@ -45,7 +51,7 @@ jobs:
       secrets-env: EXECUTORCH_MAVEN_SIGNING_KEYID EXECUTORCH_MAVEN_SIGNING_PASSWORD EXECUTORCH_MAVEN_CENTRAL_PASSWORD EXECUTORCH_MAVEN_CENTRAL_USERNAME EXECUTORCH_MAVEN_SIGNING_GPG_KEY_CONTENTS
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-clang12-android
-      submodules: 'true'
+      submodules: 'recursive'
       ref: ${{ github.sha }}
       timeout: 90
       upload-artifact: android-apps
@@ -107,6 +113,8 @@ jobs:
           pip install awscli==1.32.18
           AWS_CMD="aws s3 cp"
           VERSION="${{ inputs.version }}"
-          VERSION_NAME="${VERSION:-temp_snapshot}"
-          ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar --acl public-read
-          ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar.sha256sums --acl public-read
+          if [ -z "$VERSION" ]; then
+            VERSION="snapshot-$(date +"%Y%m%d")"
+          fi
+          ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}/executorch.aar --acl public-read
+          ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}/executorch.aar.sha256sums --acl public-read
@@ -80,6 +80,9 @@ ignore_missing_imports = True
 [mypy-serializer.*]
 ignore_missing_imports = True
 
+[mypy-tosa_tools.*]
+ignore_missing_imports = True
+
 [mypy-setuptools.*]
 ignore_missing_imports = True
 
 
@@ -58,7 +58,7 @@ executorch
 │   ├── <a href="exir/verification">verification</a> - IR verification.
 ├── <a href="extension">extension</a> - Extensions built on top of the runtime.
 │   ├── <a href="extension/android">android</a> - ExecuTorch wrappers for Android apps. Please refer to the <a href="docs/source/using-executorch-android.md">Android documentation</a> and <a href="https://pytorch.org/executorch/main/javadoc/">Javadoc</a> for more information.
-│   ├── <a href="extension/apple">apple</a> - ExecuTorch wrappers for iOS apps. Please refer to the <a href="docs/source/using-executorch-ios.md">iOS documentation</a> and <a href="https://pytorch.org/executorch/stable/apple-runtime.html">how to integrate into Apple platform</a> for more information.
+│   ├── <a href="extension/apple">apple</a> - ExecuTorch wrappers for iOS apps. Please refer to the <a href="docs/source/using-executorch-ios.md">iOS documentation</a> and <a href="https://pytorch.org/executorch/main/using-executorch-ios.html">how to integrate into Apple platform</a> for more information.
 │   ├── <a href="extension/aten_util">aten_util</a> - Converts to and from PyTorch ATen types.
 │   ├── <a href="extension/data_loader">data_loader</a> - 1st party data loader implementations.
 │   ├── <a href="extension/evalue_util">evalue_util</a> - Helpers for working with EValue objects.
@@ -102,6 +102,8 @@ executorch
 ## Contributing workflow
 We actively welcome your pull requests (PRs).
 
+If you're completely new to open-source projects, GitHub, or ExecuTorch, please see our [New Contributor Guide](./docs/source/new-contributor-guide.md) for a step-by-step walkthrough on making your first contribution. Otherwise, read on.
+
 1. [Claim an issue](#claiming-issues), if present, before starting work. If an
    issue doesn't cover the work you plan to do, consider creating one to provide
    context about it, and to build consensus about the scope and solution.
@@ -407,18 +409,9 @@ for basics.
    - If the reviewers have requests or questions, follow up with them.
    - The goal of the reviewer is to ensure that the code in the `main` branch of
      the repo is consistent, maintainable, and of high quality.
-1. Once the PR has been approved,
-   - If you have the "write permission" in this repo, you can merge it yourself
-     by clicking the "Squash and merge" button once it is green and all CI
-     signals are passing.
-   - If you don't have "write permission" in this repo, the reviewer will take
-     care of the PR. The reviewer may import the PR into Meta's internal system
-     to validate it against internal CI.
-   - If the PR is approved but not merged within 5 business days, please comment
-     on the PR to ask about its status.
-   - Note that if the `main` [CI](#continuous-integration) jobs are broken, we
-     will only merge PRs that fix the broken jobs until all critical jobs are
-     fixed.
+1. Once the PR has been approved and all CI signals are passing (green),
+   you can merge it yourself by clicking the "Squash and merge"
+   button.
 
 &nbsp;
 
 
@@ -1,8 +1,7 @@
 # ExecuTorch Core ML Delegate
 
-
 This subtree contains the Core ML Delegate implementation for ExecuTorch.
-Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices.
+Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the Core ML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends-coreml.md).
 
 ## Layout
 - `compiler/` : Lowers a module to Core ML backend.
@@ -19,110 +18,6 @@ Core ML is an optimized framework for running machine learning models on Apple d
     - `workspace` : Xcode workspace for the runtime.
 - `third-party/`: External dependencies.
 
-## Partition and Delegation
-
-To delegate a Program to the **Core ML** backend, the client must call `to_backend` with the **CoreMLPartitioner**.
-
-```python
-import torch
-import executorch.exir
-
-from executorch.backends.apple.coreml.compiler import CoreMLBackend
-from executorch.backends.apple.coreml.partition import CoreMLPartitioner
-
-class Model(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.sin(x)
-
-source_model = Model()
-example_inputs = (torch.ones(1), )
-
-# Export the source model to Edge IR representation
-aten_program = torch.export.export(source_model, example_inputs)
-edge_program_manager = executorch.exir.to_edge(aten_program)
-
-# Delegate to Core ML backend
-delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner())
-
-# Serialize delegated program
-executorch_program = delegated_program_manager.to_executorch()
-with open("model.pte", "wb") as f:
-    f.write(executorch_program.buffer)
-```
-
-The module will be fully or partially delegated to **Core ML**, depending on whether all or part of ops are supported by the **Core ML** backend. User may force skip certain ops by `CoreMLPartitioner(skip_ops_for_coreml_delegation=...)`
-
-The `to_backend` implementation is a thin wrapper over [coremltools](https://apple.github.io/coremltools/docs-guides/), `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**.
-
-## Quantization
-
-To quantize a Program in a Core ML favored way, the client may utilize **CoreMLQuantizer**.
-
-```python
-import torch
-import executorch.exir
-
-from torch.export import export_for_training
-from torch.ao.quantization.quantize_pt2e import (
-    convert_pt2e,
-    prepare_pt2e,
-    prepare_qat_pt2e,
-)
-
-from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from coremltools.optimize.torch.quantization.quantization_config import (
-    LinearQuantizerConfig,
-    QuantizationScheme,
-)
-
-class Model(torch.nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.conv = torch.nn.Conv2d(
-            in_channels=3, out_channels=16, kernel_size=3, padding=1
-        )
-        self.relu = torch.nn.ReLU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        a = self.conv(x)
-        return self.relu(a)
-
-source_model = Model()
-example_inputs = (torch.randn((1, 3, 256, 256)), )
-
-pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()
-
-quantization_config = LinearQuantizerConfig.from_dict(
-    {
-        "global_config": {
-            "quantization_scheme": QuantizationScheme.symmetric,
-            "activation_dtype": torch.quint8,
-            "weight_dtype": torch.qint8,
-            "weight_per_channel": True,
-        }
-    }
-)
-quantizer = CoreMLQuantizer(quantization_config)
-
-# For post-training quantization, use `prepare_pt2e`
-# For quantization-aware trainin,g use `prepare_qat_pt2e`
-prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer)
-
-prepared_graph(*example_inputs)
-converted_graph = convert_pt2e(prepared_graph)
-```
-
-The `converted_graph` is the quantized torch model, and can be delegated to **Core ML** similarly through **CoreMLPartitioner**
-
-## Runtime
-
-To execute a Core ML delegated program, the application must link to the `coremldelegate` library. Once linked there are no additional steps required, ExecuTorch when running the program would call the Core ML runtime to execute the Core ML delegated part of the program.
-
-Please follow the instructions described in the [Core ML setup](/backends/apple/coreml/setup.md) to link the `coremldelegate` library.
-
 ## Help & Improvements
 If you have problems or questions or have suggestions for ways to make
 implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues).
@@ -20,13 +20,15 @@
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_batchnorm_pass import DecomposeBatchNormPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
+from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_select import DecomposeSelectPass  # noqa
 from .decompose_softmax_pass import DecomposeSoftmaxPass  # noqa
 from .decompose_softmax_unstable_pass import DecomposeSoftmaxUnstablePass  # noqa
+from .decompose_sqrt_pass import DecomposeSqrtPass  # noqa
 from .decompose_var_pass import DecomposeVarPass  # noqa
 from .fold_qdq_with_annotated_qparams_pass import (  # noqa
     FoldAndAnnotateQParamsPass,
 
@@ -25,13 +25,15 @@
     ConvertToClampPass,
     DecomposeBatchNormPass,
     DecomposeDivPass,
+    DecomposeGeluPass,
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
     DecomposeMeanDimPass,
     DecomposeSelectPass,
     DecomposeSoftmaxPass,
     DecomposeSoftmaxUnstablePass,
+    DecomposeSqrtPass,
     DecomposeVarPass,
     FoldAndAnnotateQParamsPass,
     FuseBatchnorm2DPass,
@@ -115,6 +117,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         return self._transform(exported_program.graph_module)
 
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
+        self.add_pass(DecomposeSqrtPass())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
@@ -130,6 +133,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeSoftmaxPass())
+        self.add_pass(DecomposeGeluPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
@@ -162,12 +166,22 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
 
         return self._transform(exported_program.graph_module)
 
+    def _tosa_1_0_int_quantized_pipeline(self, exported_program: ExportedProgram):
+        return self._tosa_080_BI_pipeline(exported_program)
+
+    def _tosa_1_0_fp_pipeline(self, exported_program: ExportedProgram):
+        return self._tosa_080_MI_pipeline(exported_program)
+
     def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
         """Apply passes before transforming program to backend"""
         if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"):
             return self._tosa_080_BI_pipeline(exported_program)
         elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"):
             return self._tosa_080_MI_pipeline(exported_program)
+        elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-1.0+FP"):
+            return self._tosa_1_0_fp_pipeline(exported_program)
+        elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-1.0+INT"):
+            return self._tosa_1_0_int_quantized_pipeline(exported_program)
         else:
             raise NotImplementedError(
                 f"No pass pipeline implemented for {self.tosa_spec=}"
@@ -181,6 +195,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
+        self.add_pass(DecomposeSqrtPass())
 
         if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
             # Numerically stable softmax uses amax which is not supported on Ethos-U55
 
@@ -12,7 +12,6 @@
 from torch._export.utils import is_buffer
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
 
 
 class CastInt64BuffersToInt32Pass(ExportPass):