Merge branch 'main' into fix_logging

metascroy · web-flow · commit 6c1f90d2a110 · 2025-04-14T17:05:24.000-07:00
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
@@ -2110,6 +2110,102 @@ def call_operator(
         return super().call_operator(op, args, kwargs, meta)
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=2))
+class ReplaceGeluWithApproximateGeluPass(ExportPass):
+    """
+    Replace the gelu op with an approximate gelu op. The approximate gelu op
+    is more efficient on DSP backends.
+    """
+
+    def call_operator(
+        self,
+        op,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op not in {
+            exir_ops.edge.aten.gelu.default,
+        }:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi))
+        # as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)))
+
+        # Get 0.5 * x
+        half = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (args[0], 0.5),
+            {},
+            meta,
+        )
+
+        scaled = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (args[0], 0.044715),
+            {},
+            meta,
+        )
+
+        # Get 0.044715 * x^2 (note that we use mul.Tensor twice instead of pow.Tensor because
+        # it is much more efficient on DSP backends)
+        scaled_square = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (scaled, args[0]),
+            {},
+            meta,
+        )
+
+        # Get 0.044715 * x^3
+        scaled_cubed = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (scaled_square, args[0]),
+            {},
+            meta,
+        )
+
+        # Get x + 0.044715 * x^3
+        inner_sum = super().call_operator(
+            exir_ops.edge.aten.add.Tensor,
+            (scaled_cubed, args[0]),
+            {},
+            meta,
+        )
+
+        # Get 0.7978845608028654 * ( x + 0.044715 * x^3)
+        scaled_sum = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (inner_sum, 0.7978845608028654),
+            {},
+            meta,
+        )
+
+        # Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))
+        tanh = super().call_operator(
+            exir_ops.edge.aten.tanh.default,
+            (scaled_sum,),
+            {},
+            meta,
+        )
+
+        # Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3))
+        # TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.)
+        outer_sum = super().call_operator(
+            exir_ops.edge.aten.add.Tensor,
+            (tanh, 1.0),
+            {},
+            meta,
+        )
+
+        # Return the final result
+        return super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (half, outer_sum),
+            {},
+            meta,
+        )
+
+
 # This class encapsulates all the functions that replace/switch one op in the
 # graph with another.
 class CadenceReplaceOpsInGraph:
@@ -2149,4 +2245,5 @@ class CadenceReplaceOpsInGraph:
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
         ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
         ReplaceWhereWithFullArgsWithWhereScalar,
+        # ReplaceGeluWithApproximateGeluPass,
     ]
diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py
@@ -29,6 +29,7 @@
     ReplaceConvWithIm2RowAndLinear,
     ReplaceEmptyTensorsWithFullPass,
     ReplaceFunctionallyEquivalentOpTargets,
+    ReplaceGeluWithApproximateGeluPass,
     ReplaceIm2RowWithViewPass,
     ReplaceLinearWithFullyConnectedOpPass,
     ReplaceMMWithAddMMPass,
@@ -1301,6 +1302,41 @@ def forward(self, cond: torch.Tensor):
             1,
         )
 
+    def test_replace_aten_gelu_with_approximate_gelu(self):
+        class Gelu(torch.nn.Module):
+            def forward(self, input):
+                return torch.nn.functional.gelu(input)
+
+        inputs = torch.randn(2, 1, 64)
+
+        graph_module = export_to_edge(Gelu(), (inputs,)).exported_program().graph_module
+
+        p = ReplaceGeluWithApproximateGeluPass()
+        graph_after_passes = cast(PassResult, p(graph_module)).graph_module
+
+        # Assert that aten.gelu op was decomposed
+        self.assertEqual(
+            count_node(
+                graph_after_passes,
+                exir_ops.edge.aten.gelu.default,
+            ),
+            0,
+        )
+
+        # The decomposition should have one tanh, 2 add and 6 mul
+        self.assertEqual(
+            count_node(graph_after_passes, exir_ops.edge.aten.tanh.default),
+            1,
+        )
+        self.assertEqual(
+            count_node(graph_after_passes, exir_ops.edge.aten.add.Tensor),
+            2,
+        )
+        self.assertEqual(
+            count_node(graph_after_passes, exir_ops.edge.aten.mul.Tensor),
+            6,
+        )
+
 
 class TestReplaceIm2rowWithViewPass(unittest.TestCase):
     def test_no_replacement_for_conv(self):
diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md
@@ -131,6 +131,6 @@ create an issue on [github](https://www.github.com/pytorch/executorch/issues).
 
 
 ## See Also
-For more information about the XNNPACK Delegate, please check out the following resources:
-- [ExecuTorch XNNPACK Delegate](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html)
-- [Building and Running ExecuTorch with XNNPACK Backend](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html)
+For more information about the XNNPACK Backend, please check out the following resources:
+- [XNNPACK Backend](https://pytorch.org/executorch/main/backends-xnnpack.html)
+- [XNNPACK Backend Internals](https://pytorch.org/executorch/main/backend-delegates-xnnpack-reference.html)
diff --git a/docs/source/getting-started.md b/docs/source/getting-started.md
@@ -43,7 +43,8 @@ ExecuTorch provides hardware acceleration for a wide variety of hardware. The mo
 For mobile use cases, consider using XNNPACK for Android and Core ML or XNNPACK for iOS as a first step. See [Hardware Backends](backends-overview.md) for more information.
 
 ### Exporting
-Exporting is done using Python APIs. ExecuTorch provides a high degree of customization during the export process, but the typical flow is as follows. This example uses the MobileNet V2 image classification model implementation in torchvision, but the process supports any [export-compliant](https://pytorch.org/docs/stable/export.html) PyTorch model.
+Exporting is done using Python APIs. ExecuTorch provides a high degree of customization during the export process, but the typical flow is as follows. This example uses the MobileNet V2 image classification model implementation in torchvision, but the process supports any [export-compliant](https://pytorch.org/docs/stable/export.html) PyTorch model. If you work with Hugging Face models,
+you can find a list of supported models in the [*huggingface/optimum-executorch*](https://github.com/huggingface/optimum-executorch) repo.
 
 ```python
 import torch
@@ -101,6 +102,8 @@ print(torch.allclose(output[0], eager_reference_output, rtol=1e-3, atol=1e-5))
 
 For complete examples of exporting and running the model, please refer to our [examples GitHub repository](https://github.com/pytorch-labs/executorch-examples/tree/main/mv2/python).
 
+Additionally, if you work with Hugging Face models, the [*huggingface/optimum-executorch*](https://github.com/huggingface/optimum-executorch) library simplifies running these models end-to-end with ExecuTorch, using familiar Hugging Face APIs. Visit the repository for specific examples and supported models.
+
 <hr/>
 
 ## Running on Device
diff --git a/docs/source/index.md b/docs/source/index.md
@@ -43,6 +43,7 @@ ExecuTorch provides support for:
 #### Examples
 - [Android Demo Apps](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
 - [iOS Demo Apps](demo-apps-ios.md)
+- [Hugging Face Models](https://github.com/huggingface/optimum-executorch/blob/main/README.md)
 #### Backends
 - [Overview](backends-overview)
 - [XNNPACK](backends-xnnpack)
diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md
@@ -58,9 +58,20 @@ You can also directly specify an AAR file in the app. We upload pre-built AAR to
 
 ### Snapshots from main branch
 
-| Date | AAR | SHASUMS |
-| ------- | --- | ------- |
-| 2025-02-27 | [executorch.aar](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-20250227/executorch.aar) | [executorch.aar.sha256sums](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-20250227/executorch.aar.sha256sums) |
+Starting from 2025-04-12, you can download nightly `main` branch snapshots:
+* `executorch.aar`: `https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-YYYYMMDD/executorch.aar`
+* `executorch.aar.sha256sums`: `https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-YYYYMMDD/executorch.aar.sha256sums`
+* Replace `YYYYMMDD` with the actual date you want to use.
+* The AAR file is generated by [this workflow](https://github.com/pytorch/executorch/blob/c66b37d010c88a113560693b14dc6bd112593c11/.github/workflows/android-release-artifacts.yml#L14-L15).
+
+For example:
+
+```sh
+curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250412/executorch.aar
+curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250412/executorch.aar.sha256sums
+```
+
+We aim to make every daily snapshot available and usable. However, for best stability, please use releases, not snapshots.
 
 ## Using AAR file
 
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
@@ -85,7 +85,7 @@ Link your binary with the ExecuTorch runtime and any backends or kernels used by
 
 Note: To access logs, link against the Debug build of the ExecuTorch runtime, i.e., the executorch_debug framework. For optimal performance, always link against the Release version of the deliverables (those without the _debug suffix), which have all logging overhead removed.
 
-For more details integrating and Running ExecuTorch on Apple Platforms, checkout this [link](https://pytorch.org/executorch/using-executorch-ios.html).
+For more details on integrating and running ExecuTorch on Apple platforms, check out this [link](https://pytorch.org/executorch/main/using-executorch-ios.html).
 
 <p align="center">
 <img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app_swift_pm.png" alt="iOS LLaMA App Swift PM" style="width:600px">
diff --git a/runtime/core/exec_aten/testing_util/tensor_factory.h b/runtime/core/exec_aten/testing_util/tensor_factory.h
@@ -133,7 +133,7 @@ inline bool check_dim_order(
   size_t gauss_sum = 0;
   std::vector<int> count(dim_order.size(), 0);
   for (int i = 0; i < dim_order.size(); i++) {
-    if (dim_order[i] < 0 || dim_order[i] >= sizes.size()) {
+    if (dim_order[i] >= sizes.size()) {
       return false;
     }
     gauss_sum += static_cast<size_t>(dim_order[i]) + 1;
diff --git a/test/utils/DeathTest.h b/test/utils/DeathTest.h
@@ -15,6 +15,10 @@
 
 #include <gtest/gtest.h>
 
+#ifndef ET_BUILD_MODE_COV
+#define ET_BUILD_MODE_COV 0
+#endif // ET_BUILD_MODE_COV
+
 #if ET_BUILD_MODE_COV
 
 /**

Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ inline bool check_dim_order(`
`133`	`133`	`size_t gauss_sum = 0;`
`134`	`134`	`std::vector<int> count(dim_order.size(), 0);`
`135`	`135`	`for (int i = 0; i < dim_order.size(); i++) {`
`136`		`- if (dim_order[i] < 0 \|\| dim_order[i] >= sizes.size()) {`
	`136`	`+ if (dim_order[i] >= sizes.size()) {`
`137`	`137`	`return false;`
`138`	`138`	`}`
`139`	`139`	`gauss_sum += static_cast<size_t>(dim_order[i]) + 1;`