diff --git a/.ci/scripts/test_ane_static_llama.sh b/.ci/scripts/test_ane_static_llama.sh
new file mode 100644
index 00000000000..c83c522d629
--- /dev/null
+++ b/.ci/scripts/test_ane_static_llama.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# CI smoke test: runs the Core ML llama export script on the stories110M
+# checkpoint (fp16, c4w quantization) and writes model.pte.
+# Honors PYTHON_EXECUTABLE; falls back to python3 when it is unset or empty.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+# Under `set -e` this aborts early when the interpreter cannot be resolved.
+command -v "${PYTHON_EXECUTABLE}"
+
+pushd "${EXECUTORCH_ROOT}/examples/apple/coreml/llama"
+
+# Download stories llama110m artifacts (helper presumably comes from utils.sh)
+download_stories_model_artifacts
+
+# Run the export with the interpreter checked above, not whatever `python` is.
+"${PYTHON_EXECUTABLE}" export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w
+
+popd
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 410e95d9a84..c003f050ba0 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -229,6 +229,28 @@ jobs:
         # see if we can import the module successfully
         ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
 
+  test-static-llama-ane:
+    name: test-static-llama-ane
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Install requirements
+        sh install_requirements.sh
+        sh backends/apple/coreml/scripts/install_requirements.sh
+        python install_executorch.py --pybind coreml
+        sh examples/models/llama/install_requirements.sh
+
+        # Test ANE llama (the script relies on bash features, so run it with bash)
+        bash .ci/scripts/test_ane_static_llama.sh
+
   test-llama-runner-macos:
     name: test-llama-runner-mac
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
diff --git a/examples/apple/coreml/llama/export.py b/examples/apple/coreml/llama/export.py
index c0f60529895..f440dc878d4 100644
--- a/examples/apple/coreml/llama/export.py
+++ b/examples/apple/coreml/llama/export.py
@@ -203,6 +203,7 @@ def main() -> None:
             torch.ops.aten.scaled_dot_product_attention.default,
             # preserve norm op for numerical stability
             torch.ops.aten.linalg_vector_norm.default,
+            torch.ops.aten.reciprocal.default,
         ],
         compile_config=EdgeCompileConfig(
             _check_ir_validity=False,
diff --git a/examples/apple/coreml/llama/llama_transformer.py b/examples/apple/coreml/llama/llama_transformer.py
index 2ce4c1d2b5b..3c371da4c00 100644
--- a/examples/apple/coreml/llama/llama_transformer.py
+++ b/examples/apple/coreml/llama/llama_transformer.py
@@ -134,8 +134,10 @@ def _norm(self, x):
         # We have yet to do large scale evaluations on the numeric stability of this solution, but note that
         # it appears better than what exists currently (removing FP32 casts and using FP16)
         rms_norm_eps0 = (
-            x * torch.sqrt(torch.tensor(self.dim, dtype=x.dtype))
-        ) / torch.linalg.vector_norm(x, dim=-1, keepdim=True)
+            x
+            * torch.sqrt(torch.tensor(self.dim, dtype=x.dtype))
+            * torch.reciprocal(torch.linalg.vector_norm(x, dim=-1, keepdim=True))
+        )
         return rms_norm_eps0
 
     def forward(self, x):