diff --git a/.github/actions/macos-ci-setup/action.yml b/.github/actions/macos-ci-setup/action.yml new file mode 100644 index 0000000000000..e170ccf50a0ac --- /dev/null +++ b/.github/actions/macos-ci-setup/action.yml @@ -0,0 +1,79 @@ +name: "macOS CI pipeline setup steps" +description: "Common setup steps for macOS CI pipelines" + +inputs: + platform_machine: + required: false + type: string + default: "arm64" + python_version: + required: false + type: string + default: "3.11" + node_version: + required: false + type: string + default: "20.x" + java_version: + required: false + type: string + default: "17" + xcode_version: + required: false + type: string + default: "16" + use_cache: + required: false + type: boolean + default: false + +runs: + using: "composite" + steps: + - name: Use Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python_version }} + + - name: Verify machine architecture + shell: python + run: | + import platform + print(f"Running on {platform.machine()}") + assert platform.machine().lower() == "${{ inputs.platform_machine}}", "This job expects to be run on an ${{ inputs.platform_machine}} machine." + + - name: Use Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ inputs.node_version }} + + - name: Install coreutils and ninja + shell: bash + run: brew install coreutils ninja + + - name: Install Java + uses: actions/setup-java@v4 + with: + distribution: "temurin" + java-version: ${{ inputs.java_version }} + + - name: Use Xcode ${{ inputs.xcode_version }} + shell: bash + run: | + XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ inputs.xcode_version }}.app/Contents/Developer" + sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" + + - name: Export GitHub Actions cache environment variables + if: ${{ inputs.use_cache }} + uses: actions/github-script@v7 + with: + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); + core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); + + - name: Install python dependencies + shell: bash + working-directory: ${{ github.workspace }} + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements-dev.txt diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 86b1cd5ee90e7..e2f45712f6f9f 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -3,12 +3,12 @@ name: "MacOS CI Pipeline" on: push: branches: - - main - - rel-* + - main + - rel-* pull_request: branches: - - main - - rel-* + - main + - rel-* workflow_dispatch: concurrency: @@ -19,182 +19,25 @@ env: python_version: 3.11 jobs: - MacOS_C_API_Packaging_CPU_x86_64: - runs-on: macos-13 - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - submodules: false - - - name: Use Python - uses: actions/setup-python@v5 - with: - python-version: ${{ env.python_version }} - - - name: Use Node.js 20.x - uses: actions/setup-node@v4 - with: - node-version: '20.x' - - - name: Install Java 17 - uses: actions/setup-java@v4 - with: - distribution: 'temurin' - java-version: '17' - architecture: x64 - - - name: Set version number variables for Unix - shell: bash - run: | - # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
- set +x - - _OnnxRuntimeVersion=$(head -1 ${{ github.workspace }}/VERSION_NUMBER) - echo "OnnxRuntimeVersion=$_OnnxRuntimeVersion" - - _OnnxRuntimeGitCommitHash=$(git rev-parse HEAD) - echo "OnnxRuntimeGitCommitHash=$_OnnxRuntimeGitCommitHash" - - _OnnxRuntimeGitCommitHash=$(git rev-parse --short=8 HEAD) - echo "OnnxRuntimeGitCommitHashShort=$_OnnxRuntimeGitCommitHash" - working-directory: ${{ github.workspace }} - - - name: Use Xcode 14.3.1 - shell: bash - run: | - set -e -x - XCODE_DEVELOPER_DIR="/Applications/Xcode_14.3.1.app/Contents/Developer" - sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" - - - name: Setup environment variables - shell: bash - run: | - set -e -x - export PATH=${{ github.workspace }}/installed/bin:$PATH - export ONNX_ML=1 - export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=ON -DONNX_WERROR=OFF" - python3 -m pip install -r '${{ github.workspace }}/tools/ci_build/github/linux/docker/scripts/requirements.txt' - - - name: Export GitHub Actions cache environment variables - uses: actions/github-script@v7 - with: - script: | - core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); - core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - - - name: Configure Build (build.py --update) - shell: bash - run: | - set -e -x - rm -rf ${{ github.workspace }}/Release - python3 ${{ github.workspace }}/tools/ci_build/build.py --update --build_objc --build_wheel --use_xnnpack --build_nodejs --build_java --use_coreml --use_webgpu --build_dir ${{ github.workspace }} --skip_submodule_sync --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --build_shared_lib --config Release --use_vcpkg --use_vcpkg_ms_internal_asset_cache - - - name: Build (build.py --build) - shell: bash - run: | - set -e -x - python3 ${{ github.workspace }}/tools/ci_build/build.py --build --build_objc --build_wheel --use_xnnpack --build_nodejs --build_java --use_coreml --use_webgpu --build_dir ${{ github.workspace }} --skip_submodule_sync --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --build_shared_lib --config Release --use_vcpkg --use_vcpkg_ms_internal_asset_cache - - - name: Install - shell: bash - run: | - set -e -x - cd ${{ github.workspace }}/Release - make install DESTDIR=${{ github.workspace }}/installed - - - name: Running Tests (build.py --test) - shell: bash - run: | - set -e -x - python3 ${{ github.workspace }}/tools/ci_build/build.py --test --build_objc --build_wheel --use_xnnpack --build_nodejs --build_java --use_coreml --use_webgpu --build_dir ${{ github.workspace }} --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --config Release --use_vcpkg --use_vcpkg_ms_internal_asset_cache - timeout-minutes: 300 - env: - MACOSX_DEPLOYMENT_TARGET: '13.3' - ALLOW_RELEASED_ONNX_OPSET_ONLY: '0' - - ARM64-Xcode16: - runs-on: macos-15 - - env: - xcode_version: 16 - - timeout-minutes: 60 - - steps: - - uses: actions/setup-python@v5 - with: - python-version: ${{ env.python_version }} - - - name: Verify ARM64 machine - shell: python - run: | - import platform - assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine." 
+ cpu: + uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml - - name: Use Xcode ${{ env.xcode_version }} - shell: bash - run: | - XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer" - sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" + coreml: + uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml + with: + use_coreml: true - - uses: actions/checkout@v4 + xnnpack: + uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml + with: + use_xnnpack: true - - name: Build and test - shell: bash - run: | - python ./tools/ci_build/build.py \ - --build_dir ./build \ - --update \ - --build --parallel \ - --test \ - --build_shared_lib \ - --build_objc \ - --use_coreml \ - --use_xnnpack \ - --use_binskim_compliant_compile_flags + webgpu: + uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml + with: + use_webgpu: true - ARM64-Xcode16-webgpu: - runs-on: macos-15 - - env: - xcode_version: 16 - - timeout-minutes: 60 - - steps: - - uses: actions/setup-python@v5 - with: - python-version: ${{ env.python_version }} - - - name: Verify ARM64 machine - shell: python - run: | - import platform - assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine." - - - name: Use Xcode ${{ env.xcode_version }} - shell: bash - run: | - XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer" - sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" - - - uses: actions/checkout@v4 - - - name: Build and test - shell: bash - run: | - python ./tools/ci_build/build.py \ - --build_dir ./build \ - --update \ - --build --parallel \ - --test \ - --build_shared_lib \ - --build_nodejs \ - --use_webgpu \ - --use_binskim_compliant_compile_flags - - ARM64-Xcode16-targeting-iphonesimulator: + iphone_simulator: runs-on: macos-15 env: @@ -207,41 +50,34 @@ jobs: timeout-minutes: 90 steps: - - uses: actions/setup-python@v5 - with: - python-version: ${{ env.python_version }} - - - name: Verify ARM64 machine - shell: python - run: | - import platform - assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine." 
- - - name: Use Xcode ${{ env.xcode_version }} - shell: bash - run: | - XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer" - sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" - - - uses: actions/checkout@v4 - - - name: Build for iphonesimulator ${{ matrix.target_arch }} - shell: bash - run: | - python ./tools/ci_build/build.py \ - --build_dir ./build \ - --update \ - --build --parallel \ - --test \ - --build_apple_framework \ - --use_xcode \ - --use_coreml \ - --use_xnnpack \ - --use_binskim_compliant_compile_flags \ - --ios \ - --apple_deploy_target=15.1 \ - --apple_sysroot=iphonesimulator \ - --osx_arch=${{ matrix.target_arch }} + - name: Checkout code + uses: actions/checkout@v4 + + - name: macOS CI pipeline prepare steps + uses: ./.github/actions/macos-ci-setup + with: + platform_machine: "arm64" + python_version: ${{ env.python_version }} + xcode_version: ${{ env.xcode_version }} + use_cache: false + + - name: Build for iphonesimulator ${{ matrix.target_arch }} + shell: bash + run: | + python ./tools/ci_build/build.py \ + --build_dir ./build \ + --update \ + --build --parallel \ + --test \ + --build_apple_framework \ + --use_xcode \ + --use_coreml \ + --use_xnnpack \ + --use_binskim_compliant_compile_flags \ + --ios \ + --apple_deploy_target=15.1 \ + --apple_sysroot=iphonesimulator \ + --osx_arch=${{ matrix.target_arch }} Objective-C-StaticAnalysis: runs-on: macos-14 @@ -252,45 +88,44 @@ jobs: timeout-minutes: 30 steps: - - uses: actions/setup-python@v5 - with: - python-version: ${{ env.python_version }} - - - name: Use Xcode ${{ env.xcode_version }} - shell: bash - run: | - XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer" - sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" - - - uses: actions/checkout@v4 - - - name: Generate compile_commands.json and ONNX protobuf files - shell: bash - run: | - python ./tools/ci_build/build.py \ - --build_dir ./build \ - --cmake_generator "Unix Makefiles" \ - --config Debug \ - --build_shared_lib \ - --use_coreml \ - --build_objc \ - --enable_training_apis \ - --cmake_extra_defines CMAKE_EXPORT_COMPILE_COMMANDS=ON \ - --use_binskim_compliant_compile_flags \ - --update \ - --build --parallel \ - --target onnx_proto - - - name: Analyze Objective-C/C++ source code - shell: bash - run: | - CLANG_TIDY_CHECKS="-*,clang-analyzer-*" - - "$(brew --prefix llvm@15)/bin/clang-tidy" \ - -p=./build/Debug \ - --checks="${CLANG_TIDY_CHECKS}" \ - --warnings-as-errors="${CLANG_TIDY_CHECKS}" \ - --header-filter="objectivec/include|objectivec|onnxruntime/core" \ - ./objectivec/*.mm \ - ./onnxruntime/core/platform/apple/logging/apple_log_sink.mm \ - ./onnxruntime/core/providers/coreml/model/*.mm + - name: Checkout code + uses: actions/checkout@v4 + + - name: macOS CI pipeline prepare steps + uses: ./.github/actions/macos-ci-setup + with: + platform_machine: "arm64" + python_version: ${{ env.python_version }} + xcode_version: ${{ env.xcode_version }} + use_cache: false + + - name: Generate compile_commands.json and ONNX protobuf files + shell: bash + run: | + python ./tools/ci_build/build.py \ + --build_dir ./build \ + --cmake_generator "Unix Makefiles" \ + --config Debug \ + --build_shared_lib \ + --use_coreml \ + --build_objc \ + --enable_training_apis \ + --cmake_extra_defines CMAKE_EXPORT_COMPILE_COMMANDS=ON \ + --use_binskim_compliant_compile_flags \ + --update \ + --build --parallel \ + --target onnx_proto + + - name: Analyze Objective-C/C++ source code + shell: bash + run: | + 
CLANG_TIDY_CHECKS="-*,clang-analyzer-*" + + "$(brew --prefix llvm@15)/bin/clang-tidy" \ + -p=./build/Debug \ + --checks="${CLANG_TIDY_CHECKS}" \ + --warnings-as-errors="${CLANG_TIDY_CHECKS}" \ + --header-filter="objectivec/include|objectivec|onnxruntime/core" \ + ./objectivec/*.mm \ + ./onnxruntime/core/platform/apple/logging/apple_log_sink.mm \ + ./onnxruntime/core/providers/coreml/model/*.mm diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml new file mode 100644 index 0000000000000..a387541488621 --- /dev/null +++ b/.github/workflows/macos-ci-build-and-test-workflow.yml @@ -0,0 +1,108 @@ +name: "macOS CI Reusable Workflow for build and test" +description: "This is a reusable workflow for macOS CI pipelines to build and test" + +on: + workflow_call: + inputs: + use_webgpu: + required: false + type: boolean + default: false + use_xnnpack: + required: false + type: boolean + default: false + use_coreml: + required: false + type: boolean + default: false + python_version: + required: false + type: string + default: "3.11" + +jobs: + build-and-test: + strategy: + matrix: + platform_machine: ["x86_64", "arm64"] + build_config: ["Debug", "Release"] + include: + - platform_machine: "x86_64" + runs_on: "macos-13" + xcode_version: "14.3.1" + - platform_machine: "arm64" + runs_on: "macos-15" + xcode_version: "16" + + runs-on: ${{ matrix.runs_on }} + env: + build_flags: > + --build_dir ./build + --skip_submodule_sync + --parallel + --use_binskim_compliant_compile_flags + --build_shared_lib + --build_nodejs + --build_objc + --build_java + --build_wheel + ${{ inputs.use_webgpu && '--use_webgpu' || '' }} + ${{ inputs.use_xnnpack && '--use_xnnpack' || '' }} + ${{ inputs.use_coreml && '--use_coreml' || '' }} + --use_vcpkg --use_vcpkg_ms_internal_asset_cache + --config ${{ matrix.build_config }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: macOS CI pipeline prepare steps + uses: ./.github/actions/macos-ci-setup + with: + platform_machine: ${{ matrix.platform_machine }} + python_version: ${{ inputs.python_version }} + xcode_version: ${{ matrix.xcode_version }} + use_cache: true + + - uses: actions/cache@v3 + id: onnx-node-tests-cache + with: + path: ${{ github.workspace }}/js/test/ + key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }} + + - name: Setup environment variables + shell: bash + run: | + set -e -x + export PATH=${{ github.workspace }}/build/installed/bin:$PATH + export ONNX_ML=1 + export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=ON -DONNX_WERROR=OFF" + python -m pip install -r '${{ github.workspace }}/tools/ci_build/github/linux/docker/scripts/requirements.txt' + + - name: Configure Build (build.py --update) + shell: bash + working-directory: ${{ github.workspace }} + run: | + rm -rf ${{ github.workspace }}/build/${{ matrix.build_config }} + python ./tools/ci_build/build.py --update ${{ env.build_flags }} + + - name: Build (build.py --build) + shell: bash + working-directory: ${{ github.workspace }} + run: | + python ./tools/ci_build/build.py --build ${{ env.build_flags }} + + - name: Install + shell: bash + run: | + set -e -x + rm -rf ${{ github.workspace }}/build/installed + cd ${{ github.workspace }}/build/${{ matrix.build_config }} + make install DESTDIR=${{ github.workspace }}/build/installed + + - name: Running Tests (build.py --test) + shell: bash + working-directory: ${{ github.workspace }} + run: | + python ./tools/ci_build/build.py --test ${{ env.build_flags }} 
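For reference, the `build_flags` value defined in the new reusable workflow composes a single `build.py` command line. A rough local equivalent of the Release/arm64 matrix slot when the workflow is called with `use_coreml: true` is sketched below; this is illustrative only (the CI job additionally sets up caching and the vcpkg asset-cache credentials, and runs the update/build/test phases as separate steps):

```bash
# Sketch of the command composed from env.build_flags for the CoreML variant
# (Release config, arm64 runner). Combining --update/--build/--test in one
# invocation is a local convenience; the workflow runs them as separate steps.
python ./tools/ci_build/build.py \
  --update --build --test \
  --build_dir ./build \
  --skip_submodule_sync \
  --parallel \
  --use_binskim_compliant_compile_flags \
  --build_shared_lib \
  --build_nodejs --build_objc --build_java --build_wheel \
  --use_coreml \
  --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
  --config Release
```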
diff --git a/.github/workflows/macos_coreml.yml b/.github/workflows/macos_coreml.yml deleted file mode 100644 index f0acbd54a3fb1..0000000000000 --- a/.github/workflows/macos_coreml.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: "CoreML CI Pipeline" - -on: - push: - branches: [ main, 'rel-*'] - pull_request: - branches: [ main ] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build_with_coreml: - runs-on: macos-13 - strategy: - matrix: - use_coreml: [true, false] - - env: - MACOSX_DEPLOYMENT_TARGET: '13.3' - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install coreutils and ninja - run: brew install coreutils ninja - - - name: Use Xcode 14.3.1 - run: | - XCODE_DEVELOPER_DIR="/Applications/Xcode_14.3.1.app/Contents/Developer" - sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" - - - name: Export GitHub Actions cache environment variables - uses: actions/github-script@v7 - with: - script: | - core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); - core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - - - name: CoreML EP, Build and Test on macOS - run: | - python3 tools/ci_build/build.py \ - --build_dir build \ - --skip_submodule_sync \ - --cmake_generator=Ninja \ - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags \ - --build_shared_lib \ - --config Debug \ - ${{ matrix.use_coreml && '--use_coreml' || '' }} \ No newline at end of file diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index d26ee0a2b766e..a36f0c3142d7d 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -65,6 +65,12 @@ jobs: java-version: '17' architecture: x64 + - uses: actions/cache@v3 + id: onnx-node-tests-cache + with: + path: ${{ github.workspace }}/js/test/ + key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }} + - name: API Documentation Check and generate run: | set ORT_DOXY_SRC=${{ github.workspace }} @@ -132,4 +138,4 @@ jobs: DocUpdateNeeded: false ONNXRUNTIME_TEST_GPU_DEVICE_ID: '0' AZCOPY_AUTO_LOGIN_TYPE: MSI - AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 \ No newline at end of file + AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 diff --git a/.github/workflows/windows_webgpu.yml b/.github/workflows/windows_webgpu.yml index 7b8415c223e0f..908f28ae174d3 100644 --- a/.github/workflows/windows_webgpu.yml +++ b/.github/workflows/windows_webgpu.yml @@ -91,6 +91,12 @@ jobs: shell: cmd working-directory: ${{ github.workspace }} + - uses: actions/cache@v3 + id: onnx-node-tests-cache + with: + path: ${{ github.workspace }}/js/test/ + key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }} + - name: Export GitHub Actions cache environment variables uses: actions/github-script@v7 with: @@ -259,4 +265,4 @@ jobs: - name: Validate C# native delegates run: python tools\ValidateNativeDelegateAttributes.py shell: cmd - working-directory: ${{ github.workspace }}\csharp \ No newline at end of file + working-directory: ${{ github.workspace }}\csharp diff --git a/.github/workflows/windows_x64_debug_build_x64_debug.yml b/.github/workflows/windows_x64_debug_build_x64_debug.yml index b8d1bdf228261..2508bb2079119 100644 --- a/.github/workflows/windows_x64_debug_build_x64_debug.yml +++ b/.github/workflows/windows_x64_debug_build_x64_debug.yml @@ -75,6 +75,12 @@ jobs: run: | nuget restore ${{ github.workspace 
}}\packages.config -PackagesDirectory ${{ github.workspace }}\build\Debug -ConfigFile ${{ github.workspace }}\NuGet.config + - uses: actions/cache@v3 + id: onnx-node-tests-cache + with: + path: ${{ github.workspace }}/js/test/ + key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }} + - name: Export GitHub Actions cache environment variables uses: actions/github-script@v7 with: @@ -126,4 +132,4 @@ jobs: env: OrtPackageId: Microsoft.ML.OnnxRuntime OnnxRuntimeBuildDirectory: ${{ github.workspace }}\build - DOTNET_SKIP_FIRST_TIME_EXPERIENCE: 'true' \ No newline at end of file + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: 'true' diff --git a/.github/workflows/windows_x64_release_build_x64_release.yml b/.github/workflows/windows_x64_release_build_x64_release.yml index c0e8fb24bc615..58a94c2f2208a 100644 --- a/.github/workflows/windows_x64_release_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_build_x64_release.yml @@ -75,6 +75,12 @@ jobs: run: | nuget restore ${{ github.workspace }}\packages.config -PackagesDirectory ${{ github.workspace }}\build\RelWithDebInfo -ConfigFile ${{ github.workspace }}\NuGet.config + - uses: actions/cache@v3 + id: onnx-node-tests-cache + with: + path: ${{ github.workspace }}/js/test/ + key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }} + - name: Export GitHub Actions cache environment variables uses: actions/github-script@v7 with: diff --git a/.github/workflows/windows_x64_release_dnnl_build_x64_release.yml b/.github/workflows/windows_x64_release_dnnl_build_x64_release.yml index 2ac7a4a646eb0..5b45f040c990b 100644 --- a/.github/workflows/windows_x64_release_dnnl_build_x64_release.yml +++ b/.github/workflows/windows_x64_release_dnnl_build_x64_release.yml @@ -75,6 +75,12 @@ jobs: run: | nuget restore ${{ github.workspace }}\packages.config -PackagesDirectory ${{ github.workspace }}\build\RelWithDebInfo -ConfigFile ${{ github.workspace }}\NuGet.config + - uses: actions/cache@v3 + id: onnx-node-tests-cache + with: + path: ${{ github.workspace }}/js/test/ + key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }} + - name: Export GitHub Actions cache environment variables uses: actions/github-script@v7 with: @@ -122,4 +128,4 @@ jobs: env: OrtPackageId: Microsoft.ML.OnnxRuntime OnnxRuntimeBuildDirectory: ${{ github.workspace }}\build - DOTNET_SKIP_FIRST_TIME_EXPERIENCE: 'true' \ No newline at end of file + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: 'true' diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index a449e42f6bf19..7b2bbdd2094d1 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -6080,3 +6080,77 @@ https://dawn.googlesource.com/dawn CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +_____ + +KleidiAI + +https://gitlab.arm.com/kleidi/kleidiai + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
+ +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +Copyright 2024-2025 Arm Limited and/or its affiliates + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d520f4ac9212a..560d1cd423083 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -87,6 +87,7 @@ option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF) +option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF) option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON) option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF) @@ -831,6 +832,20 @@ else() endif() endif() +if (onnxruntime_USE_KLEIDIAI AND NOT MSVC AND ( + (onnxruntime_target_platform STREQUAL "aarch64") OR + (onnxruntime_target_platform STREQUAL "ARM64") OR + (APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64"))) + check_cxx_compiler_flag(-march=armv8.2-a+dotprod HAS_ARM64_DOTPROD) + check_cxx_compiler_flag(-march=armv8.2-a+i8mm HAS_ARM64_I8MM) + if (NOT HAS_ARM64_DOTPROD) + message(FATAL_ERROR "The compiler doesn't support dotprod") + endif() + if (NOT HAS_ARM64_I8MM) + message(FATAL_ERROR "The compiler doesn't support i8mm") + endif() +endif() + #names in this var must match the directory names under onnxruntime/core/providers #ONNXRUNTIME_PROVIDER_NAMES is the list of providers that needs to export additional symbols in the global namespace. #For example CUDA EP exports "OrtSessionOptionsAppendExecutionProvider_CUDA", which is a global function. 
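The new `onnxruntime_USE_KLEIDIAI` option only takes effect on non-MSVC aarch64/ARM64 targets and requires a compiler with dotprod and i8mm support, as enforced by the checks added above. A hedged sketch of enabling it through `build.py`, reusing the existing `--cmake_extra_defines` pass-through seen elsewhere in this change (the exact flag set for a real build will differ):

```bash
# Illustrative only: enable the new MLAS/KleidiAI integration on an arm64 host.
# onnxruntime_USE_KLEIDIAI is the CMake option added above; --cmake_extra_defines
# is the same pass-through mechanism used by other jobs in this PR.
python ./tools/ci_build/build.py \
  --build_dir ./build \
  --config Release \
  --update --build --parallel \
  --cmake_extra_defines onnxruntime_USE_KLEIDIAI=ON
```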
diff --git a/cmake/deps.txt b/cmake/deps.txt index c7db8ef51505d..060dd72d655d3 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.17.0.zip;13a60ac5217c104139ce0fd024f48628e7bcf5bc -# Use the latest commit of 10.8-GA -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/c5ca8912f30e9ad630a0ef565e3d5f4bd5e91563.zip;588b294aaa9e84679ed5815cea1d399210ac98c2 +# Use the latest commit of 10.9-GA +onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/d5dce67db7c2e64b07e055571f5ec06f7f254de2.zip;01114d3b67650857281fa50faa2e412130a63b69 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 @@ -58,4 +58,4 @@ composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/arch directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1 dawn;https://github.com/google/dawn/archive/40a9fa79f76e6c76cca9e2fa69ea07f202f1d2e6.zip;e224563d5ab4a8e53a517b06f721242533bce722 -kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/d15722976120710080ca098fe8ddabf4556cb40f/kleidiai-d15722976120710080ca098fe8ddabf4556cb40f.zip;d6c840d00c3b05aedf06e957ddaece1013d1f40b +kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.4.0.tar.gz;22d3b57b54a61c194ab256ff11b0353a3b220244 diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index e599e4f04ad91..3c526cd61418a 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -67,13 +67,13 @@ if(onnxruntime_target_platform STREQUAL "ARM64EC") link_directories("$ENV{VCINSTALLDIR}/Tools/MSVC/$ENV{VCToolsVersion}/lib/ARM64EC") link_directories("$ENV{VCINSTALLDIR}/Tools/MSVC/$ENV{VCToolsVersion}/ATLMFC/lib/ARM64EC") link_libraries(softintrin.lib) - add_compile_options("/bigobj") + add_compile_options("$<$>:/bigobj>") endif() endif() if(onnxruntime_target_platform STREQUAL "ARM64") if (MSVC) - add_compile_options("/bigobj") + add_compile_options("$<$>:/bigobj>") endif() endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 87387d4f281ed..9b468da44928e 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -128,6 +128,10 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDot.asm ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm ) + + if (onnxruntime_USE_KLEIDIAI) + setup_kleidiai() + endif() else() target_sources(onnxruntime_mlas PRIVATE ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp @@ -256,6 +260,24 @@ function(setup_mlas_source_for_windows) endif() endfunction() +function(setup_kleidiai) + 
target_compile_definitions(onnxruntime_mlas PRIVATE USE_KLEIDIAI) + + # Disable the KleidiAI tests + set(KLEIDIAI_BUILD_TESTS OFF) + + # Fetch KleidiAI sources: + if (NOT TARGET kleidiai) + onnxruntime_fetchcontent_declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai} EXCLUDE_FROM_ALL) + endif() + onnxruntime_fetchcontent_makeavailable(kleidiai) + + target_sources(onnxruntime_mlas PRIVATE + ${MLAS_SRC_DIR}/kai_ukernel_interface.cpp + ) + target_link_libraries(onnxruntime_mlas PRIVATE kleidiai) +endfunction() + if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) file(GLOB_RECURSE mlas_platform_srcs @@ -395,6 +417,9 @@ else() ${MLAS_SRC_DIR}/eltwise_kernel_neon.h ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ) + if (onnxruntime_USE_KLEIDIAI) + setup_kleidiai() + endif() set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod") if (NOT APPLE) diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake index 685e77bc483bd..495ff093326ad 100644 --- a/cmake/onnxruntime_providers_migraphx.cmake +++ b/cmake/onnxruntime_providers_migraphx.cmake @@ -21,8 +21,10 @@ # Add search paths for default rocm installation list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip /opt/rocm $ENV{HIP_PATH}) - # Suppress the warning about the small capitals of the package name - Enable when support to CMake 3.27.0 is used - # cmake_policy(SET CMP0144 NEW) + if(POLICY CMP0144) + # Suppress the warning about the small capitals of the package name + cmake_policy(SET CMP0144 NEW) + endif() if(WIN32 AND NOT HIP_PLATFORM) set(HIP_PLATFORM "amd") diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj index 67addd2731744..4da9b5ffae3e4 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj @@ -50,6 +50,7 @@ + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs index 84377d65d1213..6ab341d75683f 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs @@ -45,8 +45,8 @@ public void Dispose() { String failureMessage = TestContext.CurrentContext.Result.Message; String jsonToSendFailure = - String.Format("browserstack_executor: {\"action\": \"setSessionStatus\", \"arguments\": " + - "{\"status\":\"failed\", \"reason\": {0}}}", + String.Format("browserstack_executor: {{\"action\": \"setSessionStatus\", \"arguments\": " + + "{{\"status\":\"failed\", \"reason\": {0}}}}}", JsonConvert.ToString(failureMessage)); ((IJavaScriptExecutor)driver).ExecuteScript(jsonToSendFailure); diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs index 5db3dc9957d1c..b62c2f052455e 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs @@ -89,7 +89,7 @@ public async Task ClickRunAllTest() await 
Task.Delay(500); } - var (numPassed, numFailed) = GetPassFailCount(); + (int numPassed, int numFailed) = GetPassFailCount(); if (numFailed == 0) { diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/AssertUtils.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/AssertUtils.cs new file mode 100644 index 0000000000000..7d689628ceab7 --- /dev/null +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/AssertUtils.cs @@ -0,0 +1,81 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Xunit; + +namespace Microsoft.ML.OnnxRuntime.Tests +{ + internal static class AssertUtils + { + + /// + /// Check if the action throws the expected exception. If it doesn't, the method passes. If it does, check for + /// the exception type and the expected exception message. More detailed Assert method to be used for unit tests + /// written with XUnit. + /// + /// Type of exception expected to be thrown. + /// Action to be executed or tested. + /// Feedback message if an unexpected exception happens. + /// Expected exception message. If null, the exception message is not + // checked. + public static void IfThrowsCheckException(Action action, string feedbackMessage, string expectedExceptionMessage = null) where T : Exception + { + try + { + action(); + } + catch (T ex) + { + if (expectedExceptionMessage == null) + { + return; + } + else + { + Assert.True(ex.Message.Contains(expectedExceptionMessage), + $"{feedbackMessage}\nExpected exception message to contain '{expectedExceptionMessage}', but got '{ex.Message}'"); + } + } + catch (Exception ex) + { + Assert.Fail($"{feedbackMessage}\nExpected {typeof(T).Name} but got {ex.GetType().Name}. "); + } + } + + + /// + /// Check if the action throws the expected exception. If it doesn't, the method fails with the feedbackMessage. + /// If it does, check for the exception type and the expected exception message. More detailed Assert method to be + /// used for unit tests written with XUnit. + /// + /// Type of exception expected to be thrown. + /// Action to be executed or tested. It is expected that the action will throw. + /// Feedback message if an unexpected exception happens. + /// Expected exception message. If null, the exception message is not + // checked. + public static void AssertThrowsCheckException(Action action, string feedbackMessage, string expectedExceptionMessage = null) where T : Exception + { + try + { + action(); + Assert.Fail($"{feedbackMessage}\nExpected {typeof(T).Name} but no exception was thrown."); + } + catch (T ex) + { + if (expectedExceptionMessage == null) + { + return; + } + else + { + Assert.True(ex.Message.Contains(expectedExceptionMessage), + $"{feedbackMessage}\nExpected exception message to contain '{expectedExceptionMessage}', but got '{ex.Message}'"); + } + } + catch (Exception ex) + { + Assert.Fail($"{feedbackMessage}\nExpected {typeof(T).Name} but got {ex.GetType().Name}. 
"); + } + } + } +} diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs index 17738da515134..0a39d965979ca 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs @@ -93,18 +93,35 @@ public void TestSessionOptions() opt.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_EXTENDED; Assert.Equal(GraphOptimizationLevel.ORT_ENABLE_EXTENDED, opt.GraphOptimizationLevel); - Assert.Throws(() => { opt.GraphOptimizationLevel = (GraphOptimizationLevel)10; }); + AssertUtils.AssertThrowsCheckException( + () => { opt.GraphOptimizationLevel = (GraphOptimizationLevel)10; }, + "Set an invalid Graph Optimization Level."); opt.AddSessionConfigEntry("key", "value"); - var ex = Assert.Throws(() => { opt.AddSessionConfigEntry("", "invalid key"); }); - Assert.Contains("[ErrorCode:InvalidArgument] Config key is empty", ex.Message); + AssertUtils.AssertThrowsCheckException( + () => { opt.AddSessionConfigEntry("", "invalid key"); }, + "Added an invalid config entry.", + "[ErrorCode:InvalidArgument] Config key is empty"); // SessionOptions.RegisterOrtExtensions can be manually tested by referencing the // Microsoft.ML.OnnxRuntime.Extensions nuget package. After that is done, this should not throw. - ex = Assert.Throws(() => { opt.RegisterOrtExtensions(); }); - Assert.Contains("Microsoft.ML.OnnxRuntime.Extensions NuGet package must be referenced", ex.Message); - + AssertUtils.AssertThrowsCheckException( + () => { opt.RegisterOrtExtensions(); }, + "RegisterOrtExtensions should throw if the Extensions package is not referenced", + "Microsoft.ML.OnnxRuntime.Extensions NuGet package must be referenced"); + + // The below tests what happens when various execution providers are added + // to the session options. + + // We can only check what EPs the package was built with for the + // Microsoft.ML.OnnxRuntime.Managed package because the managed package defines + // the C# preprocessor symbols (such as USE_CUDA) for the EPs that it was built with. + + // The Microsoft.ML.OnnxRuntime package will use the appropriate platform bindings + // (ie the native Android bindings) where the C# preprocessor symbols + // identifying the EPs included in the build may not be available, so we use + // IfThrowsCheckException instead of using ifdefs. 
#if USE_CUDA opt.AppendExecutionProvider_CUDA(0); #endif @@ -157,30 +174,25 @@ public void TestSessionOptions() #if USE_TENSORRT opt.AppendExecutionProvider_Tensorrt(0); #endif -#if USE_XNNPACK - opt.AppendExecutionProvider("XNNPACK"); -#else - ex = Assert.Throws(() => { opt.AppendExecutionProvider("XNNPACK"); }); - Assert.Contains("XNNPACK execution provider is not supported in this build", ex.Message); -#endif -#if USE_SNPE - opt.AppendExecutionProvider("SNPE"); -#else - ex = Assert.Throws(() => { opt.AppendExecutionProvider("SNPE"); }); - Assert.Contains("SNPE execution provider is not supported in this build", ex.Message); -#endif -#if USE_QNN - opt.AppendExecutionProvider("QNN"); -#else - ex = Assert.Throws(() => { opt.AppendExecutionProvider("QNN"); }); - Assert.Contains("QNN execution provider is not supported in this build", ex.Message); -#endif -#if USE_COREML - opt.AppendExecutionProvider("CoreML"); -#else - ex = Assert.Throws(() => { opt.AppendExecutionProvider("CoreML"); }); - Assert.Contains("CoreML execution provider is not supported in this build", ex.Message); -#endif + AssertUtils.IfThrowsCheckException( + () => { opt.AppendExecutionProvider("CoreML"); }, + "Appending CoreML EP should have succeeded or thrown an OnnRuntimeException with the expected message. ", + "CoreML execution provider is not supported in this build"); + + AssertUtils.IfThrowsCheckException( + () => { opt.AppendExecutionProvider("XNNPACK"); }, + "Appending XNNPACK EP should have succeeded or thrown an OnnRuntimeException with the expected message. ", + "XNNPACK execution provider is not supported in this build"); + + AssertUtils.IfThrowsCheckException( + () => { opt.AppendExecutionProvider("SNPE"); }, + "Appending SNPE EP should have succeeded or thrown an OnnRuntimeException with the expected message. ", + "SNPE execution provider is not supported in this build"); + + AssertUtils.IfThrowsCheckException( + () => { opt.AppendExecutionProvider("QNN"); }, + "Appending QNN EP should have succeeded or thrown an OnnRuntimeException with the expected message. ", + "QNN execution provider is not supported in this build"); opt.AppendExecutionProvider_CPU(1); } diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 8d256a2088279..60d9e8e747eeb 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -551,7 +551,7 @@ Do not modify directly.* |Sampling|*in* input_ids:**I**
*in* max_length:**I**<br> *in* min_length:**I**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**I**<br> *in* prefix_vocab_mask:**I**<br> *in* attention_mask:**I**<br> *in* presence_mask:**I**<br> *in* seed:**I**<br> *out* sequences:**I**<br> *out* filtered_logits:**T**|1+|**T** = tensor(float)|
 |SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |SkipSimplifiedLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
-|SparseAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* block_row_indices:**M**<br> *in* block_col_indices:**M**<br> *in* total_sequence_length:**M**<br> *in* key_total_sequence_lengths:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br/> **T** = tensor(float)|
+|SparseAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* block_row_indices:**M**<br> *in* block_col_indices:**M**<br> *in* total_sequence_length:**M**<br> *in* key_total_sequence_lengths:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br/> **T** = tensor(float), tensor(float16)|
 |SparseToDenseMatMul|*in* A:**T**<br> *in* B:**T1**<br> *out* Y:**T1**|1+|**T** = sparse_tensor(double), sparse_tensor(float), sparse_tensor(int32), sparse_tensor(int64), sparse_tensor(uint32), sparse_tensor(uint64)<br/> **T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
 |Tokenizer|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(string)|
 |TransposeMatMul|*in* A:**T**<br> *in* B:**T**<br>
*out* Y:**T**|1+|**T** = tensor(float)| diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h index c80b8c0c164b6..f40ea6591059e 100644 --- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h @@ -43,6 +43,9 @@ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_ // Set RPC control latency for QNN HTP backend static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency"; +// Set QNN Lora Config File for apply Lora in QNN context binary +static const char* const kOrtRunOptionsConfigQnnLoraConfig = "qnn.lora_config"; + // Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true. // The value should be an integer. If the value is not set, the default value is 0 and // ORT session only captures one cuda graph before another capture is requested. diff --git a/js/.eslintrc.js b/js/.eslintrc.js index 462e417df1d66..f20adcb0eaa52 100644 --- a/js/.eslintrc.js +++ b/js/.eslintrc.js @@ -185,10 +185,9 @@ module.exports = { '_OrtCreateTensor', '_OrtEndProfiling', '_OrtFree', - '_OrtGetInputName', '_OrtGetInputOutputCount', + '_OrtGetInputOutputMetadata', '_OrtGetLastError', - '_OrtGetOutputName', '_OrtGetTensorData', '_OrtInit', '_OrtReleaseBinding', diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts index e63f9c6c9147f..9b7c1db219188 100644 --- a/js/common/lib/backend.ts +++ b/js/common/lib/backend.ts @@ -23,6 +23,9 @@ interface SessionHandler { readonly inputNames: readonly string[]; readonly outputNames: readonly string[]; + + readonly inputMetadata: readonly InferenceSession.ValueMetadata[]; + readonly outputMetadata: readonly InferenceSession.ValueMetadata[]; } /** diff --git a/js/common/lib/inference-session-impl.ts b/js/common/lib/inference-session-impl.ts index d47ed7a331045..797dba8b94089 100644 --- a/js/common/lib/inference-session-impl.ts +++ b/js/common/lib/inference-session-impl.ts @@ -225,5 +225,13 @@ export class InferenceSession implements InferenceSessionInterface { return this.handler.outputNames; } + get inputMetadata(): readonly InferenceSessionInterface.ValueMetadata[] { + return this.handler.inputMetadata; + } + + get outputMetadata(): readonly InferenceSessionInterface.ValueMetadata[] { + return this.handler.outputMetadata; + } + private handler: InferenceSessionHandler; } diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 26784be41ca7c..330d4121475ec 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -4,6 +4,7 @@ import { InferenceSession as InferenceSessionImpl } from './inference-session-impl.js'; import { OnnxModelOptions } from './onnx-model.js'; import { OnnxValue, OnnxValueDataLocation } from './onnx-value.js'; +import type { Tensor } from './tensor.js'; import { TryGetGlobalType } from './type-helper.js'; /* eslint-disable @typescript-eslint/no-redeclare */ @@ -430,11 +431,53 @@ export declare namespace InferenceSession { // #region value metadata - // eslint-disable-next-line @typescript-eslint/no-empty-interface - interface ValueMetadata { - // TBD + /** + * The common part of the value metadata type for both tensor and non-tensor values. + */ + export interface ValueMetadataBase { + /** + * The name of the specified input or output. 
+ */ + readonly name: string; } + /** + * Represents the metadata of a non-tensor value. + */ + export interface NonTensorValueMetadata extends ValueMetadataBase { + /** + * Get a value indicating whether the value is a tensor. + */ + readonly isTensor: false; + } + + /** + * Represents the metadata of a tensor value. + */ + export interface TensorValueMetadata extends ValueMetadataBase { + /** + * Get a value indicating whether the value is a tensor. + */ + readonly isTensor: true; + /** + * Get the data type of the tensor. + */ + readonly type: Tensor.Type; + /** + * Get the shape of the tensor. + * + * If the shape is not defined, the value will an empty array. Otherwise, it will be an array representing the shape + * of the tensor. Each element in the array can be a number or a string. If the element is a number, it represents + * the corresponding dimension size. If the element is a string, it represents a symbolic dimension. + */ + readonly shape: ReadonlyArray; + } + + /** + * Represents the metadata of a value. + */ + export type ValueMetadata = NonTensorValueMetadata | TensorValueMetadata; + // #endregion } @@ -505,15 +548,15 @@ export interface InferenceSession { */ readonly outputNames: readonly string[]; - // /** - // * Get input metadata of the loaded model. - // */ - // readonly inputMetadata: ReadonlyArray>; + /** + * Get input metadata of the loaded model. + */ + readonly inputMetadata: readonly InferenceSession.ValueMetadata[]; - // /** - // * Get output metadata of the loaded model. - // */ - // readonly outputMetadata: ReadonlyArray>; + /** + * Get output metadata of the loaded model. + */ + readonly outputMetadata: readonly InferenceSession.ValueMetadata[]; // #endregion } diff --git a/js/node/lib/backend.ts b/js/node/lib/backend.ts index 004a3c890a7e4..bea9debcfdd4d 100644 --- a/js/node/lib/backend.ts +++ b/js/node/lib/backend.ts @@ -5,6 +5,32 @@ import { Backend, InferenceSession, InferenceSessionHandler, SessionHandler } fr import { Binding, binding, initOrt } from './binding'; +const dataTypeStrings = [ + undefined, // 0 + 'float32', + 'uint8', + 'int8', + 'uint16', + 'int16', + 'int32', + 'int64', + 'string', + 'bool', + 'float16', + 'float64', + 'uint32', + 'uint64', + undefined, // 14 + undefined, // 15 + undefined, // 16 + undefined, // 17 + undefined, // 18 + undefined, // 19 + undefined, // 20 + 'uint4', + 'int4', +] as const; + class OnnxruntimeSessionHandler implements InferenceSessionHandler { #inferenceSession: Binding.InferenceSession; @@ -17,8 +43,56 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler { } else { this.#inferenceSession.loadModel(pathOrBuffer.buffer, pathOrBuffer.byteOffset, pathOrBuffer.byteLength, options); } - this.inputNames = this.#inferenceSession.inputNames; - this.outputNames = this.#inferenceSession.outputNames; + + // prepare input/output names and metadata + this.inputNames = []; + this.outputNames = []; + this.inputMetadata = []; + this.outputMetadata = []; + + // this function takes raw metadata from binding and returns a tuple of the following 2 items: + // - an array of string representing names + // - an array of converted InferenceSession.ValueMetadata + const fillNamesAndMetadata = ( + rawMetadata: readonly Binding.ValueMetadata[], + ): [names: string[], metadata: InferenceSession.ValueMetadata[]] => { + const names: string[] = []; + const metadata: InferenceSession.ValueMetadata[] = []; + + for (const m of rawMetadata) { + names.push(m.name); + if (!m.isTensor) { + metadata.push({ name: m.name, 
isTensor: false }); + } else { + const type = dataTypeStrings[m.type]; + if (type === undefined) { + throw new Error(`Unsupported data type: ${m.type}`); + } + const shape: Array<number | string> = []; + for (let i = 0; i < m.shape.length; ++i) { + const dim = m.shape[i]; + if (dim === -1) { + shape.push(m.symbolicDimensions[i]); + } else if (dim >= 0) { + shape.push(dim); + } else { + throw new Error(`Invalid dimension: ${dim}`); + } + } + metadata.push({ + name: m.name, + isTensor: m.isTensor, + type, + shape, + }); + } + } + + return [names, metadata]; + }; + + [this.inputNames, this.inputMetadata] = fillNamesAndMetadata(this.#inferenceSession.inputMetadata); + [this.outputNames, this.outputMetadata] = fillNamesAndMetadata(this.#inferenceSession.outputMetadata); } async dispose(): Promise<void> { @@ -28,6 +102,9 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler { readonly inputNames: string[]; readonly outputNames: string[]; + readonly inputMetadata: InferenceSession.ValueMetadata[]; + readonly outputMetadata: InferenceSession.ValueMetadata[]; + startProfiling(): void { // startProfiling is a no-op. // diff --git a/js/node/lib/binding.ts b/js/node/lib/binding.ts index 56203f5a5ca02..ed133734ce66a 100644 --- a/js/node/lib/binding.ts +++ b/js/node/lib/binding.ts @@ -19,12 +19,19 @@ type RunOptions = InferenceSession.RunOptions; * Binding exports a simple synchronized inference session object wrap. */ export declare namespace Binding { + export interface ValueMetadata { + name: string; + isTensor: boolean; + symbolicDimensions: string[]; + shape: number[]; + type: number; + } export interface InferenceSession { loadModel(modelPath: string, options: SessionOptions): void; loadModel(buffer: ArrayBuffer, byteOffset: number, byteLength: number, options: SessionOptions): void; - readonly inputNames: string[]; - readonly outputNames: string[]; + readonly inputMetadata: ValueMetadata[]; + readonly outputMetadata: ValueMetadata[]; run(feeds: FeedsType, fetches: FetchesType, options: RunOptions): ReturnType; diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc index 04ab71dc48ec2..5512b418b5cfb 100644 --- a/js/node/src/inference_session_wrap.cc +++ b/js/node/src/inference_session_wrap.cc @@ -34,8 +34,8 @@ Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) { InstanceMethod("run", &InferenceSessionWrap::Run), InstanceMethod("dispose", &InferenceSessionWrap::Dispose), InstanceMethod("endProfiling", &InferenceSessionWrap::EndProfiling), - InstanceAccessor("inputNames", &InferenceSessionWrap::GetInputNames, nullptr, napi_default, nullptr), - InstanceAccessor("outputNames", &InferenceSessionWrap::GetOutputNames, nullptr, napi_default, nullptr)}); + InstanceAccessor("inputMetadata", &InferenceSessionWrap::GetMetadata, nullptr, napi_default, reinterpret_cast<void*>(true)), + InstanceAccessor("outputMetadata", &InferenceSessionWrap::GetMetadata, nullptr, napi_default, reinterpret_cast<void*>(false))}); wrappedSessionConstructor = Napi::Persistent(func); wrappedSessionConstructor.SuppressDestruct(); @@ -120,27 +120,17 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo& info) { size_t count = session_->GetInputCount(); inputNames_.reserve(count); for (size_t i = 0; i < count; i++) { - auto inp_name = session_->GetInputNameAllocated(i, allocator); - inputNames_.emplace_back(inp_name.get()); - auto typeInfo = session_->GetInputTypeInfo(i); - auto onnxType = typeInfo.GetONNXType(); - inputTypes_.emplace_back(onnxType); -
inputTensorElementDataTypes_.emplace_back(onnxType == ONNX_TYPE_TENSOR - ? typeInfo.GetTensorTypeAndShapeInfo().GetElementType() - : ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED); + auto input_name = session_->GetInputNameAllocated(i, allocator); + inputNames_.emplace_back(input_name.get()); + inputTypes_.push_back(session_->GetInputTypeInfo(i)); } count = session_->GetOutputCount(); outputNames_.reserve(count); for (size_t i = 0; i < count; i++) { - auto out_name = session_->GetOutputNameAllocated(i, allocator); - outputNames_.emplace_back(out_name.get()); - auto typeInfo = session_->GetOutputTypeInfo(i); - auto onnxType = typeInfo.GetONNXType(); - outputTypes_.emplace_back(onnxType); - outputTensorElementDataTypes_.emplace_back(onnxType == ONNX_TYPE_TENSOR - ? typeInfo.GetTensorTypeAndShapeInfo().GetElementType() - : ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED); + auto output_name = session_->GetOutputNameAllocated(i, allocator); + outputNames_.emplace_back(output_name.get()); + outputTypes_.push_back(session_->GetOutputTypeInfo(i)); } // cache preferred output locations @@ -157,22 +147,32 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo& info) { return env.Undefined(); } -Napi::Value InferenceSessionWrap::GetInputNames(const Napi::CallbackInfo& info) { +Napi::Value InferenceSessionWrap::GetMetadata(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized."); ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed."); Napi::EscapableHandleScope scope(env); - return scope.Escape(CreateNapiArrayFrom(env, inputNames_)); -} - -Napi::Value InferenceSessionWrap::GetOutputNames(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized."); - ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed."); - - Napi::EscapableHandleScope scope(env); - return scope.Escape(CreateNapiArrayFrom(env, outputNames_)); + auto& names = info.Data() != nullptr ? inputNames_ : outputNames_; + auto& types = info.Data() != nullptr ? inputTypes_ : outputTypes_; + auto array = Napi::Array::New(env, types.size()); + for (uint32_t i = 0; i < types.size(); i++) { + Napi::Object obj = Napi::Object::New(env); + obj.Set("name", names[i]); + auto& typeInfo = types[i]; + if (typeInfo.GetONNXType() == ONNX_TYPE_TENSOR) { + obj.Set("isTensor", true); + + auto tensorInfo = typeInfo.GetTensorTypeAndShapeInfo(); + obj.Set("type", static_cast<int>(tensorInfo.GetElementType())); + obj.Set("symbolicDimensions", CreateNapiArrayFrom(env, tensorInfo.GetSymbolicDimensions())); + obj.Set("shape", CreateNapiArrayFrom(env, tensorInfo.GetShape())); + } else { + obj.Set("isTensor", false); + } + array.Set(i, Napi::Value::From(env, obj)); + } + return scope.Escape(array); } Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo& info) { diff --git a/js/node/src/inference_session_wrap.h b/js/node/src/inference_session_wrap.h index 0b3dd1178c807..776cdc0d3b51e 100644 --- a/js/node/src/inference_session_wrap.h +++ b/js/node/src/inference_session_wrap.h @@ -45,19 +45,12 @@ class InferenceSessionWrap : public Napi::ObjectWrap<InferenceSessionWrap> { // following functions have to be called after model is loaded. /** - * [sync] get input names. + * [sync] get metadata of the model's inputs or outputs. * @param nothing - * @returns a string array.
+ * @returns an array of objects with keys: name, isTensor, type, symbolicDimensions, shape * @throw nothing */ - Napi::Value GetInputNames(const Napi::CallbackInfo& info); - /** - * [sync] get output names. - * @param nothing - * @returns a string array. - * @throw nothing - */ - Napi::Value GetOutputNames(const Napi::CallbackInfo& info); + Napi::Value GetMetadata(const Napi::CallbackInfo& info); /** * [sync] run the model. @@ -98,11 +91,9 @@ class InferenceSessionWrap : public Napi::ObjectWrap { // input/output metadata std::vector inputNames_; - std::vector inputTypes_; - std::vector inputTensorElementDataTypes_; + std::vector inputTypes_; std::vector outputNames_; - std::vector outputTypes_; - std::vector outputTensorElementDataTypes_; + std::vector outputTypes_; // preferred output locations std::vector preferredOutputLocations_; diff --git a/js/node/test/test-main.ts b/js/node/test/test-main.ts index fc792179d3373..6e7905a24711a 100644 --- a/js/node/test/test-main.ts +++ b/js/node/test/test-main.ts @@ -15,6 +15,7 @@ warmup(); // unittests require('./unittests/lib/index'); require('./unittests/lib/inference-session'); +require('./unittests/lib/model-metadata'); require('./unittests/lib/tensor'); // E2E tests diff --git a/js/node/test/unittests/lib/model-metadata.ts b/js/node/test/unittests/lib/model-metadata.ts new file mode 100644 index 0000000000000..e58c86d39a742 --- /dev/null +++ b/js/node/test/unittests/lib/model-metadata.ts @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import * as assert from 'assert'; +import { InferenceSession } from 'onnxruntime-common'; + +const ONNX_MODEL_TEST_ABS_NO_SHAPE = Uint8Array.from([ + 8, 9, 58, 73, 10, 31, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 8, 111, 117, 116, 112, 117, 116, 95, 48, 26, 3, 65, + 98, 115, 34, 3, 65, 98, 115, 58, 0, 18, 3, 97, 98, 115, 90, 15, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 4, 10, 2, + 8, 1, 98, 16, 10, 8, 111, 117, 116, 112, 117, 116, 95, 48, 18, 4, 10, 2, 8, 1, 66, 4, 10, 0, 16, 21, +]); + +const ONNX_MODEL_TEST_ABS_SYMBOL = Uint8Array.from([ + 8, 9, 58, 105, 10, 31, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 8, 111, 117, 116, 112, 117, 116, 95, 48, 26, 3, 65, + 98, 115, 34, 3, 65, 98, 115, 58, 0, 18, 3, 97, 98, 115, 90, 47, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 36, 10, + 34, 8, 1, 18, 30, 10, 13, 18, 11, 95, 105, 110, 112, 117, 116, 95, 48, 95, 100, 48, 10, 13, 18, 11, 95, 105, 110, 112, + 117, 116, 95, 48, 95, 100, 49, 98, 16, 10, 8, 111, 117, 116, 112, 117, 116, 95, 48, 18, 4, 10, 2, 8, 1, 66, 4, 10, 0, + 16, 21, +]); + +const ONNX_MODEL_TEST_ABS_STATIC = Uint8Array.from([ + 8, 9, 58, 83, 10, 31, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 8, 111, 117, 116, 112, 117, 116, 95, 48, 26, 3, 65, + 98, 115, 34, 3, 65, 98, 115, 58, 0, 18, 3, 97, 98, 115, 90, 25, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 14, 10, + 12, 8, 1, 18, 8, 10, 2, 8, 2, 10, 2, 8, 4, 98, 16, 10, 8, 111, 117, 116, 112, 117, 116, 95, 48, 18, 4, 10, 2, 8, 1, + 66, 4, 10, 0, 16, 21, +]); + +const testModelMetadata = async ( + model: Uint8Array, + expectedInputNames: string[], + expectedOutputNames: string[], + expectedInputMetadata: InferenceSession.ValueMetadata[], + expectedOutputMetadata: InferenceSession.ValueMetadata[], +) => { + const session = await InferenceSession.create(model); + assert.deepStrictEqual(session.inputNames, expectedInputNames); + assert.deepStrictEqual(session.outputNames, expectedOutputNames); + assert.deepStrictEqual(session.inputMetadata, 
expectedInputMetadata); + assert.deepStrictEqual(session.outputMetadata, expectedOutputMetadata); +}; + +describe('#UnitTest# - test model input/output metadata', () => { + it('model input/output with no shape', async () => { + await testModelMetadata( + ONNX_MODEL_TEST_ABS_NO_SHAPE, + ['input_0'], + ['output_0'], + [{ name: 'input_0', isTensor: true, type: 'float32', shape: [] }], + [{ name: 'output_0', isTensor: true, type: 'float32', shape: [] }], + ); + }); + + it('model input/output with symbol shape', async () => { + await testModelMetadata( + ONNX_MODEL_TEST_ABS_SYMBOL, + ['input_0'], + ['output_0'], + [ + { + name: 'input_0', + isTensor: true, + type: 'float32', + shape: ['_input_0_d0', '_input_0_d1'], + }, + ], + [ + { + name: 'output_0', + isTensor: true, + type: 'float32', + shape: ['_input_0_d0', '_input_0_d1'], + }, + ], + ); + }); + + it('model input/output with static shape', async () => { + await testModelMetadata( + ONNX_MODEL_TEST_ABS_STATIC, + ['input_0'], + ['output_0'], + [{ name: 'input_0', isTensor: true, type: 'float32', shape: [2, 4] }], + [{ name: 'output_0', isTensor: true, type: 'float32', shape: [2, 4] }], + ); + }); +}); diff --git a/js/react_native/e2e/src/App.tsx b/js/react_native/e2e/src/App.tsx index 1045bb1c0e2ce..2d8d57b576991 100644 --- a/js/react_native/e2e/src/App.tsx +++ b/js/react_native/e2e/src/App.tsx @@ -44,7 +44,7 @@ export default class App extends React.PureComponent<{}, State> { // test creating session with path console.log('Creating with path'); const pathSession: InferenceSession = await InferenceSession.create(modelPath); - pathSession.release(); + void pathSession.release(); // and with bytes console.log('Creating with bytes'); diff --git a/js/react_native/lib/backend.ts b/js/react_native/lib/backend.ts index 854a7ffd9a6ab..edc28c2e43de9 100644 --- a/js/react_native/lib/backend.ts +++ b/js/react_native/lib/backend.ts @@ -66,6 +66,13 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler { inputNames: string[]; outputNames: string[]; + get inputMetadata(): readonly InferenceSession.ValueMetadata[] { + throw new Error('Getting model metadata is currently not implemented for react-native backend.'); + } + get outputMetadata(): readonly InferenceSession.ValueMetadata[] { + throw new Error('Getting model metadata is currently not implemented for react-native backend.'); + } + constructor(pathOrBuffer: string | Uint8Array) { this.#inferenceSession = binding; this.#pathOrBuffer = pathOrBuffer; diff --git a/js/react_native/package-lock.json b/js/react_native/package-lock.json index 3c23bc1523bd8..0c797ca2a2a16 100644 --- a/js/react_native/package-lock.json +++ b/js/react_native/package-lock.json @@ -9,7 +9,8 @@ "version": "1.22.0", "license": "MIT", "dependencies": { - "buffer": "^6.0.3" + "buffer": "^6.0.3", + "onnxruntime-common": "file:../common" }, "devDependencies": { "@react-native/typescript-config": "0.73.1", @@ -17,7 +18,8 @@ "pod-install": "^0.1.36", "prettier": "^2.8.8", "react": "^18.2.0", - "react-native": "^0.73.11" + "react-native": "^0.73.11", + "react-native-builder-bob": "^0.37.0" }, "engines": { "node": ">=18" @@ -30,7 +32,6 @@ "../common": { "name": "onnxruntime-common", "version": "1.22.0", - "extraneous": true, "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" @@ -76,9 +77,9 @@ } }, "node_modules/@babel/compat-data": { - "version": "7.25.4", - "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.25.4.tgz", - "integrity": 
"sha512-+LGRog6RAsCJrrrg/IO6LGmpphNe5DiK30dGjCoxxeGv49B10/3XYGxPsAwrDlMFcFEvdAUavDT8r9k/hSyQqQ==", + "version": "7.26.8", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.26.8.tgz", + "integrity": "sha512-oH5UPLMWR3L2wEFLnFJ1TZXqHufiTKAiLfqw5zkhS4dKXLJ10yVztfil/twG8EDTA4F/tvVNw9nOl4ZMslB8rQ==", "dev": true, "license": "MIT", "engines": { @@ -174,29 +175,16 @@ "node": ">=6.9.0" } }, - "node_modules/@babel/helper-builder-binary-assignment-operator-visitor": { - "version": "7.16.7", - "dev": true, - "license": "MIT", - "peer": true, - "dependencies": { - "@babel/helper-explode-assignable-expression": "^7.16.7", - "@babel/types": "^7.16.7" - }, - "engines": { - "node": ">=6.9.0" - } - }, "node_modules/@babel/helper-compilation-targets": { - "version": "7.25.2", - "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.25.2.tgz", - "integrity": "sha512-U2U5LsSaZ7TAt3cfaymQ8WHh0pxvdHoEk6HVpaexxixjyEquMh0L0YNJNM6CTGKMXV1iksi0iZkGw4AcFkPaaw==", + "version": "7.26.5", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.26.5.tgz", + "integrity": "sha512-IXuyn5EkouFJscIDuFF5EsiSolseme1s0CZB+QxVugqJLYmKdxI1VfIBOst0SUu4rnk2Z7kqTwmoO1lp3HIfnA==", "dev": true, "license": "MIT", "dependencies": { - "@babel/compat-data": "^7.25.2", - "@babel/helper-validator-option": "^7.24.8", - "browserslist": "^4.23.1", + "@babel/compat-data": "^7.26.5", + "@babel/helper-validator-option": "^7.25.9", + "browserslist": "^4.24.0", "lru-cache": "^5.1.1", "semver": "^6.3.1" }, @@ -244,12 +232,15 @@ } }, "node_modules/@babel/helper-create-regexp-features-plugin": { - "version": "7.17.12", + "version": "7.26.3", + "resolved": "https://registry.npmjs.org/@babel/helper-create-regexp-features-plugin/-/helper-create-regexp-features-plugin-7.26.3.tgz", + "integrity": "sha512-G7ZRb40uUgdKOQqPLjfD12ZmGA54PzqDFUv2BKImnC9QIfGhIHKvVML0oN8IUiDq4iRqpq74ABpvOaerfWdong==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-annotate-as-pure": "^7.16.7", - "regexpu-core": "^5.0.1" + "@babel/helper-annotate-as-pure": "^7.25.9", + "regexpu-core": "^6.2.0", + "semver": "^6.3.1" }, "engines": { "node": ">=6.9.0" @@ -259,31 +250,30 @@ } }, "node_modules/@babel/helper-define-polyfill-provider": { - "version": "0.3.1", + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/@babel/helper-define-polyfill-provider/-/helper-define-polyfill-provider-0.6.3.tgz", + "integrity": "sha512-HK7Bi+Hj6H+VTHA3ZvBis7V/6hu9QuTrnMXNybfUf2iiuU/N97I8VjB+KbhFF8Rld/Lx5MzoCwPCpPjfK+n8Cg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-compilation-targets": "^7.13.0", - "@babel/helper-module-imports": "^7.12.13", - "@babel/helper-plugin-utils": "^7.13.0", - "@babel/traverse": "^7.13.0", + "@babel/helper-compilation-targets": "^7.22.6", + "@babel/helper-plugin-utils": "^7.22.5", "debug": "^4.1.1", "lodash.debounce": "^4.0.8", - "resolve": "^1.14.2", - "semver": "^6.1.2" + "resolve": "^1.14.2" }, "peerDependencies": { - "@babel/core": "^7.4.0-0" + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, "node_modules/@babel/helper-define-polyfill-provider/node_modules/debug": { - "version": "4.3.4", + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz", + "integrity": "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "ms": "2.1.2" + "ms": 
"^2.1.3" }, "engines": { "node": ">=6.0" @@ -295,54 +285,11 @@ } }, "node_modules/@babel/helper-define-polyfill-provider/node_modules/ms": { - "version": "2.1.2", - "dev": true, - "license": "MIT", - "peer": true - }, - "node_modules/@babel/helper-environment-visitor": { - "version": "7.18.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-explode-assignable-expression": { - "version": "7.16.7", - "dev": true, - "license": "MIT", - "peer": true, - "dependencies": { - "@babel/types": "^7.16.7" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-function-name": { - "version": "7.17.9", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/template": "^7.16.7", - "@babel/types": "^7.17.0" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-hoist-variables": { - "version": "7.16.7", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "dev": true, - "license": "MIT", - "peer": true, - "dependencies": { - "@babel/types": "^7.16.7" - }, - "engines": { - "node": ">=6.9.0" - } + "license": "MIT" }, "node_modules/@babel/helper-member-expression-to-functions": { "version": "7.25.9", @@ -373,16 +320,15 @@ } }, "node_modules/@babel/helper-module-transforms": { - "version": "7.25.2", - "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.25.2.tgz", - "integrity": "sha512-BjyRAbix6j/wv83ftcVJmBt72QtHI56C7JXZoG2xATiLpmoC7dpd8WnkikExHDVPpi/3qCmO6WY1EaXOluiecQ==", + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.26.0.tgz", + "integrity": "sha512-xO+xu6B5K2czEnQye6BHA7DolFFmS3LB7stHZFaOLb1pAwO1HWLS8fXA+eh0A2yIvltPVmx3eNNDBJA2SLHXFw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-module-imports": "^7.24.7", - "@babel/helper-simple-access": "^7.24.7", - "@babel/helper-validator-identifier": "^7.24.7", - "@babel/traverse": "^7.25.2" + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9", + "@babel/traverse": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -415,15 +361,15 @@ } }, "node_modules/@babel/helper-remap-async-to-generator": { - "version": "7.25.0", - "resolved": "https://registry.npmjs.org/@babel/helper-remap-async-to-generator/-/helper-remap-async-to-generator-7.25.0.tgz", - "integrity": "sha512-NhavI2eWEIz/H9dbrG0TuOicDhNexze43i5z7lEqwYm0WEZVTwnPpA0EafUTP7+6/W79HWIP2cTe3Z5NiSTVpw==", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-remap-async-to-generator/-/helper-remap-async-to-generator-7.25.9.tgz", + "integrity": "sha512-IZtukuUeBbhgOcaW2s06OXTzVNJR0ybm4W5xC1opWFFJMZbwRj5LCk+ByYH7WdZPZTt8KnFwA8pvjN2yqcPlgw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-annotate-as-pure": "^7.24.7", - "@babel/helper-wrap-function": "^7.25.0", - "@babel/traverse": "^7.25.0" + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-wrap-function": "^7.25.9", + "@babel/traverse": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -450,20 +396,6 @@ "@babel/core": "^7.0.0" } }, - "node_modules/@babel/helper-simple-access": { - "version": "7.24.7", - "resolved": "https://registry.npmjs.org/@babel/helper-simple-access/-/helper-simple-access-7.24.7.tgz", - "integrity": 
"sha512-zBAIvbCMh5Ts+b86r/CjU+4XGYIs+R1j951gxI3KmmxBMhCg4oQMsv6ZXQ64XOm/cvzfU1FmoCyt6+owc5QMYg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/traverse": "^7.24.7", - "@babel/types": "^7.24.7" - }, - "engines": { - "node": ">=6.9.0" - } - }, "node_modules/@babel/helper-skip-transparent-expression-wrappers": { "version": "7.25.9", "resolved": "https://registry.npmjs.org/@babel/helper-skip-transparent-expression-wrappers/-/helper-skip-transparent-expression-wrappers-7.25.9.tgz", @@ -478,17 +410,6 @@ "node": ">=6.9.0" } }, - "node_modules/@babel/helper-split-export-declaration": { - "version": "7.16.7", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.16.7" - }, - "engines": { - "node": ">=6.9.0" - } - }, "node_modules/@babel/helper-string-parser": { "version": "7.25.9", "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz", @@ -510,9 +431,9 @@ } }, "node_modules/@babel/helper-validator-option": { - "version": "7.24.8", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.24.8.tgz", - "integrity": "sha512-xb8t9tD1MHLungh/AIoWYN+gVHaB9kwlu8gffXGSt3FFEIT7RjS+xWbc2vUD1UTZdIpKj/ab3rdqJ7ufngyi2Q==", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.25.9.tgz", + "integrity": "sha512-e/zv1co8pp55dNdEcCynfj9X7nyUKUXoUEwfXqaZt0omVOmDe9oOTdKStH4GmAw6zxMFs50ZayuMfHDKlO7Tfw==", "dev": true, "license": "MIT", "engines": { @@ -520,15 +441,15 @@ } }, "node_modules/@babel/helper-wrap-function": { - "version": "7.25.0", - "resolved": "https://registry.npmjs.org/@babel/helper-wrap-function/-/helper-wrap-function-7.25.0.tgz", - "integrity": "sha512-s6Q1ebqutSiZnEjaofc/UKDyC4SbzV5n5SrA2Gq8UawLycr3i04f1dX4OzoQVnexm6aOCh37SQNYlJ/8Ku+PMQ==", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-wrap-function/-/helper-wrap-function-7.25.9.tgz", + "integrity": "sha512-ETzz9UTjQSTmw39GboatdymDq4XIQbR8ySgVrylRhPOFpsd+JrKHIuF0de7GCWmem+T4uC5z7EZguod7Wj4A4g==", "dev": true, "license": "MIT", "dependencies": { - "@babel/template": "^7.25.0", - "@babel/traverse": "^7.25.0", - "@babel/types": "^7.25.0" + "@babel/template": "^7.25.9", + "@babel/traverse": "^7.25.9", + "@babel/types": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -564,13 +485,15 @@ "node": ">=6.0.0" } }, - "node_modules/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": { - "version": "7.17.12", + "node_modules/@babel/plugin-bugfix-firefox-class-in-computed-class-key": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-firefox-class-in-computed-class-key/-/plugin-bugfix-firefox-class-in-computed-class-key-7.25.9.tgz", + "integrity": "sha512-ZkRyVkThtxQ/J6nv3JFYv1RYY+JT5BvU0y3k5bWrmuG4woXypRa4PXmm9RhOwodRkYFWqC0C0cqcJ4OqR7kW+g==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/traverse": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -579,114 +502,81 @@ "@babel/core": "^7.0.0" } }, - "node_modules/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": { - "version": "7.17.12", - "dev": true, - "license": "MIT", - "peer": true, - "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-skip-transparent-expression-wrappers": "^7.16.0", - "@babel/plugin-proposal-optional-chaining": "^7.17.12" - }, - "engines": { - 
"node": ">=6.9.0" - }, - "peerDependencies": { - "@babel/core": "^7.13.0" - } - }, - "node_modules/@babel/plugin-proposal-async-generator-functions": { - "version": "7.17.12", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-remap-async-to-generator": "^7.16.8", - "@babel/plugin-syntax-async-generators": "^7.8.4" - }, - "engines": { - "node": ">=6.9.0" - }, - "peerDependencies": { - "@babel/core": "^7.0.0-0" - } - }, - "node_modules/@babel/plugin-proposal-class-properties": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-class-properties/-/plugin-proposal-class-properties-7.18.6.tgz", - "integrity": "sha512-cumfXOF0+nzZrrN8Rf0t7M+tF6sZc7vhQwYQck9q1/5w2OExlD+b4v4RpMJFaV1Z7WcDRgO6FqvxqxGlwo+RHQ==", - "deprecated": "This proposal has been merged to the ECMAScript standard and thus this plugin is no longer maintained. Please use @babel/plugin-transform-class-properties instead.", + "node_modules/@babel/plugin-bugfix-safari-class-field-initializer-scope": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-safari-class-field-initializer-scope/-/plugin-bugfix-safari-class-field-initializer-scope-7.25.9.tgz", + "integrity": "sha512-MrGRLZxLD/Zjj0gdU15dfs+HH/OXvnw/U4jJD8vpcP2CJQapPEv1IWwjc/qMg7ItBlPwSv1hRBbb7LeuANdcnw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-create-class-features-plugin": "^7.18.6", - "@babel/helper-plugin-utils": "^7.18.6" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.0.0" } }, - "node_modules/@babel/plugin-proposal-class-static-block": { - "version": "7.18.0", + "node_modules/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.25.9.tgz", + "integrity": "sha512-2qUwwfAFpJLZqxd02YW9btUCZHl+RFvdDkNfZwaIJrvB8Tesjsk8pEQkTvGwZXLqXUx/2oyY3ySRhm6HOXuCug==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-create-class-features-plugin": "^7.18.0", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/plugin-syntax-class-static-block": "^7.14.5" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.12.0" + "@babel/core": "^7.0.0" } }, - "node_modules/@babel/plugin-proposal-dynamic-import": { - "version": "7.16.7", + "node_modules/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining/-/plugin-bugfix-v8-spread-parameters-in-optional-chaining-7.25.9.tgz", + "integrity": "sha512-6xWgLZTJXwilVjlnV7ospI3xi+sl8lN8rXXbBD6vYn3UYDlGsag8wrZkKcSI8G6KgqKP7vNFaDgeDnfAABq61g==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7", - "@babel/plugin-syntax-dynamic-import": "^7.8.3" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9", + "@babel/plugin-transform-optional-chaining": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.13.0" } }, - 
"node_modules/@babel/plugin-proposal-export-default-from": { + "node_modules/@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly": { "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-export-default-from/-/plugin-proposal-export-default-from-7.25.9.tgz", - "integrity": "sha512-ykqgwNfSnNOB+C8fV5X4mG3AVmvu+WVxcaU9xHHtBb7PCrPeweMmPjGsn8eMaeJg6SJuoUuZENeeSWaarWqonQ==", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly/-/plugin-bugfix-v8-static-class-fields-redefine-readonly-7.25.9.tgz", + "integrity": "sha512-aLnMXYPnzwwqhYSCyXfKkIkYgJ8zv9RK+roo9DkTXz38ynIhd9XCbN08s3MGvqL2MYGVUGdRQLL/JqBIeJhJBg==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.25.9" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/traverse": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.0.0" } }, - "node_modules/@babel/plugin-proposal-export-namespace-from": { + "node_modules/@babel/plugin-proposal-async-generator-functions": { "version": "7.17.12", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/helper-plugin-utils": "^7.17.12", - "@babel/plugin-syntax-export-namespace-from": "^7.8.3" + "@babel/helper-remap-async-to-generator": "^7.16.8", + "@babel/plugin-syntax-async-generators": "^7.8.4" }, "engines": { "node": ">=6.9.0" @@ -695,14 +585,16 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-proposal-json-strings": { - "version": "7.17.12", + "node_modules/@babel/plugin-proposal-class-properties": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-class-properties/-/plugin-proposal-class-properties-7.18.6.tgz", + "integrity": "sha512-cumfXOF0+nzZrrN8Rf0t7M+tF6sZc7vhQwYQck9q1/5w2OExlD+b4v4RpMJFaV1Z7WcDRgO6FqvxqxGlwo+RHQ==", + "deprecated": "This proposal has been merged to the ECMAScript standard and thus this plugin is no longer maintained. 
Please use @babel/plugin-transform-class-properties instead.", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/plugin-syntax-json-strings": "^7.8.3" + "@babel/helper-create-class-features-plugin": "^7.18.6", + "@babel/helper-plugin-utils": "^7.18.6" }, "engines": { "node": ">=6.9.0" @@ -711,14 +603,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-proposal-logical-assignment-operators": { - "version": "7.17.12", + "node_modules/@babel/plugin-proposal-export-default-from": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-export-default-from/-/plugin-proposal-export-default-from-7.25.9.tgz", + "integrity": "sha512-ykqgwNfSnNOB+C8fV5X4mG3AVmvu+WVxcaU9xHHtBb7PCrPeweMmPjGsn8eMaeJg6SJuoUuZENeeSWaarWqonQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/plugin-syntax-logical-assignment-operators": "^7.10.4" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -815,14 +707,49 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-proposal-private-methods": { - "version": "7.17.12", + "node_modules/@babel/plugin-proposal-private-property-in-object": { + "version": "7.21.0-placeholder-for-preset-env.2", + "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-private-property-in-object/-/plugin-proposal-private-property-in-object-7.21.0-placeholder-for-preset-env.2.tgz", + "integrity": "sha512-SOSkfJDddaM7mak6cPEpswyTRnuRltl429hMraQEglW+OkovnCzsiszTmsrlY//qLFjCpQDFRvjdm2wA5pPm9w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-async-generators": { + "version": "7.8.4", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-dynamic-import": { + "version": "7.8.3", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-create-class-features-plugin": "^7.17.12", - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-export-default-from": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-export-default-from/-/plugin-syntax-export-default-from-7.25.9.tgz", + "integrity": "sha512-9MhJ/SMTsVqsd69GyQg89lYR4o9T+oDGv5F6IsigxxqFVOyR/IflDLYP8WDI1l8fkhNGGktqkvL5qwNCtGEpgQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -831,16 +758,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-proposal-private-property-in-object": { - "version": "7.17.12", + "node_modules/@babel/plugin-syntax-flow": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-flow/-/plugin-syntax-flow-7.26.0.tgz", + "integrity": "sha512-B+O2DnPc0iG+YXFqOxv2WNuNU97ToWjOomUQ78DouOENWUaM5sVrmet9mcomUGQFwpJd//gvUagXBSdzO1fRKg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-annotate-as-pure": "^7.16.7", - "@babel/helper-create-class-features-plugin": "^7.17.12", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/plugin-syntax-private-property-in-object": "^7.14.5" + 
"@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -849,24 +774,56 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-proposal-unicode-property-regex": { - "version": "7.17.12", + "node_modules/@babel/plugin-syntax-import-assertions": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-assertions/-/plugin-syntax-import-assertions-7.26.0.tgz", + "integrity": "sha512-QCWT5Hh830hK5EQa7XzuqIkQU9tT/whqbDz7kuaZMHFl1inRRg7JnuAEOQ0Ur0QUl0NufCk1msK2BeY79Aj/eg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-create-regexp-features-plugin": "^7.17.12", - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { - "node": ">=4" + "node": ">=6.9.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-async-generators": { - "version": "7.8.4", + "node_modules/@babel/plugin-syntax-import-attributes": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-attributes/-/plugin-syntax-import-attributes-7.26.0.tgz", + "integrity": "sha512-e2dttdsJ1ZTpi3B9UYGLw41hifAubg19AtCu/2I/F1QNVclOBr1dYpTdmdyZ84Xiz43BS/tCUkMAZNLv12Pi+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-jsx": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-jsx/-/plugin-syntax-jsx-7.25.9.tgz", + "integrity": "sha512-ld6oezHQMZsZfp6pWtbjaNDF2tiiCYYDqQszHt5VV437lewP9aSi2Of99CK0D0XB21k7FLgnLcmQKyKzynfeAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-nullish-coalescing-operator": { + "version": "7.8.3", "dev": true, "license": "MIT", "dependencies": { @@ -876,34 +833,40 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-class-properties": { - "version": "7.12.13", + "node_modules/@babel/plugin-syntax-numeric-separator": { + "version": "7.10.4", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.12.13" + "@babel/helper-plugin-utils": "^7.10.4" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-class-static-block": { - "version": "7.14.5", + "node_modules/@babel/plugin-syntax-object-rest-spread": { + "version": "7.8.3", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.14.5" + "@babel/helper-plugin-utils": "^7.8.0" }, - "engines": { - "node": ">=6.9.0" + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-optional-catch-binding": { + "version": "7.8.3", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-dynamic-import": { + "node_modules/@babel/plugin-syntax-optional-chaining": { "version": "7.8.3", "dev": true, "license": "MIT", @@ -914,10 +877,10 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-export-default-from": { + "node_modules/@babel/plugin-syntax-typescript": { "version": "7.25.9", - "resolved": 
"https://registry.npmjs.org/@babel/plugin-syntax-export-default-from/-/plugin-syntax-export-default-from-7.25.9.tgz", - "integrity": "sha512-9MhJ/SMTsVqsd69GyQg89lYR4o9T+oDGv5F6IsigxxqFVOyR/IflDLYP8WDI1l8fkhNGGktqkvL5qwNCtGEpgQ==", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-typescript/-/plugin-syntax-typescript-7.25.9.tgz", + "integrity": "sha512-hjMgRy5hb8uJJjUcdWunWVcoi9bGpJp8p5Ol1229PoN6aytsLwNMgmdftO23wnCLMfVmTwZDWMPNq/D1SY60JQ==", "dev": true, "license": "MIT", "dependencies": { @@ -930,26 +893,31 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-export-namespace-from": { - "version": "7.8.3", + "node_modules/@babel/plugin-syntax-unicode-sets-regex": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-unicode-sets-regex/-/plugin-syntax-unicode-sets-regex-7.18.6.tgz", + "integrity": "sha512-727YkEAPwSIQTv5im8QHz3upqp92JTWhidIC81Tdx4VJYIte/VndKf1qKrfnnhPLiPghStWfvC/iFaMCQu7Nqg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.8.3" + "@babel/helper-create-regexp-features-plugin": "^7.18.6", + "@babel/helper-plugin-utils": "^7.18.6" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.0.0" } }, - "node_modules/@babel/plugin-syntax-flow": { - "version": "7.24.7", - "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-flow/-/plugin-syntax-flow-7.24.7.tgz", - "integrity": "sha512-9G8GYT/dxn/D1IIKOUBmGX0mnmj46mGH9NnZyJLwtCpgh5f7D2VbuKodb+2s9m1Yavh1s7ASQN8lf0eqrb1LTw==", + "node_modules/@babel/plugin-transform-arrow-functions": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-arrow-functions/-/plugin-transform-arrow-functions-7.25.9.tgz", + "integrity": "sha512-6jmooXYIwn9ca5/RylZADJ+EnSxVUS5sjeJ9UPk6RWRzXCmOJCy6dqItPJFpw2cuCangPK4OYr5uhGKcmrm5Qg==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.24.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -958,13 +926,16 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-import-assertions": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-async-generator-functions": { + "version": "7.26.8", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-async-generator-functions/-/plugin-transform-async-generator-functions-7.26.8.tgz", + "integrity": "sha512-He9Ej2X7tNf2zdKMAGOsmg2MrFc+hfoAhd3po4cWfo/NWjzEAKa0oQruj1ROVUdl0e6fb6/kE/G3SSxE0lRJOg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.26.5", + "@babel/helper-remap-async-to-generator": "^7.25.9", + "@babel/traverse": "^7.26.8" }, "engines": { "node": ">=6.9.0" @@ -973,24 +944,32 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-json-strings": { - "version": "7.8.3", + "node_modules/@babel/plugin-transform-async-to-generator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-async-to-generator/-/plugin-transform-async-to-generator-7.25.9.tgz", + "integrity": "sha512-NT7Ejn7Z/LjUH0Gv5KsBCxh7BH3fbLTV0ptHvpeMvrt3cPThHfJfst9Wrb7S8EvJ7vRTFI7z+VAvFVEQn/m5zQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.8.0" + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-remap-async-to-generator": 
"^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-jsx": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-block-scoped-functions": { + "version": "7.26.5", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-block-scoped-functions/-/plugin-transform-block-scoped-functions-7.26.5.tgz", + "integrity": "sha512-chuTSY+hq09+/f5lMj8ZSYgCFpppV2CbYrhNFJ1BFoXpiWPnnAb7R0MqrafCpN8E1+YRrtM1MXZHJdIx8B6rMQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.26.5" }, "engines": { "node": ">=6.9.0" @@ -999,80 +978,119 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-logical-assignment-operators": { - "version": "7.10.4", + "node_modules/@babel/plugin-transform-block-scoping": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-block-scoping/-/plugin-transform-block-scoping-7.25.9.tgz", + "integrity": "sha512-1F05O7AYjymAtqbsFETboN1NvBdcnzMerO+zlMyJBEz6WkMdejvGWw9p05iTSjC85RLlBseHHQpYaM4gzJkBGg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.10.4" + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-nullish-coalescing-operator": { - "version": "7.8.3", + "node_modules/@babel/plugin-transform-class-properties": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-class-properties/-/plugin-transform-class-properties-7.25.9.tgz", + "integrity": "sha512-bbMAII8GRSkcd0h0b4X+36GksxuheLFjP65ul9w6C3KgAamI3JqErNgSrosX6ZPj+Mpim5VvEbawXxJCyEUV3Q==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.8.0" + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-numeric-separator": { - "version": "7.10.4", + "node_modules/@babel/plugin-transform-class-static-block": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-class-static-block/-/plugin-transform-class-static-block-7.26.0.tgz", + "integrity": "sha512-6J2APTs7BDDm+UMqP1useWqhcRAXo0WIoVj26N7kPFB6S73Lgvyka4KTZYIxtgYXiN5HTyRObA72N2iu628iTQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.10.4" + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.12.0" } }, - "node_modules/@babel/plugin-syntax-object-rest-spread": { - "version": "7.8.3", + "node_modules/@babel/plugin-transform-classes": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-classes/-/plugin-transform-classes-7.25.9.tgz", + "integrity": "sha512-mD8APIXmseE7oZvZgGABDyM34GUmK45Um2TXiBUt7PnuAxrgoSVf123qUzPxEr/+/BHrRn5NMZCdE2m/1F8DGg==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.8.0" + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-replace-supers": "^7.25.9", + "@babel/traverse": "^7.25.9", + "globals": 
"^11.1.0" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-optional-catch-binding": { - "version": "7.8.3", + "node_modules/@babel/plugin-transform-computed-properties": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-computed-properties/-/plugin-transform-computed-properties-7.25.9.tgz", + "integrity": "sha512-HnBegGqXZR12xbcTHlJ9HGxw1OniltT26J5YpfruGqtUHlz/xKf/G2ak9e+t0rVqrjXa9WOhvYPz1ERfMj23AA==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.8.0" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/template": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-optional-chaining": { - "version": "7.8.3", + "node_modules/@babel/plugin-transform-destructuring": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-destructuring/-/plugin-transform-destructuring-7.25.9.tgz", + "integrity": "sha512-WkCGb/3ZxXepmMiX101nnGiU+1CAdut8oHyEOHxkKuS1qKpU2SMXE2uSvfz8PBuLd49V6LEsbtyPhWC7fnkgvQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.8.0" + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-private-property-in-object": { - "version": "7.14.5", + "node_modules/@babel/plugin-transform-dotall-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-dotall-regex/-/plugin-transform-dotall-regex-7.25.9.tgz", + "integrity": "sha512-t7ZQ7g5trIgSRYhI9pIJtRl64KHotutUJsh4Eze5l7olJv+mRSg4/MmbZ0tv1eeqRbdvo/+trvJD/Oc5DmW2cA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.14.5" + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1081,13 +1099,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-top-level-await": { - "version": "7.14.5", + "node_modules/@babel/plugin-transform-duplicate-keys": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-duplicate-keys/-/plugin-transform-duplicate-keys-7.25.9.tgz", + "integrity": "sha512-LZxhJ6dvBb/f3x8xwWIuyiAHy56nrRG3PeYTpBkkzkYRRQ6tJLu68lEF5VIqMUZiAV7a8+Tb78nEoMCMcqjXBw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.14.5" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1096,26 +1115,31 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-syntax-typescript": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-duplicate-named-capturing-groups-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-duplicate-named-capturing-groups-regex/-/plugin-transform-duplicate-named-capturing-groups-regex-7.25.9.tgz", + "integrity": "sha512-0UfuJS0EsXbRvKnwcLjFtJy/Sxc5J5jhLHnFhy7u4zih97Hz6tJkLU+O+FMMrNZrosUPxDi6sYxJ/EA8jDiAog==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.0.0" } }, 
- "node_modules/@babel/plugin-transform-arrow-functions": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-dynamic-import": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-dynamic-import/-/plugin-transform-dynamic-import-7.25.9.tgz", + "integrity": "sha512-GCggjexbmSLaFhqsojeugBpeaRIgWNTcgKVq/0qIteFEqY2A+b9QidYadrWlnbWQUrW5fn+mCvf3tr7OeBFTyg==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1124,16 +1148,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-async-to-generator": { - "version": "7.24.7", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-async-to-generator/-/plugin-transform-async-to-generator-7.24.7.tgz", - "integrity": "sha512-SQY01PcJfmQ+4Ash7NE+rpbLFbmqA2GPIgqzxfFTL4t1FKRq4zTms/7htKpoCUI9OcFYgzqfmCdH53s6/jn5fA==", + "node_modules/@babel/plugin-transform-exponentiation-operator": { + "version": "7.26.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-exponentiation-operator/-/plugin-transform-exponentiation-operator-7.26.3.tgz", + "integrity": "sha512-7CAHcQ58z2chuXPWblnn1K6rLDnDWieghSOEmqQsrBenH0P9InCUtOJYD89pvngljmZlJcz3fcmgYsXFNGa1ZQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-module-imports": "^7.24.7", - "@babel/helper-plugin-utils": "^7.24.7", - "@babel/helper-remap-async-to-generator": "^7.24.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1142,13 +1164,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-block-scoped-functions": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-export-namespace-from": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-export-namespace-from/-/plugin-transform-export-namespace-from-7.25.9.tgz", + "integrity": "sha512-2NsEz+CxzJIVOPx2o9UsW1rXLqtChtLoVnwYHHiB04wS5sgn7mrV45fWMBX0Kk+ub9uXytVYfNP2HjbVbCB3Ww==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1157,12 +1180,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-block-scoping": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-flow-strip-types": { + "version": "7.26.5", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-flow-strip-types/-/plugin-transform-flow-strip-types-7.26.5.tgz", + "integrity": "sha512-eGK26RsbIkYUns3Y8qKl362juDDYK+wEdPGHGrhzUl6CewZFo55VZ7hg+CyMFU4dd5QQakBN86nBMpRsFpRvbQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.26.5", + "@babel/plugin-syntax-flow": "^7.26.0" }, "engines": { "node": ">=6.9.0" @@ -1171,19 +1197,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-classes": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-for-of": { + "version": "7.26.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-for-of/-/plugin-transform-for-of-7.26.9.tgz", + "integrity": "sha512-Hry8AusVm8LW5BVFgiyUReuoGzPUpdHQQqJY5bZnbbf+ngOHWuCuYFKw/BqaaWlvEUrF91HMhDtEaI1hZzNbLg==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-annotate-as-pure": "^7.16.7", - "@babel/helper-environment-visitor": "^7.16.7", - "@babel/helper-function-name": 
"^7.17.9", - "@babel/helper-optimise-call-expression": "^7.16.7", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-replace-supers": "^7.16.7", - "@babel/helper-split-export-declaration": "^7.16.7", - "globals": "^11.1.0" + "@babel/helper-plugin-utils": "^7.26.5", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1192,12 +1214,16 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-computed-properties": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-function-name": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-function-name/-/plugin-transform-function-name-7.25.9.tgz", + "integrity": "sha512-8lP+Yxjv14Vc5MuWBpJsoUCd3hD6V9DgBon2FVYL4jJgbnVQ9fTgYmonchzZJOVNgzEgbxp4OwAf6xz6M/14XA==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/traverse": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1206,14 +1232,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-destructuring": { - "version": "7.24.8", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-destructuring/-/plugin-transform-destructuring-7.24.8.tgz", - "integrity": "sha512-36e87mfY8TnRxc7yc6M9g9gOB7rKgSahqkIKwLpz4Ppk2+zC2Cy1is0uwtuSG6AE4zlTOUa+7JGz9jCJGLqQFQ==", + "node_modules/@babel/plugin-transform-json-strings": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-json-strings/-/plugin-transform-json-strings-7.25.9.tgz", + "integrity": "sha512-xoTMk0WXceiiIvsaquQQUaLLXSW1KJ159KP87VilruQm0LNNGxWzahxSS6T6i4Zg3ezp4vA4zuwiNUR53qmQAw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.24.8" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1222,14 +1248,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-dotall-regex": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-literals": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-literals/-/plugin-transform-literals-7.25.9.tgz", + "integrity": "sha512-9N7+2lFziW8W9pBl2TzaNht3+pgMIRP74zizeCSrtnSKVdUl8mAjjOP2OOVQAfZ881P2cNjDj1uAMEdeD50nuQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-create-regexp-features-plugin": "^7.16.7", - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1238,13 +1264,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-duplicate-keys": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-logical-assignment-operators": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-logical-assignment-operators/-/plugin-transform-logical-assignment-operators-7.25.9.tgz", + "integrity": "sha512-wI4wRAzGko551Y8eVf6iOY9EouIDTtPb0ByZx+ktDGHwv6bHFimrgJM/2T021txPZ2s4c7bqvHbd+vXG6K948Q==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1253,14 +1280,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-exponentiation-operator": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-member-expression-literals": { + "version": 
"7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-member-expression-literals/-/plugin-transform-member-expression-literals-7.25.9.tgz", + "integrity": "sha512-PYazBVfofCQkkMzh2P6IdIUaCEWni3iYEerAsRWuVd8+jlM1S9S9cz1dF9hIzyoZ8IA3+OwVYIp9v9e+GbgZhA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-builder-binary-assignment-operator-visitor": "^7.16.7", - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1269,15 +1296,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-flow-strip-types": { - "version": "7.25.2", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-flow-strip-types/-/plugin-transform-flow-strip-types-7.25.2.tgz", - "integrity": "sha512-InBZ0O8tew5V0K6cHcQ+wgxlrjOw1W4wDXLkOTjLRD8GYhTSkxTVBtdy3MMtvYBrbAWa1Qm3hNoTc1620Yj+Mg==", + "node_modules/@babel/plugin-transform-modules-amd": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-amd/-/plugin-transform-modules-amd-7.25.9.tgz", + "integrity": "sha512-g5T11tnI36jVClQlMlt4qKDLlWnG5pP9CSM4GhdRciTNMRgkfpo5cR6b4rGIOYPgRRuFAvwjPQ/Yk+ql4dyhbw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.24.8", - "@babel/plugin-syntax-flow": "^7.24.7" + "@babel/helper-module-transforms": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1286,13 +1313,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-for-of": { - "version": "7.18.1", + "node_modules/@babel/plugin-transform-modules-commonjs": { + "version": "7.26.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-commonjs/-/plugin-transform-modules-commonjs-7.26.3.tgz", + "integrity": "sha512-MgR55l4q9KddUDITEzEFYn5ZsGDXMSsU9E+kh7fjRXTIC3RHqfCo8RPRbyReYJh44HQ/yomFkqbOFohXvDCiIQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-module-transforms": "^7.26.0", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1301,14 +1330,17 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-function-name": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-modules-systemjs": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-systemjs/-/plugin-transform-modules-systemjs-7.25.9.tgz", + "integrity": "sha512-hyss7iIlH/zLHaehT+xwiymtPOpsiwIIRlCAOwBB04ta5Tt+lNItADdlXw3jAWZ96VJ2jlhl/c+PNIQPKNfvcA==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-compilation-targets": "^7.16.7", - "@babel/helper-function-name": "^7.16.7", - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-module-transforms": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9", + "@babel/traverse": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1317,12 +1349,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-literals": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-modules-umd": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-umd/-/plugin-transform-modules-umd-7.25.9.tgz", + "integrity": "sha512-bS9MVObUgE7ww36HEfwe6g9WakQ0KF07mQF74uuXdkoziUPfKyu/nIm663kz//e5O1nPInPFx36z7WJmJ4yNEw==", "dev": true, "license": "MIT", "dependencies": { - 
"@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-module-transforms": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1331,30 +1366,31 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-member-expression-literals": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-named-capturing-groups-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-named-capturing-groups-regex/-/plugin-transform-named-capturing-groups-regex-7.25.9.tgz", + "integrity": "sha512-oqB6WHdKTGl3q/ItQhpLSnWWOpjUJLsOCLVyeFgeTktkBSCiurvPOsyt93gibI9CmuKvTUEtWmG5VhZD+5T/KA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.0.0" } }, - "node_modules/@babel/plugin-transform-modules-amd": { - "version": "7.18.0", + "node_modules/@babel/plugin-transform-new-target": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-new-target/-/plugin-transform-new-target-7.25.9.tgz", + "integrity": "sha512-U/3p8X1yCSoKyUj2eOBIx3FOn6pElFOKvAAGf8HTtItuPyB+ZeOqfn+mvTtg9ZlOAjsPdK3ayQEjqHjU/yLeVQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-module-transforms": "^7.18.0", - "@babel/helper-plugin-utils": "^7.17.12", - "babel-plugin-dynamic-import-node": "^2.3.3" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1363,15 +1399,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-modules-commonjs": { - "version": "7.18.2", + "node_modules/@babel/plugin-transform-nullish-coalescing-operator": { + "version": "7.26.6", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-nullish-coalescing-operator/-/plugin-transform-nullish-coalescing-operator-7.26.6.tgz", + "integrity": "sha512-CKW8Vu+uUZneQCPtXmSBUC6NCAUdya26hWCElAWh5mVSlSRsmiCPUUDKb3Z0szng1hiAJa098Hkhg9o4SE35Qw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-module-transforms": "^7.18.0", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-simple-access": "^7.18.2", - "babel-plugin-dynamic-import-node": "^2.3.3" + "@babel/helper-plugin-utils": "^7.26.5" }, "engines": { "node": ">=6.9.0" @@ -1380,17 +1415,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-modules-systemjs": { - "version": "7.18.0", + "node_modules/@babel/plugin-transform-numeric-separator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-numeric-separator/-/plugin-transform-numeric-separator-7.25.9.tgz", + "integrity": "sha512-TlprrJ1GBZ3r6s96Yq8gEQv82s8/5HnCVHtEJScUj90thHQbwe+E5MLhi2bbNHBEJuzrvltXSru+BUxHDoog7Q==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-hoist-variables": "^7.16.7", - "@babel/helper-module-transforms": "^7.18.0", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-validator-identifier": "^7.16.7", - "babel-plugin-dynamic-import-node": "^2.3.3" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1399,14 +1431,16 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-modules-umd": { - "version": "7.18.0", + "node_modules/@babel/plugin-transform-object-rest-spread": { + "version": 
"7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-object-rest-spread/-/plugin-transform-object-rest-spread-7.25.9.tgz", + "integrity": "sha512-fSaXafEE9CVHPweLYw4J0emp1t8zYTXyzN3UuG+lylqkvYd7RMrsOQ8TYx5RF231be0vqtFC6jnx3UmpJmKBYg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-module-transforms": "^7.18.0", - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-compilation-targets": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/plugin-transform-parameters": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1415,28 +1449,31 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-named-capturing-groups-regex": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-object-super": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-object-super/-/plugin-transform-object-super-7.25.9.tgz", + "integrity": "sha512-Kj/Gh+Rw2RNLbCK1VAWj2U48yxxqL2x0k10nPtSdRa0O2xnHXalD0s+o1A6a0W43gJ00ANo38jxkQreckOzv5A==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-create-regexp-features-plugin": "^7.17.12", - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-replace-supers": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0" + "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-new-target": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-optional-catch-binding": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-optional-catch-binding/-/plugin-transform-optional-catch-binding-7.25.9.tgz", + "integrity": "sha512-qM/6m6hQZzDcZF3onzIhZeDHDO43bkNNlOX0i8n3lR6zLbu0GN2d8qfM/IERJZYauhAHSLHy39NF0Ctdvcid7g==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1445,14 +1482,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-object-super": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-optional-chaining": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-optional-chaining/-/plugin-transform-optional-chaining-7.25.9.tgz", + "integrity": "sha512-6AvV0FsLULbpnXeBjrY4dmWF8F7gf8QnvTEoO/wX/5xm/xE1Xo8oPuD3MPS+KS9f9XBEAWN7X1aWr4z9HdOr7A==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7", - "@babel/helper-replace-supers": "^7.16.7" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1462,13 +1500,13 @@ } }, "node_modules/@babel/plugin-transform-parameters": { - "version": "7.24.7", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-parameters/-/plugin-transform-parameters-7.24.7.tgz", - "integrity": "sha512-yGWW5Rr+sQOhK0Ot8hjDJuxU3XLRQGflvT4lhlSY0DFvdb3TwKaY26CJzHtYllU0vT9j58hc37ndFPsqT1SrzA==", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-parameters/-/plugin-transform-parameters-7.25.9.tgz", + "integrity": "sha512-wzz6MKwpnshBAiRmn4jR8LYz/g8Ksg0o80XmwZDlordjwEk9SxBzTWC7F5ef1jhbrbOW2DJ5J6ayRukrJmnr0g==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.24.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": 
">=6.9.0" @@ -1513,12 +1551,13 @@ } }, "node_modules/@babel/plugin-transform-property-literals": { - "version": "7.16.7", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-property-literals/-/plugin-transform-property-literals-7.25.9.tgz", + "integrity": "sha512-IvIUeV5KrS/VPavfSM/Iu+RE6llrHrYIKY1yfCzyO/lMXHQ+p7uGhonmGVisv6tSBSVgWzMBohTcvkC9vQcQFA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1528,11 +1567,13 @@ } }, "node_modules/@babel/plugin-transform-react-display-name": { - "version": "7.16.7", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-display-name/-/plugin-transform-react-display-name-7.25.9.tgz", + "integrity": "sha512-KJfMlYIUxQB1CJfO3e0+h0ZHWOTLCPP115Awhaz8U0Zpq36Gl/cXlpoyMRnUWlhNUBAzldnCiAZNvCDj7CrKxQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1542,15 +1583,33 @@ } }, "node_modules/@babel/plugin-transform-react-jsx": { - "version": "7.17.12", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx/-/plugin-transform-react-jsx-7.25.9.tgz", + "integrity": "sha512-s5XwpQYCqGerXl+Pu6VDL3x0j2d82eiV77UJ8a2mDHAW7j9SWRqQ2y1fNo1Z74CdcYipl5Z41zvjj4Nfzq36rw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-annotate-as-pure": "^7.16.7", - "@babel/helper-module-imports": "^7.16.7", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/plugin-syntax-jsx": "^7.17.12", - "@babel/types": "^7.17.12" + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/plugin-syntax-jsx": "^7.25.9", + "@babel/types": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-development": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-development/-/plugin-transform-react-jsx-development-7.25.9.tgz", + "integrity": "sha512-9mj6rm7XVYs4mdLIpbZnHOYdpW42uoiBCTVowg7sP1thUOiANgMb4UtpRivR0pp5iL+ocvUv7X4mZgFRpJEzGw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/plugin-transform-react-jsx": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1591,16 +1650,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-regenerator": { - "version": "7.24.7", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-regenerator/-/plugin-transform-regenerator-7.24.7.tgz", - "integrity": "sha512-lq3fvXPdimDrlg6LWBoqj+r/DEWgONuwjuOuQCSYgRroXDH/IdM1C0IZf59fL5cHLpjEH/O6opIRBbqv7ELnuA==", + "node_modules/@babel/plugin-transform-react-pure-annotations": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-pure-annotations/-/plugin-transform-react-pure-annotations-7.25.9.tgz", + "integrity": "sha512-KQ/Takk3T8Qzj5TppkS1be588lkbTp5uj7w6a0LeQaTMSckU/wK0oJ/pih+T690tkgI5jfmg2TqDJvd41Sj1Cg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.24.7", - "regenerator-transform": "^0.15.2" + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1609,13 +1667,15 @@ 
"@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-reserved-words": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-regenerator": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-regenerator/-/plugin-transform-regenerator-7.25.9.tgz", + "integrity": "sha512-vwDcDNsgMPDGP0nMqzahDWE5/MLcX8sv96+wfX7as7LoF/kr97Bo/7fI00lXY4wUXYfVmwIIyG80fGZ1uvt2qg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9", + "regenerator-transform": "^0.15.2" }, "engines": { "node": ">=6.9.0" @@ -1624,57 +1684,58 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-runtime": { - "version": "7.26.9", - "resolved": "https://registry.npmjs.org/@babel/plugin-transform-runtime/-/plugin-transform-runtime-7.26.9.tgz", - "integrity": "sha512-Jf+8y9wXQbbxvVYTM8gO5oEF2POdNji0NMltEkG7FtmzD9PVz7/lxpqSdTvwsjTMU5HIHuDVNf2SOxLkWi+wPQ==", + "node_modules/@babel/plugin-transform-regexp-modifiers": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-regexp-modifiers/-/plugin-transform-regexp-modifiers-7.26.0.tgz", + "integrity": "sha512-vN6saax7lrA2yA/Pak3sCxuD6F5InBjn9IcrIKQPjpsLvuHYLVroTxjdlVRHjjBWxKOqIwpTXDkOssYT4BFdRw==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-module-imports": "^7.25.9", - "@babel/helper-plugin-utils": "^7.26.5", - "babel-plugin-polyfill-corejs2": "^0.4.10", - "babel-plugin-polyfill-corejs3": "^0.10.6", - "babel-plugin-polyfill-regenerator": "^0.6.1", - "semver": "^6.3.1" + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.0.0" } }, - "node_modules/@babel/plugin-transform-runtime/node_modules/@babel/helper-define-polyfill-provider": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/@babel/helper-define-polyfill-provider/-/helper-define-polyfill-provider-0.6.3.tgz", - "integrity": "sha512-HK7Bi+Hj6H+VTHA3ZvBis7V/6hu9QuTrnMXNybfUf2iiuU/N97I8VjB+KbhFF8Rld/Lx5MzoCwPCpPjfK+n8Cg==", + "node_modules/@babel/plugin-transform-reserved-words": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-reserved-words/-/plugin-transform-reserved-words-7.25.9.tgz", + "integrity": "sha512-7DL7DKYjn5Su++4RXu8puKZm2XBPHyjWLUidaPEkCUBbE7IPcsrkRHggAOOKydH1dASWdcUBxrkOGNxUv5P3Jg==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-compilation-targets": "^7.22.6", - "@babel/helper-plugin-utils": "^7.22.5", - "debug": "^4.1.1", - "lodash.debounce": "^4.0.8", - "resolve": "^1.14.2" + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" + "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-runtime/node_modules/babel-plugin-polyfill-corejs2": { - "version": "0.4.12", - "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs2/-/babel-plugin-polyfill-corejs2-0.4.12.tgz", - "integrity": "sha512-CPWT6BwvhrTO2d8QVorhTCQw9Y43zOu7G9HigcfxvepOU6b8o3tcWad6oVgZIsZCTt42FFv97aA7ZJsbM4+8og==", + "node_modules/@babel/plugin-transform-runtime": { + "version": "7.26.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-runtime/-/plugin-transform-runtime-7.26.9.tgz", + "integrity": 
"sha512-Jf+8y9wXQbbxvVYTM8gO5oEF2POdNji0NMltEkG7FtmzD9PVz7/lxpqSdTvwsjTMU5HIHuDVNf2SOxLkWi+wPQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/compat-data": "^7.22.6", - "@babel/helper-define-polyfill-provider": "^0.6.3", + "@babel/helper-module-imports": "^7.25.9", + "@babel/helper-plugin-utils": "^7.26.5", + "babel-plugin-polyfill-corejs2": "^0.4.10", + "babel-plugin-polyfill-corejs3": "^0.10.6", + "babel-plugin-polyfill-regenerator": "^0.6.1", "semver": "^6.3.1" }, + "engines": { + "node": ">=6.9.0" + }, "peerDependencies": { - "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" + "@babel/core": "^7.0.0-0" } }, "node_modules/@babel/plugin-transform-runtime/node_modules/babel-plugin-polyfill-corejs3": { @@ -1691,50 +1752,47 @@ "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, - "node_modules/@babel/plugin-transform-runtime/node_modules/babel-plugin-polyfill-regenerator": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-regenerator/-/babel-plugin-polyfill-regenerator-0.6.3.tgz", - "integrity": "sha512-LiWSbl4CRSIa5x/JAU6jZiG9eit9w6mz+yVMFwDE83LAWvt0AfGBoZ7HS/mkhrKuh2ZlzfVZYKoLjXdqw6Yt7Q==", + "node_modules/@babel/plugin-transform-shorthand-properties": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-shorthand-properties/-/plugin-transform-shorthand-properties-7.25.9.tgz", + "integrity": "sha512-MUv6t0FhO5qHnS/W8XCbHmiRWOphNufpE1IVxhK5kuN3Td9FT1x4rx4K42s3RYdMXCXpfWkGSbCSd0Z64xA7Ng==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-define-polyfill-provider": "^0.6.3" + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { - "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" + "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-runtime/node_modules/debug": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz", - "integrity": "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==", + "node_modules/@babel/plugin-transform-spread": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-spread/-/plugin-transform-spread-7.25.9.tgz", + "integrity": "sha512-oNknIB0TbURU5pqJFVbOOFspVlrpVwo2H1+HUIsVDvp5VauGGDP1ZEvO8Nn5xyMEs3dakajOxlmkNW7kNgSm6A==", "dev": true, "license": "MIT", "dependencies": { - "ms": "^2.1.3" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9" }, "engines": { - "node": ">=6.0" + "node": ">=6.9.0" }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-runtime/node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "dev": true, - "license": "MIT" - }, - "node_modules/@babel/plugin-transform-shorthand-properties": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-sticky-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-sticky-regex/-/plugin-transform-sticky-regex-7.25.9.tgz", + "integrity": "sha512-WqBUSgeVwucYDP9U/xNRQam7xV8W5Zf+6Eo7T2SRVUFlhRiMNFdFz58u0KZmCVVqs2i7SHgpRnAhzRNmKfi2uA==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-plugin-utils": 
"^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1743,13 +1801,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-spread": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-strict-mode": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-strict-mode/-/plugin-transform-strict-mode-7.25.9.tgz", + "integrity": "sha512-DplEwkN9xt6XCz/4oC9l8FJGn7LnOGPU7v08plq+OclMT55zAR9lkX7QIbQ9XscvvJNYpLUfYO4IYz/7JGkbXQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-skip-transparent-expression-wrappers": "^7.16.0" + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1758,12 +1817,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-sticky-regex": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-template-literals": { + "version": "7.26.8", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-template-literals/-/plugin-transform-template-literals-7.26.8.tgz", + "integrity": "sha512-OmGDL5/J0CJPJZTHZbi2XpO0tyT2Ia7fzpW5GURwdtp2X3fMmN8au/ej6peC/T33/+CRiIpA8Krse8hFGVmT5Q==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-plugin-utils": "^7.26.5" }, "engines": { "node": ">=6.9.0" @@ -1772,13 +1833,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-template-literals": { - "version": "7.18.2", + "node_modules/@babel/plugin-transform-typeof-symbol": { + "version": "7.26.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-typeof-symbol/-/plugin-transform-typeof-symbol-7.26.7.tgz", + "integrity": "sha512-jfoTXXZTgGg36BmhqT3cAYK5qkmqvJpvNrPhaK/52Vgjhw4Rq29s9UqpWWV0D6yuRmgiFH/BUVlkl96zJWqnaw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-plugin-utils": "^7.26.5" }, "engines": { "node": ">=6.9.0" @@ -1787,13 +1849,18 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-typeof-symbol": { - "version": "7.17.12", + "node_modules/@babel/plugin-transform-typescript": { + "version": "7.26.8", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-typescript/-/plugin-transform-typescript-7.26.8.tgz", + "integrity": "sha512-bME5J9AC8ChwA7aEPJ6zym3w7aObZULHhbNLU0bKUhKsAkylkzUdq+0kdymh9rzi8nlNFl2bmldFBCKNJBUpuw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12" + "@babel/helper-annotate-as-pure": "^7.25.9", + "@babel/helper-create-class-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.26.5", + "@babel/helper-skip-transparent-expression-wrappers": "^7.25.9", + "@babel/plugin-syntax-typescript": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1802,14 +1869,14 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-typescript": { - "version": "7.18.1", + "node_modules/@babel/plugin-transform-unicode-escapes": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-escapes/-/plugin-transform-unicode-escapes-7.25.9.tgz", + "integrity": "sha512-s5EDrE6bW97LtxOcGj1Khcx5AaXwiMmi4toFWRDP9/y0Woo6pXC+iyPu/KuhKtfSrNFd7jJB+/fkOtZy6aIC6Q==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-create-class-features-plugin": "^7.18.0", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/plugin-syntax-typescript": "^7.17.12" + "@babel/helper-plugin-utils": 
"^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1818,13 +1885,15 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/@babel/plugin-transform-unicode-escapes": { - "version": "7.16.7", + "node_modules/@babel/plugin-transform-unicode-property-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-property-regex/-/plugin-transform-unicode-property-regex-7.25.9.tgz", + "integrity": "sha512-Jt2d8Ga+QwRluxRQ307Vlxa6dMrYEMZCgGxoPR8V52rxPyldHu3hdlHspxaqYmE7oID5+kB+UKUB/eWS+DkkWg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1834,12 +1903,14 @@ } }, "node_modules/@babel/plugin-transform-unicode-regex": { - "version": "7.16.7", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-regex/-/plugin-transform-unicode-regex-7.25.9.tgz", + "integrity": "sha512-yoxstj7Rg9dlNn9UQxzk4fcNivwv4nUYz7fYXBaKxvw/lnmPuOm/ikoELygbYq68Bls3D/D+NBPHiLwZdZZ4HA==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-create-regexp-features-plugin": "^7.16.7", - "@babel/helper-plugin-utils": "^7.16.7" + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1848,87 +1919,99 @@ "@babel/core": "^7.0.0-0" } }, + "node_modules/@babel/plugin-transform-unicode-sets-regex": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-sets-regex/-/plugin-transform-unicode-sets-regex-7.25.9.tgz", + "integrity": "sha512-8BYqO3GeVNHtx69fdPshN3fnzUNLrWdHhk/icSwigksJGczKSizZ+Z6SBCxTs723Fr5VSNorTIK7a+R2tISvwQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.25.9", + "@babel/helper-plugin-utils": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, "node_modules/@babel/preset-env": { - "version": "7.18.2", + "version": "7.26.9", + "resolved": "https://registry.npmjs.org/@babel/preset-env/-/preset-env-7.26.9.tgz", + "integrity": "sha512-vX3qPGE8sEKEAZCWk05k3cpTAE3/nOYca++JA+Rd0z2NCNzabmYvEiSShKzm10zdquOIAVXsy2Ei/DTW34KlKQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/compat-data": "^7.17.10", - "@babel/helper-compilation-targets": "^7.18.2", - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-validator-option": "^7.16.7", - "@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": "^7.17.12", - "@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": "^7.17.12", - "@babel/plugin-proposal-async-generator-functions": "^7.17.12", - "@babel/plugin-proposal-class-properties": "^7.17.12", - "@babel/plugin-proposal-class-static-block": "^7.18.0", - "@babel/plugin-proposal-dynamic-import": "^7.16.7", - "@babel/plugin-proposal-export-namespace-from": "^7.17.12", - "@babel/plugin-proposal-json-strings": "^7.17.12", - "@babel/plugin-proposal-logical-assignment-operators": "^7.17.12", - "@babel/plugin-proposal-nullish-coalescing-operator": "^7.17.12", - "@babel/plugin-proposal-numeric-separator": "^7.16.7", - "@babel/plugin-proposal-object-rest-spread": "^7.18.0", - "@babel/plugin-proposal-optional-catch-binding": "^7.16.7", - "@babel/plugin-proposal-optional-chaining": "^7.17.12", - "@babel/plugin-proposal-private-methods": 
"^7.17.12", - "@babel/plugin-proposal-private-property-in-object": "^7.17.12", - "@babel/plugin-proposal-unicode-property-regex": "^7.17.12", - "@babel/plugin-syntax-async-generators": "^7.8.4", - "@babel/plugin-syntax-class-properties": "^7.12.13", - "@babel/plugin-syntax-class-static-block": "^7.14.5", - "@babel/plugin-syntax-dynamic-import": "^7.8.3", - "@babel/plugin-syntax-export-namespace-from": "^7.8.3", - "@babel/plugin-syntax-import-assertions": "^7.17.12", - "@babel/plugin-syntax-json-strings": "^7.8.3", - "@babel/plugin-syntax-logical-assignment-operators": "^7.10.4", - "@babel/plugin-syntax-nullish-coalescing-operator": "^7.8.3", - "@babel/plugin-syntax-numeric-separator": "^7.10.4", - "@babel/plugin-syntax-object-rest-spread": "^7.8.3", - "@babel/plugin-syntax-optional-catch-binding": "^7.8.3", - "@babel/plugin-syntax-optional-chaining": "^7.8.3", - "@babel/plugin-syntax-private-property-in-object": "^7.14.5", - "@babel/plugin-syntax-top-level-await": "^7.14.5", - "@babel/plugin-transform-arrow-functions": "^7.17.12", - "@babel/plugin-transform-async-to-generator": "^7.17.12", - "@babel/plugin-transform-block-scoped-functions": "^7.16.7", - "@babel/plugin-transform-block-scoping": "^7.17.12", - "@babel/plugin-transform-classes": "^7.17.12", - "@babel/plugin-transform-computed-properties": "^7.17.12", - "@babel/plugin-transform-destructuring": "^7.18.0", - "@babel/plugin-transform-dotall-regex": "^7.16.7", - "@babel/plugin-transform-duplicate-keys": "^7.17.12", - "@babel/plugin-transform-exponentiation-operator": "^7.16.7", - "@babel/plugin-transform-for-of": "^7.18.1", - "@babel/plugin-transform-function-name": "^7.16.7", - "@babel/plugin-transform-literals": "^7.17.12", - "@babel/plugin-transform-member-expression-literals": "^7.16.7", - "@babel/plugin-transform-modules-amd": "^7.18.0", - "@babel/plugin-transform-modules-commonjs": "^7.18.2", - "@babel/plugin-transform-modules-systemjs": "^7.18.0", - "@babel/plugin-transform-modules-umd": "^7.18.0", - "@babel/plugin-transform-named-capturing-groups-regex": "^7.17.12", - "@babel/plugin-transform-new-target": "^7.17.12", - "@babel/plugin-transform-object-super": "^7.16.7", - "@babel/plugin-transform-parameters": "^7.17.12", - "@babel/plugin-transform-property-literals": "^7.16.7", - "@babel/plugin-transform-regenerator": "^7.18.0", - "@babel/plugin-transform-reserved-words": "^7.17.12", - "@babel/plugin-transform-shorthand-properties": "^7.16.7", - "@babel/plugin-transform-spread": "^7.17.12", - "@babel/plugin-transform-sticky-regex": "^7.16.7", - "@babel/plugin-transform-template-literals": "^7.18.2", - "@babel/plugin-transform-typeof-symbol": "^7.17.12", - "@babel/plugin-transform-unicode-escapes": "^7.16.7", - "@babel/plugin-transform-unicode-regex": "^7.16.7", - "@babel/preset-modules": "^0.1.5", - "@babel/types": "^7.18.2", - "babel-plugin-polyfill-corejs2": "^0.3.0", - "babel-plugin-polyfill-corejs3": "^0.5.0", - "babel-plugin-polyfill-regenerator": "^0.3.0", - "core-js-compat": "^3.22.1", - "semver": "^6.3.0" + "@babel/compat-data": "^7.26.8", + "@babel/helper-compilation-targets": "^7.26.5", + "@babel/helper-plugin-utils": "^7.26.5", + "@babel/helper-validator-option": "^7.25.9", + "@babel/plugin-bugfix-firefox-class-in-computed-class-key": "^7.25.9", + "@babel/plugin-bugfix-safari-class-field-initializer-scope": "^7.25.9", + "@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": "^7.25.9", + "@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": "^7.25.9", + 
"@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly": "^7.25.9", + "@babel/plugin-proposal-private-property-in-object": "7.21.0-placeholder-for-preset-env.2", + "@babel/plugin-syntax-import-assertions": "^7.26.0", + "@babel/plugin-syntax-import-attributes": "^7.26.0", + "@babel/plugin-syntax-unicode-sets-regex": "^7.18.6", + "@babel/plugin-transform-arrow-functions": "^7.25.9", + "@babel/plugin-transform-async-generator-functions": "^7.26.8", + "@babel/plugin-transform-async-to-generator": "^7.25.9", + "@babel/plugin-transform-block-scoped-functions": "^7.26.5", + "@babel/plugin-transform-block-scoping": "^7.25.9", + "@babel/plugin-transform-class-properties": "^7.25.9", + "@babel/plugin-transform-class-static-block": "^7.26.0", + "@babel/plugin-transform-classes": "^7.25.9", + "@babel/plugin-transform-computed-properties": "^7.25.9", + "@babel/plugin-transform-destructuring": "^7.25.9", + "@babel/plugin-transform-dotall-regex": "^7.25.9", + "@babel/plugin-transform-duplicate-keys": "^7.25.9", + "@babel/plugin-transform-duplicate-named-capturing-groups-regex": "^7.25.9", + "@babel/plugin-transform-dynamic-import": "^7.25.9", + "@babel/plugin-transform-exponentiation-operator": "^7.26.3", + "@babel/plugin-transform-export-namespace-from": "^7.25.9", + "@babel/plugin-transform-for-of": "^7.26.9", + "@babel/plugin-transform-function-name": "^7.25.9", + "@babel/plugin-transform-json-strings": "^7.25.9", + "@babel/plugin-transform-literals": "^7.25.9", + "@babel/plugin-transform-logical-assignment-operators": "^7.25.9", + "@babel/plugin-transform-member-expression-literals": "^7.25.9", + "@babel/plugin-transform-modules-amd": "^7.25.9", + "@babel/plugin-transform-modules-commonjs": "^7.26.3", + "@babel/plugin-transform-modules-systemjs": "^7.25.9", + "@babel/plugin-transform-modules-umd": "^7.25.9", + "@babel/plugin-transform-named-capturing-groups-regex": "^7.25.9", + "@babel/plugin-transform-new-target": "^7.25.9", + "@babel/plugin-transform-nullish-coalescing-operator": "^7.26.6", + "@babel/plugin-transform-numeric-separator": "^7.25.9", + "@babel/plugin-transform-object-rest-spread": "^7.25.9", + "@babel/plugin-transform-object-super": "^7.25.9", + "@babel/plugin-transform-optional-catch-binding": "^7.25.9", + "@babel/plugin-transform-optional-chaining": "^7.25.9", + "@babel/plugin-transform-parameters": "^7.25.9", + "@babel/plugin-transform-private-methods": "^7.25.9", + "@babel/plugin-transform-private-property-in-object": "^7.25.9", + "@babel/plugin-transform-property-literals": "^7.25.9", + "@babel/plugin-transform-regenerator": "^7.25.9", + "@babel/plugin-transform-regexp-modifiers": "^7.26.0", + "@babel/plugin-transform-reserved-words": "^7.25.9", + "@babel/plugin-transform-shorthand-properties": "^7.25.9", + "@babel/plugin-transform-spread": "^7.25.9", + "@babel/plugin-transform-sticky-regex": "^7.25.9", + "@babel/plugin-transform-template-literals": "^7.26.8", + "@babel/plugin-transform-typeof-symbol": "^7.26.7", + "@babel/plugin-transform-unicode-escapes": "^7.25.9", + "@babel/plugin-transform-unicode-property-regex": "^7.25.9", + "@babel/plugin-transform-unicode-regex": "^7.25.9", + "@babel/plugin-transform-unicode-sets-regex": "^7.25.9", + "@babel/preset-modules": "0.1.6-no-external-plugins", + "babel-plugin-polyfill-corejs2": "^0.4.10", + "babel-plugin-polyfill-corejs3": "^0.11.0", + "babel-plugin-polyfill-regenerator": "^0.6.1", + "core-js-compat": "^3.40.0", + "semver": "^6.3.1" }, "engines": { "node": ">=6.9.0" @@ -1938,13 +2021,15 @@ } }, 
"node_modules/@babel/preset-flow": { - "version": "7.17.12", + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/preset-flow/-/preset-flow-7.25.9.tgz", + "integrity": "sha512-EASHsAhE+SSlEzJ4bzfusnXSHiU+JfAYzj+jbw2vgQKgq5HrUr8qs+vgtiEL5dOH6sEweI+PNt2D7AqrDSHyqQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-validator-option": "^7.16.7", - "@babel/plugin-transform-flow-strip-types": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-option": "^7.25.9", + "@babel/plugin-transform-flow-strip-types": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -1954,29 +2039,53 @@ } }, "node_modules/@babel/preset-modules": { - "version": "0.1.5", + "version": "0.1.6-no-external-plugins", + "resolved": "https://registry.npmjs.org/@babel/preset-modules/-/preset-modules-0.1.6-no-external-plugins.tgz", + "integrity": "sha512-HrcgcIESLm9aIR842yhJ5RWan/gebQUJ6E/E5+rf0y9o6oj7w0Br+sWuL6kEQ/o/AdfvR1Je9jG18/gnpwjEyA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/helper-plugin-utils": "^7.0.0", - "@babel/plugin-proposal-unicode-property-regex": "^7.4.4", - "@babel/plugin-transform-dotall-regex": "^7.4.4", "@babel/types": "^7.4.4", "esutils": "^2.0.2" }, + "peerDependencies": { + "@babel/core": "^7.0.0-0 || ^8.0.0-0 <8.0.0" + } + }, + "node_modules/@babel/preset-react": { + "version": "7.26.3", + "resolved": "https://registry.npmjs.org/@babel/preset-react/-/preset-react-7.26.3.tgz", + "integrity": "sha512-Nl03d6T9ky516DGK2YMxrTqvnpUW63TnJMOMonj+Zae0JiPC5BC9xPMSL6L8fiSpA5vP88qfygavVQvnLp+6Cw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-option": "^7.25.9", + "@babel/plugin-transform-react-display-name": "^7.25.9", + "@babel/plugin-transform-react-jsx": "^7.25.9", + "@babel/plugin-transform-react-jsx-development": "^7.25.9", + "@babel/plugin-transform-react-pure-annotations": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, "node_modules/@babel/preset-typescript": { - "version": "7.17.12", + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/preset-typescript/-/preset-typescript-7.26.0.tgz", + "integrity": "sha512-NMk1IGZ5I/oHhoXEElcm+xUnL/szL6xflkFZmoEU9xj1qSJXpiS7rsspYo92B4DRCDvZn2erT5LdsCeXAKNCkg==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-plugin-utils": "^7.17.12", - "@babel/helper-validator-option": "^7.16.7", - "@babel/plugin-transform-typescript": "^7.17.12" + "@babel/helper-plugin-utils": "^7.25.9", + "@babel/helper-validator-option": "^7.25.9", + "@babel/plugin-syntax-jsx": "^7.25.9", + "@babel/plugin-transform-modules-commonjs": "^7.25.9", + "@babel/plugin-transform-typescript": "^7.25.9" }, "engines": { "node": ">=6.9.0" @@ -2235,8 +2344,46 @@ "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/resolve-uri": "^3.1.0", - "@jridgewell/sourcemap-codec": "^1.4.14" + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + 
"node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" } }, "node_modules/@react-native-community/cli": { @@ -3000,6 +3147,20 @@ "node": ">=0.4.0" } }, + "node_modules/aggregate-error": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/aggregate-error/-/aggregate-error-3.1.0.tgz", + "integrity": "sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA==", + "dev": true, + "license": "MIT", + "dependencies": { + "clean-stack": "^2.0.0", + "indent-string": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/anser": { "version": "1.4.10", "dev": true, @@ -3089,6 +3250,16 @@ "sprintf-js": "~1.0.2" } }, + "node_modules/array-union": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", + "integrity": "sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/asap": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/asap/-/asap-2.0.6.tgz", @@ -3134,51 +3305,105 @@ "@babel/core": "^7.0.0-0" } }, - "node_modules/babel-plugin-dynamic-import-node": { - "version": "2.3.3", + "node_modules/babel-plugin-module-resolver": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/babel-plugin-module-resolver/-/babel-plugin-module-resolver-5.0.2.tgz", + "integrity": "sha512-9KtaCazHee2xc0ibfqsDeamwDps6FZNo5S0Q81dUqEuFzVwPhcT4J5jOqIVvgCA3Q/wO9hKYxN/Ds3tIsp5ygg==", + "dev": true, + "license": "MIT", + "dependencies": { + "find-babel-config": "^2.1.1", + "glob": "^9.3.3", + "pkg-up": "^3.1.0", + "reselect": "^4.1.7", + "resolve": "^1.22.8" + } + }, + "node_modules/babel-plugin-module-resolver/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", "dev": true, "license": "MIT", "dependencies": { - "object.assign": "^4.1.0" + "balanced-match": "^1.0.0" + } + }, + "node_modules/babel-plugin-module-resolver/node_modules/glob": { + "version": "9.3.5", + "resolved": "https://registry.npmjs.org/glob/-/glob-9.3.5.tgz", + "integrity": "sha512-e1LleDykUz2Iu+MTYdkSsuWX8lvAjAcs0Xef0lNIu0S2wOAzuTxCJtcd9S3cijlwYF18EsU3rzb8jPVobxDh9Q==", + "dev": true, + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "minimatch": "^8.0.2", + "minipass": "^4.2.4", + "path-scurry": "^1.6.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/babel-plugin-module-resolver/node_modules/minimatch": { + "version": "8.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-8.0.4.tgz", + "integrity": 
"sha512-W0Wvr9HyFXZRGIDgCicunpQ299OKXs9RgZfaukz4qAW/pJhcpUfupc9c+OObPOFueNy8VSrZgEmDtk6Kh4WzDA==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" } }, "node_modules/babel-plugin-polyfill-corejs2": { - "version": "0.3.1", + "version": "0.4.12", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs2/-/babel-plugin-polyfill-corejs2-0.4.12.tgz", + "integrity": "sha512-CPWT6BwvhrTO2d8QVorhTCQw9Y43zOu7G9HigcfxvepOU6b8o3tcWad6oVgZIsZCTt42FFv97aA7ZJsbM4+8og==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/compat-data": "^7.13.11", - "@babel/helper-define-polyfill-provider": "^0.3.1", - "semver": "^6.1.1" + "@babel/compat-data": "^7.22.6", + "@babel/helper-define-polyfill-provider": "^0.6.3", + "semver": "^6.3.1" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, "node_modules/babel-plugin-polyfill-corejs3": { - "version": "0.5.2", + "version": "0.11.1", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs3/-/babel-plugin-polyfill-corejs3-0.11.1.tgz", + "integrity": "sha512-yGCqvBT4rwMczo28xkH/noxJ6MZ4nJfkVYdoDaC/utLtWrXxv27HVrzAeSbqR8SxDsp46n0YF47EbHoixy6rXQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-define-polyfill-provider": "^0.3.1", - "core-js-compat": "^3.21.0" + "@babel/helper-define-polyfill-provider": "^0.6.3", + "core-js-compat": "^3.40.0" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, "node_modules/babel-plugin-polyfill-regenerator": { - "version": "0.3.1", + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-regenerator/-/babel-plugin-polyfill-regenerator-0.6.3.tgz", + "integrity": "sha512-LiWSbl4CRSIa5x/JAU6jZiG9eit9w6mz+yVMFwDE83LAWvt0AfGBoZ7HS/mkhrKuh2ZlzfVZYKoLjXdqw6Yt7Q==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "@babel/helper-define-polyfill-provider": "^0.3.1" + "@babel/helper-define-polyfill-provider": "^0.6.3" }, "peerDependencies": { - "@babel/core": "^7.0.0-0" + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, "node_modules/babel-plugin-transform-flow-enums": { @@ -3272,9 +3497,9 @@ } }, "node_modules/browserslist": { - "version": "4.23.3", - "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.23.3.tgz", - "integrity": "sha512-btwCFJVjI4YWDNfau8RhZ+B1Q/VLoUITrm3RlP6y1tYGWIOa+InuYiRGXUBXo8nA1qKmHMyLB/iVQg5TT4eFoA==", + "version": "4.24.4", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.24.4.tgz", + "integrity": "sha512-KDi1Ny1gSePi1vm0q4oxSF8b4DR44GF4BbmS2YdhPLOEqd8pDviZOGH/GsmRwoWJ2+5Lr085X7naowMwKHDG1A==", "dev": true, "funding": [ { @@ -3292,10 +3517,10 @@ ], "license": "MIT", "dependencies": { - "caniuse-lite": "^1.0.30001646", - "electron-to-chromium": "^1.5.4", - "node-releases": "^2.0.18", - "update-browserslist-db": "^1.1.0" + "caniuse-lite": "^1.0.30001688", + "electron-to-chromium": "^1.5.73", + "node-releases": "^2.0.19", + "update-browserslist-db": "^1.1.1" }, "bin": { "browserslist": "cli.js" @@ -3349,18 +3574,6 @@ "node": ">= 0.8" } }, - "node_modules/call-bind": { - "version": "1.0.2", - "dev": true, - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.1", - "get-intrinsic": "^1.0.2" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, 
"node_modules/caller-callsite": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/caller-callsite/-/caller-callsite-2.0.0.tgz", @@ -3397,6 +3610,16 @@ "node": ">=4" } }, + "node_modules/callsites": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/camelcase": { "version": "5.3.1", "dev": true, @@ -3406,9 +3629,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001660", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001660.tgz", - "integrity": "sha512-GacvNTTuATm26qC74pt+ad1fW15mlQ/zuTzzY1ZoIzECTP8HURDfF43kNxPgf7H1jmelCBQTTbBNxdSXOA7Bqg==", + "version": "1.0.30001704", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001704.tgz", + "integrity": "sha512-+L2IgBbV6gXB4ETf0keSvLr7JUrRVbIaB/lrQ1+z8mRcQiisG5k+lG6O4n6Y5q6f5EuNfaYXKgymucphlEXQew==", "dev": true, "funding": [ { @@ -3460,19 +3683,6 @@ "node": ">=12.13.0" } }, - "node_modules/chrome-launcher/node_modules/escape-string-regexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", - "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/chromium-edge-launcher": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/chromium-edge-launcher/-/chromium-edge-launcher-1.0.0.tgz", @@ -3488,19 +3698,6 @@ "rimraf": "^3.0.2" } }, - "node_modules/chromium-edge-launcher/node_modules/escape-string-regexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", - "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/chromium-edge-launcher/node_modules/mkdirp": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-1.0.4.tgz", @@ -3519,6 +3716,16 @@ "dev": true, "license": "MIT" }, + "node_modules/clean-stack": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/clean-stack/-/clean-stack-2.2.0.tgz", + "integrity": "sha512-4diC9HaTE+KRAMWhDhrGOECgWZxoevMc5TlkObMqNSsVU62PYzXZ/SMTjzyGAFF1YusgxGcSWTEXBhp0CPwQ1A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/cli-cursor": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-3.1.0.tgz", @@ -3707,13 +3914,13 @@ } }, "node_modules/core-js-compat": { - "version": "3.38.1", - "resolved": "https://registry.npmjs.org/core-js-compat/-/core-js-compat-3.38.1.tgz", - "integrity": "sha512-JRH6gfXxGmrzF3tZ57lFx97YARxCXPaMzPo6jELZhv88pBH5VXpQ+y0znKGlFnzuaihqhLbefxSJxWJMPtfDzw==", + "version": "3.41.0", + "resolved": "https://registry.npmjs.org/core-js-compat/-/core-js-compat-3.41.0.tgz", + "integrity": "sha512-RFsU9LySVue9RTwdDVX/T0e2Y6jRYWXERKElIjpuEOEnxaXffI0X7RUwVzfYLfzuLXSNJDYoRYUAmRUcyln20A==", "dev": true, "license": "MIT", "dependencies": { - "browserslist": "^4.23.3" + "browserslist": "^4.24.4" 
}, "funding": { "type": "opencollective", @@ -3836,6 +4043,13 @@ "node": ">=0.10.0" } }, + "node_modules/dedent": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/dedent/-/dedent-0.7.0.tgz", + "integrity": "sha512-Q6fKUPqnAHAyhiUgFU7BUzLiv0kd8saH9al7tnu5Q/okj6dnupxyTgFIBjVzJATdfIAm9NAsvXNzjaKa+bxVyA==", + "dev": true, + "license": "MIT" + }, "node_modules/deepmerge": { "version": "4.3.1", "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", @@ -3859,19 +4073,27 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/define-properties": { - "version": "1.1.4", + "node_modules/del": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/del/-/del-6.1.1.tgz", + "integrity": "sha512-ua8BhapfP0JUJKC/zV9yHHDW/rDoDxP4Zhn3AkA6/xT6gY7jYXJiaeyBZznYVujhZZET+UgcbZiQ7sN3WqcImg==", "dev": true, "license": "MIT", "dependencies": { - "has-property-descriptors": "^1.0.0", - "object-keys": "^1.1.1" + "globby": "^11.0.1", + "graceful-fs": "^4.2.4", + "is-glob": "^4.0.1", + "is-path-cwd": "^2.2.0", + "is-path-inside": "^3.0.2", + "p-map": "^4.0.0", + "rimraf": "^3.0.2", + "slash": "^3.0.0" }, "engines": { - "node": ">= 0.4" + "node": ">=10" }, "funding": { - "url": "https://github.com/sponsors/ljharb" + "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/denodeify": { @@ -3917,6 +4139,19 @@ "npm": "1.2.8000 || >= 1.4.16" } }, + "node_modules/dir-glob": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", + "integrity": "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==", + "dev": true, + "license": "MIT", + "dependencies": { + "path-type": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/ee-first": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", @@ -3925,9 +4160,9 @@ "license": "MIT" }, "node_modules/electron-to-chromium": { - "version": "1.5.24", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.24.tgz", - "integrity": "sha512-0x0wLCmpdKFCi9ulhvYZebgcPmHTkFVUfU2wzDykadkslKwT4oAmDTHEKLnlrDsMGZe4B+ksn8quZfZjYsBetA==", + "version": "1.5.118", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.118.tgz", + "integrity": "sha512-yNDUus0iultYyVoEFLnQeei7LOQkL8wg8GQpkPCRrOlJXlcCwa6eGKZkxQ9ciHsqZyYbj8Jd94X1CTPzGm+uIA==", "dev": true, "license": "ISC" }, @@ -3946,6 +4181,26 @@ "node": ">= 0.8" } }, + "node_modules/end-of-stream": { + "version": "1.4.4", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", + "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/env-paths": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", + "integrity": "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/envinfo": { "version": "7.14.0", "resolved": "https://registry.npmjs.org/envinfo/-/envinfo-7.14.0.tgz", @@ -4008,6 +4263,19 @@ "dev": true, "license": "MIT" }, + "node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": 
"sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/esprima": { "version": "4.0.1", "dev": true, @@ -4022,9 +4290,10 @@ }, "node_modules/esutils": { "version": "2.0.3", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", + "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", "dev": true, "license": "BSD-2-Clause", - "peer": true, "engines": { "node": ">=0.10.0" } @@ -4088,6 +4357,23 @@ "dev": true, "license": "Apache-2.0" }, + "node_modules/fast-glob": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz", + "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.8" + }, + "engines": { + "node": ">=8.6.0" + } + }, "node_modules/fast-xml-parser": { "version": "4.5.3", "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.3.tgz", @@ -4107,6 +4393,16 @@ "fxparser": "src/cli/cli.js" } }, + "node_modules/fastq": { + "version": "1.19.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.1.tgz", + "integrity": "sha512-GwLTyxkCXjXbxqIhTsMI2Nui8huMPtnxg7krajPJAjnEG/iiOS7i+zCtWGZR9G0NBKbXKh6X9m9UIsYX/N6vvQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "reusify": "^1.0.4" + } + }, "node_modules/fb-watchman": { "version": "2.0.1", "dev": true, @@ -4145,6 +4441,16 @@ "node": ">= 0.8" } }, + "node_modules/find-babel-config": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/find-babel-config/-/find-babel-config-2.1.2.tgz", + "integrity": "sha512-ZfZp1rQyp4gyuxqt1ZqjFGVeVBvmpURMqdIWXbPRfB97Bf6BzdK/xSIbylEINzQ0kB5tlDQfn9HkNXXWsqTqLg==", + "dev": true, + "license": "MIT", + "dependencies": { + "json5": "^2.2.3" + } + }, "node_modules/find-cache-dir": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-2.1.0.tgz", @@ -4305,9 +4611,14 @@ } }, "node_modules/function-bind": { - "version": "1.1.1", + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", "dev": true, - "license": "MIT" + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } }, "node_modules/gensync": { "version": "1.0.0-beta.2", @@ -4325,19 +4636,6 @@ "node": "6.* || 8.* || >= 10.*" } }, - "node_modules/get-intrinsic": { - "version": "1.1.1", - "dev": true, - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.1", - "has": "^1.0.3", - "has-symbols": "^1.0.1" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/get-stream": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-6.0.1.tgz", @@ -4370,6 +4668,19 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": 
"sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/globals": { "version": "11.12.0", "dev": true, @@ -4378,22 +4689,32 @@ "node": ">=4" } }, - "node_modules/graceful-fs": { - "version": "4.2.10", - "dev": true, - "license": "ISC" - }, - "node_modules/has": { - "version": "1.0.3", + "node_modules/globby": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/globby/-/globby-11.1.0.tgz", + "integrity": "sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g==", "dev": true, "license": "MIT", "dependencies": { - "function-bind": "^1.1.1" + "array-union": "^2.1.0", + "dir-glob": "^3.0.1", + "fast-glob": "^3.2.9", + "ignore": "^5.2.0", + "merge2": "^1.4.1", + "slash": "^3.0.0" }, "engines": { - "node": ">= 0.4.0" + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/graceful-fs": { + "version": "4.2.10", + "dev": true, + "license": "ISC" + }, "node_modules/has-flag": { "version": "4.0.0", "dev": true, @@ -4402,26 +4723,17 @@ "node": ">=8" } }, - "node_modules/has-property-descriptors": { - "version": "1.0.0", + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", "dev": true, "license": "MIT", "dependencies": { - "get-intrinsic": "^1.1.1" + "function-bind": "^1.1.2" }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-symbols": { - "version": "1.0.3", - "dev": true, - "license": "MIT", "engines": { "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" } }, "node_modules/hermes-estree": { @@ -4491,6 +4803,16 @@ "node": ">= 0.8" } }, + "node_modules/human-signals": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-1.1.1.tgz", + "integrity": "sha512-SEQu7vl8KjNL2eoGBLF3+wAjpsNfA9XMlXAYj/3EdaNfAlxKthD1xjEQfGOUhllCGGJVNY34bRr6lPINhNjyZw==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=8.12.0" + } + }, "node_modules/ieee754": { "version": "1.2.1", "funding": [ @@ -4509,6 +4831,16 @@ ], "license": "BSD-3-Clause" }, + "node_modules/ignore": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", + "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 4" + } + }, "node_modules/image-size": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.2.0.tgz", @@ -4521,8 +4853,25 @@ "bin": { "image-size": "bin/image-size.js" }, - "engines": { - "node": ">=16.x" + "engines": { + "node": ">=16.x" + } + }, + "node_modules/import-fresh": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", + "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/imurmurhash": { @@ 
-4533,6 +4882,16 @@ "node": ">=0.8.19" } }, + "node_modules/indent-string": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", + "integrity": "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/inflight": { "version": "1.0.6", "dev": true, @@ -4557,17 +4916,36 @@ "loose-envify": "^1.0.0" } }, + "node_modules/is-absolute": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-absolute/-/is-absolute-1.0.0.tgz", + "integrity": "sha512-dOWoqflvcydARa360Gvv18DZ/gRuHKi2NU/wU5X1ZFzdYfH29nkiNZsF3mp4OJ3H4yo9Mx8A/uAGNzpzPN3yBA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-relative": "^1.0.0", + "is-windows": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-arrayish": { "version": "0.2.1", "dev": true, "license": "MIT" }, "node_modules/is-core-module": { - "version": "2.9.0", + "version": "2.16.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", + "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==", "dev": true, "license": "MIT", "dependencies": { - "has": "^1.0.3" + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -4599,6 +4977,16 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-fullwidth-code-point": { "version": "3.0.0", "dev": true, @@ -4607,6 +4995,124 @@ "node": ">=8" } }, + "node_modules/is-git-dirty": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/is-git-dirty/-/is-git-dirty-2.0.2.tgz", + "integrity": "sha512-U3YCo+GKR/rDsY7r0v/LBICbQwsx859tDQnAT+v0E/zCDeWbQ1TUt1FtyExeyik7VIJlYOLHCIifLdz71HDalg==", + "dev": true, + "license": "MIT", + "dependencies": { + "execa": "^4.0.3", + "is-git-repository": "^2.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/is-git-dirty/node_modules/execa": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-4.1.0.tgz", + "integrity": "sha512-j5W0//W7f8UxAn8hXVnwG8tLwdiUy4FJLcSupCg6maBYZDpyBvTApK7KyuI4bKj8KOh1r2YH+6ucuYtJv1bTZA==", + "dev": true, + "license": "MIT", + "dependencies": { + "cross-spawn": "^7.0.0", + "get-stream": "^5.0.0", + "human-signals": "^1.1.1", + "is-stream": "^2.0.0", + "merge-stream": "^2.0.0", + "npm-run-path": "^4.0.0", + "onetime": "^5.1.0", + "signal-exit": "^3.0.2", + "strip-final-newline": "^2.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/execa?sponsor=1" + } + }, + "node_modules/is-git-dirty/node_modules/get-stream": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz", + "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==", + "dev": true, + "license": "MIT", + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + 
"node_modules/is-git-repository": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-git-repository/-/is-git-repository-2.0.0.tgz", + "integrity": "sha512-HDO50CG5suIAcmqG4F1buqVXEZRPn+RaXIn9pFKq/947FBo2bCRwK7ZluEVZOy99a4IQyqsjbKEpAiOXCccOHQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "execa": "^4.0.3", + "is-absolute": "^1.0.0" + } + }, + "node_modules/is-git-repository/node_modules/execa": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-4.1.0.tgz", + "integrity": "sha512-j5W0//W7f8UxAn8hXVnwG8tLwdiUy4FJLcSupCg6maBYZDpyBvTApK7KyuI4bKj8KOh1r2YH+6ucuYtJv1bTZA==", + "dev": true, + "license": "MIT", + "dependencies": { + "cross-spawn": "^7.0.0", + "get-stream": "^5.0.0", + "human-signals": "^1.1.1", + "is-stream": "^2.0.0", + "merge-stream": "^2.0.0", + "npm-run-path": "^4.0.0", + "onetime": "^5.1.0", + "signal-exit": "^3.0.2", + "strip-final-newline": "^2.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/execa?sponsor=1" + } + }, + "node_modules/is-git-repository/node_modules/get-stream": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz", + "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==", + "dev": true, + "license": "MIT", + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-interactive": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/is-interactive/-/is-interactive-1.0.0.tgz", @@ -4617,6 +5123,26 @@ "node": ">=8" } }, + "node_modules/is-path-cwd": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/is-path-cwd/-/is-path-cwd-2.2.0.tgz", + "integrity": "sha512-w942bTcih8fdJPJmQHFzkS76NEP8Kzzvmw92cXsazb8intwLqPibPPdXf4ANdKV3rYMuuQYGIWtvz9JilB3NFQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/is-path-inside": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/is-path-inside/-/is-path-inside-3.0.3.tgz", + "integrity": "sha512-Fd4gABb+ycGAmKou8eMftCupSir5lRxqf4aD/vd0cD2qc4HL07OjCeuHMr8Ro4CoMaeCKDB0/ECBOVWjTwUvPQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/is-plain-object": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", @@ -4630,6 +5156,19 @@ "node": ">=0.10.0" } }, + "node_modules/is-relative": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-relative/-/is-relative-1.0.0.tgz", + "integrity": "sha512-Kw/ReK0iqwKeu0MITLFuj0jbPAmEiOsIwyIXvvbfa6QfmN9pkD1M+8pdk7Rl/dTKbH34/XBFMbgD4iMJhLQbGA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-unc-path": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-stream": { "version": "2.0.1", "dev": true, @@ -4641,6 +5180,19 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/is-unc-path": { + "version": "1.0.0", + "resolved": 
"https://registry.npmjs.org/is-unc-path/-/is-unc-path-1.0.0.tgz", + "integrity": "sha512-mrGpVd0fs7WWLfVsStvgF6iEJnbjDFZh9/emhRDcGWTduTfNHd9CHeUwH3gYIjdbwo4On6hunkztwOaAw0yllQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "unc-path-regex": "^0.1.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-unicode-supported": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/is-unicode-supported/-/is-unicode-supported-0.1.0.tgz", @@ -4654,6 +5206,16 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/is-windows": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-windows/-/is-windows-1.0.2.tgz", + "integrity": "sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-wsl": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-2.2.0.tgz", @@ -4788,6 +5350,13 @@ "dev": true, "license": "MIT" }, + "node_modules/json-parse-even-better-errors": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", + "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==", + "dev": true, + "license": "MIT" + }, "node_modules/json5": { "version": "2.2.3", "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", @@ -4848,6 +5417,13 @@ "marky": "^1.2.2" } }, + "node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "dev": true, + "license": "MIT" + }, "node_modules/locate-path": { "version": "5.0.0", "dev": true, @@ -4985,6 +5561,13 @@ "loose-envify": "cli.js" } }, + "node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", + "dev": true, + "license": "ISC" + }, "node_modules/make-dir": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-2.1.0.tgz", @@ -5034,6 +5617,16 @@ "dev": true, "license": "MIT" }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, "node_modules/metro": { "version": "0.80.12", "resolved": "https://registry.npmjs.org/metro/-/metro-0.80.12.tgz", @@ -5752,6 +6345,16 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/minipass": { + "version": "4.2.8", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-4.2.8.tgz", + "integrity": "sha512-fNzuVyifolSLFL4NzpF+wEF4qrgqaaKX0haXPQEdQ7NKAN+WecoKMHV09YcuL/DHxrUsYQOK3MiuDf7Ip2OXfQ==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=8" + } + }, "node_modules/mkdirp": { "version": "0.5.6", "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", @@ -5871,9 +6474,9 @@ "license": "MIT" }, "node_modules/node-releases": { - "version": "2.0.18", - "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.18.tgz", - "integrity": 
"sha512-d9VeXT4SJ7ZeOqGX6R5EM022wpL+eWPooLI+5UpWn2jCT1aosUQEhQP214x33Wkwx3JQMvIm+tIoVOdodFS40g==", + "version": "2.0.19", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.19.tgz", + "integrity": "sha512-xxOWJsBKtzAq7DY0J+DTzuz58K8e7sJbdgwkbMWQe8UYB6ekmsQ45q0M/tJDsGaZmbC+l7n57UV8Hl5tHxO9uw==", "dev": true, "license": "MIT" }, @@ -5948,31 +6551,6 @@ "node": ">=0.10.0" } }, - "node_modules/object-keys": { - "version": "1.1.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/object.assign": { - "version": "4.1.2", - "dev": true, - "license": "MIT", - "dependencies": { - "call-bind": "^1.0.0", - "define-properties": "^1.1.3", - "has-symbols": "^1.0.1", - "object-keys": "^1.1.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/on-finished": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz", @@ -6018,6 +6596,10 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/onnxruntime-common": { + "resolved": "../common", + "link": true + }, "node_modules/open": { "version": "6.4.0", "resolved": "https://registry.npmjs.org/open/-/open-6.4.0.tgz", @@ -6090,6 +6672,22 @@ "node": ">=8" } }, + "node_modules/p-map": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/p-map/-/p-map-4.0.0.tgz", + "integrity": "sha512-/bjOqmgETBYB5BoEeGVea8dmvHb2m9GLy1E9W43yeyfP6QQCZGFNa+XRceJEuDB6zqr+gKpIAmlLebMpykw/MQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "aggregate-error": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/p-try": { "version": "2.2.0", "dev": true, @@ -6098,6 +6696,38 @@ "node": ">=6" } }, + "node_modules/parent-module": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", + "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", + "dev": true, + "license": "MIT", + "dependencies": { + "callsites": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/parse-json": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", + "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.0.0", + "error-ex": "^1.3.1", + "json-parse-even-better-errors": "^2.3.0", + "lines-and-columns": "^1.1.6" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -6111,28 +6741,65 @@ "node_modules/path-exists": { "version": "4.0.0", "dev": true, - "license": "MIT", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/path-parse": { + "version": "1.0.7", + "dev": true, + "license": "MIT" + }, + "node_modules/path-scurry": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", + "integrity": 
"sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "lru-cache": "^10.2.0", + "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" + }, + "engines": { + "node": ">=16 || 14 >=14.18" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/path-scurry/node_modules/minipass": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", + "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "dev": true, + "license": "ISC", "engines": { - "node": ">=8" + "node": ">=16 || 14 >=14.17" } }, - "node_modules/path-is-absolute": { - "version": "1.0.1", + "node_modules/path-type": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==", "dev": true, "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=8" } }, - "node_modules/path-parse": { - "version": "1.0.7", - "dev": true, - "license": "MIT" - }, "node_modules/picocolors": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.0.tgz", - "integrity": "sha512-TQ92mBOW0l3LeMeyLV6mzy/kWr8lkd/hp3mTg7wYK7zJhuBStmGMBG0BdeDZS/dZx1IukaX6Bk11zcln25o1Aw==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", "dev": true, "license": "ISC" }, @@ -6167,6 +6834,69 @@ "node": ">= 6" } }, + "node_modules/pkg-up": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/pkg-up/-/pkg-up-3.1.0.tgz", + "integrity": "sha512-nDywThFk1i4BQK4twPQ6TA4RT8bDY96yeuCVBWL3ePARCiEKDRSrNGbFIgUJpLp+XeIR65v8ra7WuJOFUBtkMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "find-up": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/pkg-up/node_modules/find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "license": "MIT", + "dependencies": { + "locate-path": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-up/node_modules/locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-up/node_modules/p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-limit": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/pkg-up/node_modules/path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==", + "dev": 
true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/pod-install": { "version": "0.1.36", "dev": true, @@ -6239,6 +6969,17 @@ "dev": true, "license": "MIT" }, + "node_modules/pump": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.2.tgz", + "integrity": "sha512-tUPXtzlGM8FE3P0ZL6DVs/3P58k9nk8/jZeQCurTJylQA8qFYzHFfhBJkuqyE0FifOsQ0uKWekiZ5g8wtr28cw==", + "dev": true, + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "node_modules/queue": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/queue/-/queue-6.0.2.tgz", @@ -6249,6 +6990,27 @@ "inherits": "~2.0.3" } }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, "node_modules/range-parser": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", @@ -6344,6 +7106,172 @@ "react": "18.2.0" } }, + "node_modules/react-native-builder-bob": { + "version": "0.37.0", + "resolved": "https://registry.npmjs.org/react-native-builder-bob/-/react-native-builder-bob-0.37.0.tgz", + "integrity": "sha512-CkM4csFrYtdGJoRLbPY6V8LBbOxgPZIuM0MkPaiOI2F/ASwxMAzoJu9wBw8Pyvx1p27XnrIEKPyDiTqimJ7xbA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/core": "^7.25.2", + "@babel/plugin-transform-strict-mode": "^7.24.7", + "@babel/preset-env": "^7.25.2", + "@babel/preset-flow": "^7.24.7", + "@babel/preset-react": "^7.24.7", + "@babel/preset-typescript": "^7.24.7", + "babel-plugin-module-resolver": "^5.0.2", + "browserslist": "^4.20.4", + "cosmiconfig": "^9.0.0", + "cross-spawn": "^7.0.3", + "dedent": "^0.7.0", + "del": "^6.1.1", + "escape-string-regexp": "^4.0.0", + "fs-extra": "^10.1.0", + "glob": "^8.0.3", + "is-git-dirty": "^2.0.1", + "json5": "^2.2.1", + "kleur": "^4.1.4", + "metro-config": "^0.80.9", + "prompts": "^2.4.2", + "which": "^2.0.2", + "yargs": "^17.5.1" + }, + "bin": { + "bob": "bin/bob" + }, + "engines": { + "node": ">= 20.0.0" + } + }, + "node_modules/react-native-builder-bob/node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "dev": true, + "license": "Python-2.0" + }, + "node_modules/react-native-builder-bob/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/react-native-builder-bob/node_modules/cosmiconfig": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", + "integrity": "sha512-itvL5h8RETACmOTFc4UfIyB2RfEHi71Ax6E/PivVxq9NseKbOWpeyHEOIbmAw1rs8Ak0VursQNww7lf7YtUwzg==", + "dev": true, + "license": "MIT", + "dependencies": { + "env-paths": "^2.2.1", + 
"import-fresh": "^3.3.0", + "js-yaml": "^4.1.0", + "parse-json": "^5.2.0" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/d-fischer" + }, + "peerDependencies": { + "typescript": ">=4.9.5" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/react-native-builder-bob/node_modules/fs-extra": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz", + "integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^2.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/react-native-builder-bob/node_modules/glob": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/glob/-/glob-8.1.0.tgz", + "integrity": "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==", + "deprecated": "Glob versions prior to v9 are no longer supported", + "dev": true, + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^5.0.1", + "once": "^1.3.0" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/react-native-builder-bob/node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/react-native-builder-bob/node_modules/jsonfile": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz", + "integrity": "sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "universalify": "^2.0.0" + }, + "optionalDependencies": { + "graceful-fs": "^4.1.6" + } + }, + "node_modules/react-native-builder-bob/node_modules/kleur": { + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/kleur/-/kleur-4.1.5.tgz", + "integrity": "sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/react-native-builder-bob/node_modules/minimatch": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", + "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/react-native/node_modules/@jest/environment": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/@jest/environment/-/environment-29.7.0.tgz", @@ -6784,11 +7712,15 @@ }, "node_modules/regenerate": { "version": "1.4.2", + "resolved": "https://registry.npmjs.org/regenerate/-/regenerate-1.4.2.tgz", + "integrity": "sha512-zrceR/XhGYU/d/opr2EKO7aRHUeiBI8qjtfHqADTwZd6Szfy16la6kqD0MIUs5z5hx6AaKa+PixpPrR289+I0A==", "dev": true, "license": "MIT" }, "node_modules/regenerate-unicode-properties": { - "version": "10.0.1", + "version": "10.2.0", + 
"resolved": "https://registry.npmjs.org/regenerate-unicode-properties/-/regenerate-unicode-properties-10.2.0.tgz", + "integrity": "sha512-DqHn3DwbmmPVzeKj9woBadqmXxLvQoQIwu7nopMc72ztvxVmVk2SBhSnx67zuye5TP+lJsb/TBQsjLKhnDf3MA==", "dev": true, "license": "MIT", "dependencies": { @@ -6809,48 +7741,59 @@ "integrity": "sha512-hfMp2BoF0qOk3uc5V20ALGDS2ddjQaLrdl7xrGXvAIow7qeWRM2VA2HuCHkUKk9slq3VwEwLNK3DFBqDfPGYtg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/runtime": "^7.8.4" } }, "node_modules/regexpu-core": { - "version": "5.0.1", + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/regexpu-core/-/regexpu-core-6.2.0.tgz", + "integrity": "sha512-H66BPQMrv+V16t8xtmq+UC0CBpiTBA60V8ibS1QVReIp8T1z8hwFxqcGzm9K6lgsN7sB5edVH8a+ze6Fqm4weA==", "dev": true, "license": "MIT", "dependencies": { "regenerate": "^1.4.2", - "regenerate-unicode-properties": "^10.0.1", - "regjsgen": "^0.6.0", - "regjsparser": "^0.8.2", + "regenerate-unicode-properties": "^10.2.0", + "regjsgen": "^0.8.0", + "regjsparser": "^0.12.0", "unicode-match-property-ecmascript": "^2.0.0", - "unicode-match-property-value-ecmascript": "^2.0.0" + "unicode-match-property-value-ecmascript": "^2.1.0" }, "engines": { "node": ">=4" } }, "node_modules/regjsgen": { - "version": "0.6.0", + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/regjsgen/-/regjsgen-0.8.0.tgz", + "integrity": "sha512-RvwtGe3d7LvWiDQXeQw8p5asZUmfU1G/l6WbUXeHta7Y2PEIvBTwH6E2EfmYUK8pxcxEdEmaomqyp0vZZ7C+3Q==", "dev": true, "license": "MIT" }, "node_modules/regjsparser": { - "version": "0.8.4", + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/regjsparser/-/regjsparser-0.12.0.tgz", + "integrity": "sha512-cnE+y8bz4NhMjISKbgeVJtqNbtf5QpjZP+Bslo+UqkIt9QPnX9q095eiRRASJG1/tz6dlNr6Z5NsBiWYokp6EQ==", "dev": true, "license": "BSD-2-Clause", "dependencies": { - "jsesc": "~0.5.0" + "jsesc": "~3.0.2" }, "bin": { "regjsparser": "bin/parser" } }, "node_modules/regjsparser/node_modules/jsesc": { - "version": "0.5.0", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.0.2.tgz", + "integrity": "sha512-xKqzzWXDttJuOcawBt4KnKHHIf5oQ/Cxax+0PWFG+DFDgHNAdi+TXECADI+RYiFUMmx8792xsMbbgXj4CwnP4g==", "dev": true, + "license": "MIT", "bin": { "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=6" } }, "node_modules/require-directory": { @@ -6868,22 +7811,44 @@ "dev": true, "license": "ISC" }, + "node_modules/reselect": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/reselect/-/reselect-4.1.8.tgz", + "integrity": "sha512-ab9EmR80F/zQTMNeneUr4cv+jSwPJgIlvEmVwLerwrWVbpLlBuls9XHzIeTFy4cegU2NHBp3va0LKOzU5qFEYQ==", + "dev": true, + "license": "MIT" + }, "node_modules/resolve": { - "version": "1.22.0", + "version": "1.22.10", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.10.tgz", + "integrity": "sha512-NPRy+/ncIMeDlTAsuqwKIiferiawhefFJtkNSW0qZJEqMEb+qBt/77B/jGeeek+F0uOeN05CDa6HXbbIgtVX4w==", "dev": true, "license": "MIT", "dependencies": { - "is-core-module": "^2.8.1", + "is-core-module": "^2.16.0", "path-parse": "^1.0.7", "supports-preserve-symlinks-flag": "^1.0.0" }, "bin": { "resolve": "bin/resolve" }, + "engines": { + "node": ">= 0.4" + }, "funding": { "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/resolve-from": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", + "dev": true, + 
"license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/restore-cursor": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-3.1.0.tgz", @@ -6898,6 +7863,17 @@ "node": ">=8" } }, + "node_modules/reusify": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz", + "integrity": "sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==", + "dev": true, + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, "node_modules/rimraf": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", @@ -6915,6 +7891,30 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, "node_modules/safe-buffer": { "version": "5.1.2", "dev": true, @@ -7506,8 +8506,20 @@ "node": ">=8" } }, + "node_modules/unc-path-regex": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/unc-path-regex/-/unc-path-regex-0.1.2.tgz", + "integrity": "sha512-eXL4nmJT7oCpkZsHZUOJo8hcX3GbsiDOa0Qu9F646fi8dT3XuSVopVqAcEiVzSKKH7UoDti23wNX3qGFxcW5Qg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/unicode-canonical-property-names-ecmascript": { - "version": "2.0.0", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.1.tgz", + "integrity": "sha512-dA8WbNeb2a6oQzAQ55YlT5vQAWGV9WXOsi3SskE3bcCdM0P4SDd+24zS/OCacdRq5BkdsRj9q3Pg6YyQoxIGqg==", "dev": true, "license": "MIT", "engines": { @@ -7516,6 +8528,8 @@ }, "node_modules/unicode-match-property-ecmascript": { "version": "2.0.0", + "resolved": "https://registry.npmjs.org/unicode-match-property-ecmascript/-/unicode-match-property-ecmascript-2.0.0.tgz", + "integrity": "sha512-5kaZCrbp5mmbz5ulBkDkbY0SsPOjKqVS35VpL9ulMPfSl0J0Xsm+9Evphv9CoIZFwre7aJoa94AY6seMKGVN5Q==", "dev": true, "license": "MIT", "dependencies": { @@ -7527,7 +8541,9 @@ } }, "node_modules/unicode-match-property-value-ecmascript": { - "version": "2.0.0", + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/unicode-match-property-value-ecmascript/-/unicode-match-property-value-ecmascript-2.2.0.tgz", + "integrity": "sha512-4IehN3V/+kkr5YeSSDDQG8QLqO26XpL2XP3GQtqwlT/QYSECAwFztxVHjlbh0+gjJ3XmNLS0zDsbgs9jWKExLg==", "dev": true, "license": "MIT", "engines": { @@ -7535,13 +8551,25 @@ } }, "node_modules/unicode-property-aliases-ecmascript": { - "version": "2.0.0", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/unicode-property-aliases-ecmascript/-/unicode-property-aliases-ecmascript-2.1.0.tgz", + "integrity": "sha512-6t3foTQI9qne+OZoVQB/8x8rk2k1eVy1gRXhV3oFQ5T6R1dqQ1xtin3XqSlx3+ATBkliTaR/hHyJBm+LVPNM8w==", "dev": true, "license": "MIT", "engines": { "node": ">=4" } }, + "node_modules/universalify": { + "version": "2.0.1", + "resolved": 
"https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz", + "integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 10.0.0" + } + }, "node_modules/unpipe": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", @@ -7553,9 +8581,9 @@ } }, "node_modules/update-browserslist-db": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.0.tgz", - "integrity": "sha512-EdRAaAyk2cUE1wOf2DkEhzxqOQvFOoRJFNS6NeyJ01Gp2beMRpBAINjM2iDXE3KCuKhwnvHIQCJm6ThL2Z+HzQ==", + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.3.tgz", + "integrity": "sha512-UxhIZQ+QInVdunkDAaiazvvT/+fXL5Osr0JZlJulepYu6Jd7qJtDZjlur0emRlT71EN3ScPoE7gvsuIKKNavKw==", "dev": true, "funding": [ { @@ -7573,8 +8601,8 @@ ], "license": "MIT", "dependencies": { - "escalade": "^3.1.2", - "picocolors": "^1.0.1" + "escalade": "^3.2.0", + "picocolors": "^1.1.1" }, "bin": { "update-browserslist-db": "cli.js" diff --git a/js/react_native/package.json b/js/react_native/package.json index 253f7e24cf7ed..4ac4fa79d49fc 100644 --- a/js/react_native/package.json +++ b/js/react_native/package.json @@ -25,7 +25,8 @@ "pod-install": "^0.1.36", "prettier": "^2.8.8", "react": "^18.2.0", - "react-native": "^0.73.11" + "react-native": "^0.73.11", + "react-native-builder-bob": "^0.37.0" }, "peerDependencies": { "react": "*", @@ -60,17 +61,35 @@ ], "description": "ONNX Runtime bridge for react native", "repository": "https://github.com/Microsoft/onnxruntime.git", + "react-native-builder-bob": { + "source": "lib", + "targets": [ + "commonjs", + "module", + [ + "typescript", + { + "project": "tsconfig.build.json", + "tsc": "../node_modules/.bin/tsc" + } + ] + ], + "output": "dist" + }, "dependencies": { - "buffer": "^6.0.3" + "buffer": "^6.0.3", + "onnxruntime-common": "file:../common" }, "scripts": { "typescript": "tsc --noEmit", - "bootstrap": "npm run pack-common && npm run unpack-common && npm run pack-libs && npm run unpack-libs && npm run e2e && npm run pods", + "prepare": "bob build", + "bootstrap-no-pods": "npm run pack-common && npm run unpack-common && npm run pack-libs && npm run unpack-libs && npm run e2e", + "bootstrap": "npm run bootstrap-no-pods && npm run pods", "test": "jest", "pack-common": "cd ../common && npm pack && mv -f onnxruntime-common-*.tgz ../react_native/e2e/onnxruntime-common.tgz", - "unpack-common": "cd e2e && npm install --no-save ./onnxruntime-common.tgz", + "unpack-common": "npm --prefix e2e install ./e2e/onnxruntime-common.tgz", "pack-libs": "npm pack --ort-js-pack-mode=e2e && mv -f onnxruntime-react-native-*.tgz e2e/onnxruntime-react-native.tgz", - "unpack-libs": "cd e2e && npm install --no-save ./onnxruntime-react-native.tgz", + "unpack-libs": "npm --prefix e2e install ./e2e/onnxruntime-react-native.tgz", "prepack": "tsc --build ./tsconfig.scripts.json && node ./scripts/prepack", "pods": "cd e2e && npx pod-install --quiet", "e2e": "npm --prefix e2e install" diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts index 72b51d565896a..5e5f4804aed92 100644 --- a/js/web/lib/backend-wasm.ts +++ b/js/web/lib/backend-wasm.ts @@ -88,7 +88,7 @@ export class OnnxruntimeWebAssemblyBackend implements Backend { ): Promise { const handler = new OnnxruntimeWebAssemblySessionHandler(); await handler.loadModel(pathOrBuffer, options); - return 
Promise.resolve(handler); + return handler; } } diff --git a/js/web/lib/onnxjs/session-handler-inference.ts b/js/web/lib/onnxjs/session-handler-inference.ts index c1c2576971840..f13ee7a56c78a 100644 --- a/js/web/lib/onnxjs/session-handler-inference.ts +++ b/js/web/lib/onnxjs/session-handler-inference.ts @@ -12,6 +12,14 @@ export class OnnxjsSessionHandler implements InferenceSessionHandler { this.outputNames = this.session.outputNames; } + get inputMetadata(): readonly InferenceSession.ValueMetadata[] { + throw new Error('Getting model metadata is not supported in webgl backend.'); + } + + get outputMetadata(): readonly InferenceSession.ValueMetadata[] { + throw new Error('Getting model metadata is not supported in webgl backend.'); + } + async dispose(): Promise {} inputNames: readonly string[]; outputNames: readonly string[]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts b/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts index 50c71472434ad..16c3af871b4e6 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts @@ -126,6 +126,8 @@ const pixelAtGrid = (input: IndicesHelper, dataType: string, attributes: GridSam if (r >= 0 && r < H && c >=0 && c < W) { indices[${idxH}] = u32(r); indices[${idxW}] = u32(c); + } else { + return ${dataType}(0); } `; case 'border': diff --git a/js/web/lib/wasm/proxy-messages.ts b/js/web/lib/wasm/proxy-messages.ts index 559f319a10f66..2584203d4503b 100644 --- a/js/web/lib/wasm/proxy-messages.ts +++ b/js/web/lib/wasm/proxy-messages.ts @@ -46,7 +46,13 @@ export type UnserializableTensorMetadata = */ export type TensorMetadata = SerializableTensorMetadata | UnserializableTensorMetadata; -export type SerializableSessionMetadata = [sessionHandle: number, inputNames: string[], outputNames: string[]]; +export type SerializableSessionMetadata = [ + sessionHandle: number, + inputNames: string[], + outputNames: string[], + inputMetadata: InferenceSession.ValueMetadata[], + outputMetadata: InferenceSession.ValueMetadata[], +]; export type SerializableInternalBuffer = [bufferOffset: number, bufferLength: number]; diff --git a/js/web/lib/wasm/session-handler-inference.ts b/js/web/lib/wasm/session-handler-inference.ts index c19043cc3637f..1fa2216b57219 100644 --- a/js/web/lib/wasm/session-handler-inference.ts +++ b/js/web/lib/wasm/session-handler-inference.ts @@ -57,8 +57,10 @@ export const decodeTensorMetadata = (tensor: TensorMetadata): Tensor => { export class OnnxruntimeWebAssemblySessionHandler implements InferenceSessionHandler { private sessionId: number; - inputNames: string[]; - outputNames: string[]; + inputNames: readonly string[]; + outputNames: readonly string[]; + inputMetadata: readonly InferenceSession.ValueMetadata[]; + outputMetadata: readonly InferenceSession.ValueMetadata[]; async fetchModelAndCopyToWasmMemory(path: string): Promise { // fetch model from url and move to wasm heap. 
@@ -82,7 +84,10 @@ export class OnnxruntimeWebAssemblySessionHandler implements InferenceSessionHan model = pathOrBuffer; } - [this.sessionId, this.inputNames, this.outputNames] = await createSession(model, options); + [this.sessionId, this.inputNames, this.outputNames, this.inputMetadata, this.outputMetadata] = await createSession( + model, + options, + ); TRACE_FUNC_END(); } diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index bb532e0fbae74..8dd643293937b 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -229,6 +229,49 @@ const getSessionInputOutputCount = (sessionHandle: number): [number, number] => } }; +const getSessionInputOutputMetadata = ( + sessionHandle: number, + index: number, +): [nameOffset: number, elementType: number, dims?: Array] => { + const wasm = getInstance(); + const stack = wasm.stackSave(); + let metadataOffset = 0; + try { + const ptrSize = wasm.PTR_SIZE; + const dataOffset = wasm.stackAlloc(2 * ptrSize); + const errorCode = wasm._OrtGetInputOutputMetadata(sessionHandle, index, dataOffset, dataOffset + ptrSize); + if (errorCode !== 0) { + checkLastError("Can't get session input/output metadata."); + } + const nameOffset = Number(wasm.getValue(dataOffset, '*')); + metadataOffset = Number(wasm.getValue(dataOffset + ptrSize, '*')); + // get element type + const elementType = wasm.HEAP32[metadataOffset / 4]; + if (elementType === 0) { + return [nameOffset, 0]; // non-tensor + } + + // get dims count + const dimsCount = wasm.HEAPU32[metadataOffset / 4 + 1]; + // get dims + const dims: Array = []; + for (let i = 0; i < dimsCount; i++) { + const symbolicDimNameOffset = Number(wasm.getValue(metadataOffset + 8 + i * ptrSize, '*')); + dims.push( + symbolicDimNameOffset !== 0 + ? wasm.UTF8ToString(symbolicDimNameOffset) + : Number(wasm.getValue(metadataOffset + 8 + (i + dimsCount) * ptrSize, '*')), + ); + } + return [nameOffset, elementType, dims]; + } finally { + wasm.stackRestore(stack); + if (metadataOffset !== 0) { + wasm._OrtFree(metadataOffset); + } + } +}; + /** * allocate the memory and memcpy the external buffer. * @@ -341,23 +384,36 @@ export const createSession = async ( const inputNames = []; const outputNames = []; + const inputMetadata: InferenceSession.ValueMetadata[] = []; + const outputMetadata: InferenceSession.ValueMetadata[] = []; const outputPreferredLocations: SupportedTensorDataLocationForInputOutput[] = []; for (let i = 0; i < inputCount; i++) { - const name = wasm._OrtGetInputName(sessionHandle, i); - if (name === 0) { + const [nameOffset, elementType, shape] = getSessionInputOutputMetadata(sessionHandle, i); + if (nameOffset === 0) { checkLastError("Can't get an input name."); } - inputNamesUTF8Encoded.push(name); - inputNames.push(wasm.UTF8ToString(name)); + inputNamesUTF8Encoded.push(nameOffset); + const name = wasm.UTF8ToString(nameOffset); + inputNames.push(name); + inputMetadata.push( + elementType === 0 + ? { name, isTensor: false } + : { name, isTensor: true, type: tensorDataTypeEnumToString(elementType), shape: shape! 
}, + ); } for (let i = 0; i < outputCount; i++) { - const name = wasm._OrtGetOutputName(sessionHandle, i); - if (name === 0) { + const [nameOffset, elementType, shape] = getSessionInputOutputMetadata(sessionHandle, i + inputCount); + if (nameOffset === 0) { checkLastError("Can't get an output name."); } - outputNamesUTF8Encoded.push(name); - const nameString = wasm.UTF8ToString(name); + outputNamesUTF8Encoded.push(nameOffset); + const nameString = wasm.UTF8ToString(nameOffset); outputNames.push(nameString); + outputMetadata.push( + elementType === 0 + ? { name: nameString, isTensor: false } + : { name: nameString, isTensor: true, type: tensorDataTypeEnumToString(elementType), shape: shape! }, + ); if (!BUILD_DEFS.DISABLE_JSEP) { if (enableGraphCapture && options?.preferredOutputLocation === undefined) { @@ -403,7 +459,7 @@ export const createSession = async ( enableGraphCapture, false, ]); - return [sessionHandle, inputNames, outputNames]; + return [sessionHandle, inputNames, outputNames, inputMetadata, outputMetadata]; } catch (e) { inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); outputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); @@ -470,6 +526,7 @@ export const prepareInputOutputTensor = async ( tensorHandles: number[], allocs: number[], sessionId: number, + tensorNameUTF8Encoded: number, index: number, enableGraphCapture = false, ): Promise => { @@ -543,8 +600,7 @@ export const prepareInputOutputTensor = async ( } else { const isGraphInput = wasm.webnnIsGraphInput; if (dataType !== 'string' && isGraphInput) { - const tensorNameUTF8 = wasm._OrtGetInputName(sessionId, index); - const tensorName = wasm.UTF8ToString(tensorNameUTF8); + const tensorName = wasm.UTF8ToString(tensorNameUTF8Encoded); // Promote the tensor to 'ml-tensor' if it is a graph input. 
if (isGraphInput(sessionId, tensorName)) { const dataTypeEnum = tensorDataTypeStringToEnum(dataType); @@ -644,6 +700,7 @@ export const run = async ( inputTensorHandles, inputOutputAllocs, sessionId, + inputNamesUTF8Encoded[inputIndices[i]], inputIndices[i], enableGraphCapture, ); @@ -656,6 +713,7 @@ export const run = async ( outputTensorHandles, inputOutputAllocs, sessionId, + outputNamesUTF8Encoded[outputIndices[i]], inputCount + outputIndices[i], enableGraphCapture, ); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index 752bac28d7efb..b2ca8480f1546 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -327,8 +327,12 @@ export interface OrtInferenceAPIs { _OrtCreateSession(dataOffset: number, dataLength: number, sessionOptionsHandle: number): Promise; _OrtReleaseSession(sessionHandle: number): number; _OrtGetInputOutputCount(sessionHandle: number, inputCountOffset: number, outputCountOffset: number): number; - _OrtGetInputName(sessionHandle: number, index: number): number; - _OrtGetOutputName(sessionHandle: number, index: number): number; + _OrtGetInputOutputMetadata( + sessionHandle: number, + index: number, + namePtrOffset: number, + metadataPtrOffset: number, + ): number; _OrtFree(stringHandle: number): number; diff --git a/js/web/package.json b/js/web/package.json index b95d78e404cb0..af02453b0870a 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -72,15 +72,24 @@ "import": "./dist/ort.node.min.mjs", "require": "./dist/ort.node.min.js" }, - "import": "./dist/ort.bundle.min.mjs", + "import": { + "onnxruntime-web-use-extern-wasm": "./dist/ort.min.mjs", + "default": "./dist/ort.bundle.min.mjs" + }, "require": "./dist/ort.min.js" }, "./all": { - "import": "./dist/ort.all.bundle.min.mjs", + "import": { + "onnxruntime-web-use-extern-wasm": "./dist/ort.all.min.mjs", + "default": "./dist/ort.all.bundle.min.mjs" + }, "require": "./dist/ort.all.min.js" }, "./wasm": { - "import": "./dist/ort.wasm.bundle.min.mjs", + "import": { + "onnxruntime-web-use-extern-wasm": "./dist/ort.wasm.min.mjs", + "default": "./dist/ort.wasm.bundle.min.mjs" + }, "require": "./dist/ort.wasm.min.js" }, "./webgl": { @@ -88,7 +97,10 @@ "require": "./dist/ort.webgl.min.js" }, "./webgpu": { - "import": "./dist/ort.webgpu.bundle.min.mjs", + "import": { + "onnxruntime-web-use-extern-wasm": "./dist/ort.webgpu.min.mjs", + "default": "./dist/ort.webgpu.bundle.min.mjs" + }, "require": "./dist/ort.webgpu.min.js" } }, diff --git a/js/web/test/e2e/exports/testcases/nextjs-default/package-lock.json b/js/web/test/e2e/exports/testcases/nextjs-default/package-lock.json index 174812402f578..11a5c1ac3f2ee 100644 --- a/js/web/test/e2e/exports/testcases/nextjs-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/nextjs-default/package-lock.json @@ -8,7 +8,7 @@ "name": "nextjs-default", "version": "0.1.0", "dependencies": { - "next": "15.1.2", + "next": "15.2.3", "react": "^19.0.0", "react-dom": "^19.0.0" } @@ -385,15 +385,15 @@ } }, "node_modules/@next/env": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/env/-/env-15.1.2.tgz", - "integrity": "sha512-Hm3jIGsoUl6RLB1vzY+dZeqb+/kWPZ+h34yiWxW0dV87l8Im/eMOwpOA+a0L78U0HM04syEjXuRlCozqpwuojQ==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/env/-/env-15.2.3.tgz", + "integrity": "sha512-a26KnbW9DFEUsSxAxKBORR/uD9THoYoKbkpFywMN/AFvboTt94b8+g/07T8J6ACsdLag8/PDU60ov4rPxRAixw==", "license": "MIT" }, "node_modules/@next/swc-darwin-arm64": { - "version": "15.1.2", - 
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.1.2.tgz", - "integrity": "sha512-b9TN7q+j5/7+rGLhFAVZiKJGIASuo8tWvInGfAd8wsULjB1uNGRCj1z1WZwwPWzVQbIKWFYqc+9L7W09qwt52w==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.2.3.tgz", + "integrity": "sha512-uaBhA8aLbXLqwjnsHSkxs353WrRgQgiFjduDpc7YXEU0B54IKx3vU+cxQlYwPCyC8uYEEX7THhtQQsfHnvv8dw==", "cpu": [ "arm64" ], @@ -407,9 +407,9 @@ } }, "node_modules/@next/swc-darwin-x64": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.1.2.tgz", - "integrity": "sha512-caR62jNDUCU+qobStO6YJ05p9E+LR0EoXh1EEmyU69cYydsAy7drMcOlUlRtQihM6K6QfvNwJuLhsHcCzNpqtA==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.2.3.tgz", + "integrity": "sha512-pVwKvJ4Zk7h+4hwhqOUuMx7Ib02u3gDX3HXPKIShBi9JlYllI0nU6TWLbPT94dt7FSi6mSBhfc2JrHViwqbOdw==", "cpu": [ "x64" ], @@ -423,9 +423,9 @@ } }, "node_modules/@next/swc-linux-arm64-gnu": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.1.2.tgz", - "integrity": "sha512-fHHXBusURjBmN6VBUtu6/5s7cCeEkuGAb/ZZiGHBLVBXMBy4D5QpM8P33Or8JD1nlOjm/ZT9sEE5HouQ0F+hUA==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.2.3.tgz", + "integrity": "sha512-50ibWdn2RuFFkOEUmo9NCcQbbV9ViQOrUfG48zHBCONciHjaUKtHcYFiCwBVuzD08fzvzkWuuZkd4AqbvKO7UQ==", "cpu": [ "arm64" ], @@ -439,9 +439,9 @@ } }, "node_modules/@next/swc-linux-arm64-musl": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.1.2.tgz", - "integrity": "sha512-9CF1Pnivij7+M3G74lxr+e9h6o2YNIe7QtExWq1KUK4hsOLTBv6FJikEwCaC3NeYTflzrm69E5UfwEAbV2U9/g==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.2.3.tgz", + "integrity": "sha512-2gAPA7P652D3HzR4cLyAuVYwYqjG0mt/3pHSWTCyKZq/N/dJcUAEoNQMyUmwTZWCJRKofB+JPuDVP2aD8w2J6Q==", "cpu": [ "arm64" ], @@ -455,9 +455,9 @@ } }, "node_modules/@next/swc-linux-x64-gnu": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.1.2.tgz", - "integrity": "sha512-tINV7WmcTUf4oM/eN3Yuu/f8jQ5C6AkueZPKeALs/qfdfX57eNv4Ij7rt0SA6iZ8+fMobVfcFVv664Op0caCCg==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.2.3.tgz", + "integrity": "sha512-ODSKvrdMgAJOVU4qElflYy1KSZRM3M45JVbeZu42TINCMG3anp7YCBn80RkISV6bhzKwcUqLBAmOiWkaGtBA9w==", "cpu": [ "x64" ], @@ -471,9 +471,9 @@ } }, "node_modules/@next/swc-linux-x64-musl": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.1.2.tgz", - "integrity": "sha512-jf2IseC4WRsGkzeUw/cK3wci9pxR53GlLAt30+y+B+2qAQxMw6WAC3QrANIKxkcoPU3JFh/10uFfmoMDF9JXKg==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.2.3.tgz", + "integrity": "sha512-ZR9kLwCWrlYxwEoytqPi1jhPd1TlsSJWAc+H/CJHmHkf2nD92MQpSRIURR1iNgA/kuFSdxB8xIPt4p/T78kwsg==", "cpu": [ "x64" ], @@ -487,9 +487,9 @@ } }, "node_modules/@next/swc-win32-arm64-msvc": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.1.2.tgz", - "integrity": 
"sha512-wvg7MlfnaociP7k8lxLX4s2iBJm4BrNiNFhVUY+Yur5yhAJHfkS8qPPeDEUH8rQiY0PX3u/P7Q/wcg6Mv6GSAA==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.2.3.tgz", + "integrity": "sha512-+G2FrDcfm2YDbhDiObDU/qPriWeiz/9cRR0yMWJeTLGGX6/x8oryO3tt7HhodA1vZ8r2ddJPCjtLcpaVl7TE2Q==", "cpu": [ "arm64" ], @@ -503,9 +503,9 @@ } }, "node_modules/@next/swc-win32-x64-msvc": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.1.2.tgz", - "integrity": "sha512-D3cNA8NoT3aWISWmo7HF5Eyko/0OdOO+VagkoJuiTk7pyX3P/b+n8XA/MYvyR+xSVcbKn68B1rY9fgqjNISqzQ==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.2.3.tgz", + "integrity": "sha512-gHYS9tc+G2W0ZC8rBL+H6RdtXIyk40uLiaos0yj5US85FNhbFEndMA2nW3z47nzOWiSvXTZ5kBClc3rD0zJg0w==", "cpu": [ "x64" ], @@ -651,12 +651,12 @@ } }, "node_modules/next": { - "version": "15.1.2", - "resolved": "https://registry.npmjs.org/next/-/next-15.1.2.tgz", - "integrity": "sha512-nLJDV7peNy+0oHlmY2JZjzMfJ8Aj0/dd3jCwSZS8ZiO5nkQfcZRqDrRN3U5rJtqVTQneIOGZzb6LCNrk7trMCQ==", + "version": "15.2.3", + "resolved": "https://registry.npmjs.org/next/-/next-15.2.3.tgz", + "integrity": "sha512-x6eDkZxk2rPpu46E1ZVUWIBhYCLszmUY6fvHBFcbzJ9dD+qRX6vcHusaqqDlnY+VngKzKbAiG2iRCkPbmi8f7w==", "license": "MIT", "dependencies": { - "@next/env": "15.1.2", + "@next/env": "15.2.3", "@swc/counter": "0.1.3", "@swc/helpers": "0.5.15", "busboy": "1.6.0", @@ -671,14 +671,14 @@ "node": "^18.18.0 || ^19.8.0 || >= 20.0.0" }, "optionalDependencies": { - "@next/swc-darwin-arm64": "15.1.2", - "@next/swc-darwin-x64": "15.1.2", - "@next/swc-linux-arm64-gnu": "15.1.2", - "@next/swc-linux-arm64-musl": "15.1.2", - "@next/swc-linux-x64-gnu": "15.1.2", - "@next/swc-linux-x64-musl": "15.1.2", - "@next/swc-win32-arm64-msvc": "15.1.2", - "@next/swc-win32-x64-msvc": "15.1.2", + "@next/swc-darwin-arm64": "15.2.3", + "@next/swc-darwin-x64": "15.2.3", + "@next/swc-linux-arm64-gnu": "15.2.3", + "@next/swc-linux-arm64-musl": "15.2.3", + "@next/swc-linux-x64-gnu": "15.2.3", + "@next/swc-linux-x64-musl": "15.2.3", + "@next/swc-win32-arm64-msvc": "15.2.3", + "@next/swc-win32-x64-msvc": "15.2.3", "sharp": "^0.33.5" }, "peerDependencies": { diff --git a/js/web/test/e2e/exports/testcases/nextjs-default/package.json b/js/web/test/e2e/exports/testcases/nextjs-default/package.json index 6688445cded26..9bad599248259 100644 --- a/js/web/test/e2e/exports/testcases/nextjs-default/package.json +++ b/js/web/test/e2e/exports/testcases/nextjs-default/package.json @@ -11,6 +11,6 @@ "dependencies": { "react": "^19.0.0", "react-dom": "^19.0.0", - "next": "15.1.2" + "next": "15.2.3" } } diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 5de39535a5c07..6fbce114093c0 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -1016,8 +1016,9 @@ export class ProtoOpTestContext { // check if all test cases have the same shape for each inputs if ( test.cases.some((testCase) => - testCase.inputs!.some((input: Test.TensorValue, i) => - TensorResultValidator.integerEqual(input.dims, (test.cases[0].inputs![i] as Test.TensorValue).dims), + testCase.inputs!.some( + (input: Test.TensorValue, i) => + !TensorResultValidator.integerEqual(input.dims, (test.cases[0].inputs![i] as Test.TensorValue).dims), ), ) ) { diff --git a/js/web/test/unittests/backends/wasm/test-model-metadata.ts b/js/web/test/unittests/backends/wasm/test-model-metadata.ts new file 
mode 100644 index 0000000000000..9b70686633f27 --- /dev/null +++ b/js/web/test/unittests/backends/wasm/test-model-metadata.ts @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import { expect } from 'chai'; +import { InferenceSession } from 'onnxruntime-common'; + +const ONNX_MODEL_TEST_ABS_NO_SHAPE = Uint8Array.from([ + 8, 9, 58, 73, 10, 31, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 8, 111, 117, 116, 112, 117, 116, 95, 48, 26, 3, 65, + 98, 115, 34, 3, 65, 98, 115, 58, 0, 18, 3, 97, 98, 115, 90, 15, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 4, 10, 2, + 8, 1, 98, 16, 10, 8, 111, 117, 116, 112, 117, 116, 95, 48, 18, 4, 10, 2, 8, 1, 66, 4, 10, 0, 16, 21, +]); + +const ONNX_MODEL_TEST_ABS_SYMBOL = Uint8Array.from([ + 8, 9, 58, 105, 10, 31, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 8, 111, 117, 116, 112, 117, 116, 95, 48, 26, 3, 65, + 98, 115, 34, 3, 65, 98, 115, 58, 0, 18, 3, 97, 98, 115, 90, 47, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 36, 10, + 34, 8, 1, 18, 30, 10, 13, 18, 11, 95, 105, 110, 112, 117, 116, 95, 48, 95, 100, 48, 10, 13, 18, 11, 95, 105, 110, 112, + 117, 116, 95, 48, 95, 100, 49, 98, 16, 10, 8, 111, 117, 116, 112, 117, 116, 95, 48, 18, 4, 10, 2, 8, 1, 66, 4, 10, 0, + 16, 21, +]); + +const ONNX_MODEL_TEST_ABS_STATIC = Uint8Array.from([ + 8, 9, 58, 83, 10, 31, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 8, 111, 117, 116, 112, 117, 116, 95, 48, 26, 3, 65, + 98, 115, 34, 3, 65, 98, 115, 58, 0, 18, 3, 97, 98, 115, 90, 25, 10, 7, 105, 110, 112, 117, 116, 95, 48, 18, 14, 10, + 12, 8, 1, 18, 8, 10, 2, 8, 2, 10, 2, 8, 4, 98, 16, 10, 8, 111, 117, 116, 112, 117, 116, 95, 48, 18, 4, 10, 2, 8, 1, + 66, 4, 10, 0, 16, 21, +]); + +const testModelMetadata = async ( + model: Uint8Array, + expectedInputNames: string[], + expectedOutputNames: string[], + expectedInputMetadata: InferenceSession.ValueMetadata[], + expectedOutputMetadata: InferenceSession.ValueMetadata[], +) => { + const session = await InferenceSession.create(model); + expect(session.inputNames).to.deep.equal(expectedInputNames); + expect(session.outputNames).to.deep.equal(expectedOutputNames); + expect(session.inputMetadata).to.deep.equal(expectedInputMetadata); + expect(session.outputMetadata).to.deep.equal(expectedOutputMetadata); +}; + +describe('#UnitTest# - wasm - test model input/output metadata', () => { + it('model input/output with no shape', async () => { + await testModelMetadata( + ONNX_MODEL_TEST_ABS_NO_SHAPE, + ['input_0'], + ['output_0'], + [{ name: 'input_0', isTensor: true, type: 'float32', shape: [] }], + [{ name: 'output_0', isTensor: true, type: 'float32', shape: [] }], + ); + }); + + it('model input/output with symbol shape', async () => { + await testModelMetadata( + ONNX_MODEL_TEST_ABS_SYMBOL, + ['input_0'], + ['output_0'], + [ + { + name: 'input_0', + isTensor: true, + type: 'float32', + shape: ['_input_0_d0', '_input_0_d1'], + }, + ], + [ + { + name: 'output_0', + isTensor: true, + type: 'float32', + shape: ['_input_0_d0', '_input_0_d1'], + }, + ], + ); + }); + + it('model input/output with static shape', async () => { + await testModelMetadata( + ONNX_MODEL_TEST_ABS_STATIC, + ['input_0'], + ['output_0'], + [{ name: 'input_0', isTensor: true, type: 'float32', shape: [2, 4] }], + [{ name: 'output_0', isTensor: true, type: 'float32', shape: [2, 4] }], + ); + }); +}); diff --git a/js/web/test/unittests/index.ts b/js/web/test/unittests/index.ts index 4a0b155ecc80b..b68681f4977c0 100644 --- a/js/web/test/unittests/index.ts +++ 
b/js/web/test/unittests/index.ts @@ -11,4 +11,6 @@ if (typeof window !== 'undefined') { require('./backends/webgl/test-matmul-packed'); } +require('./backends/wasm/test-model-metadata'); + require('./opset'); diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h b/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h index 417865bb166ec..c3d5128948c6f 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h @@ -87,9 +87,9 @@ struct GroupQueryAttentionParameters : AttentionParameters { int seqlen_present_kv_cache; // sequence length of present kv tensor int kv_hidden_size; int kv_num_heads; - int num_splits; // number of splits for splitkv - int rotary_dim; // rotary embedding dimension - int local_window_size; + int num_splits; // number of splits for splitkv + int rotary_dim; // rotary embedding dimension + int local_window_size; // The window size excludes current token. It only includes tokens on the left side. bool kv_share_buffer; bool is_packed_qkv; bool is_subsequent_prompt; // indicates whether we have past context and seqlen > 1 diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index c8c66c880852f..c79508cbae273 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -270,7 +270,8 @@ class GQAAttentionBase { for (size_t seq = 0; seq < sequence_length; seq++) { size_t seq_causal_length = past_seqlen + seq + 1; - const bool should_apply_local_window = local_window_size_ > 0 && + // local_window_size does not include the current query token, while window_size includes it. + const bool should_apply_local_window = local_window_size_ >= 0 && seq_causal_length > static_cast(local_window_size_) + 1; const size_t start_offset = should_apply_local_window ? 
seq_causal_length - local_window_size_ - 1 : 0; diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index c742cd1e95bdd..345b5e793a764 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -24,6 +24,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GroupQueryAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, GroupQueryAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SparseAttention); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, SparseAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, RotaryEmbedding); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, RotaryEmbedding); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling); @@ -299,6 +300,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index c3e43f897c509..d5a6a1ae699d9 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -133,6 +133,7 @@ class MatMulNBits final : public OpKernel { const size_t nbits_; const bool has_g_idx_; const bool has_bias_; + bool scales_are_packed_{false}; const MLAS_QNBIT_GEMM_COMPUTE_TYPE compute_type_; bool has_unquantized_zero_point_{false}; const bool column_wise_quant_{true}; @@ -181,13 +182,18 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All return Status::OK(); } if (input_idx == InputIndex::B) { - packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_); + const Tensor* scales = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales); + + packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, has_zp_input_, compute_type_); if (packed_b_size_ == 0) { return Status::OK(); } auto qptr = tensor.DataRaw(); + auto scale_ptr = scales ? 
scales->DataRaw() : nullptr; packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), scale_ptr, + has_zp_input_, nullptr, nullptr); is_packed = true; } else if (compute_type_ == SQNBIT_CompInt8) { #ifdef MLAS_TARGET_AMD64_IX86 @@ -198,10 +204,17 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All is_packed = false; } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { auto zptr = tensor.Data(); - MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, + has_zp_input_, zptr, nullptr); is_packed = false; } -#endif // MLAS_TARGET_AMD64_IX86 +#elif defined(MLAS_TARGET_ARM64) + if (input_idx == InputIndex::scales && packed_b_ != nullptr && + MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, has_zp_input_)) { + scales_are_packed_ = true; + is_packed = true; + } +#endif // MLAS_TARGET_ARM64 } return Status::OK(); @@ -236,14 +249,24 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou return Status::OK(); } if (input_idx == InputIndex::B) { - packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_); + const Tensor* scales = nullptr; + OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales); + if (scales && MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, has_zp_input_)) { + auto sptr = scales->Data(); + auto tensor_size = static_cast(tensor.Shape().Size()); + auto ptr = IAllocator::MakeUniquePtr(alloc, tensor_size, true); + MlasConvertHalfToFloatBuffer(sptr, ptr.get(), tensor_size); + scales_fp32_ = std::move(ptr); + } + + packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, has_zp_input_, compute_type_); if (packed_b_size_ == 0) { return Status::OK(); } auto qptr = tensor.DataRaw(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), - nullptr, has_zp_input_, nullptr, nullptr); + scales_fp32_.get(), has_zp_input_, nullptr, nullptr); is_packed = true; } else if (compute_type_ == SQNBIT_CompInt8) { #ifdef MLAS_TARGET_AMD64_IX86 @@ -287,7 +310,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, concurrency::ThreadPool* thread_pool, const MatMulComputeHelper& helper) const { const auto* a_data = a->Data(); - const auto* scales_data = scales->Data(); + const auto* scales_data = scales == nullptr ? nullptr : scales->Data(); const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); const auto* bias_data = bias == nullptr ? 
nullptr : bias->Data(); auto* y_data = y->MutableData(); @@ -300,7 +323,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, IAllocatorUniquePtr workspace{}; const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize( - M, N, K, batch_count, nbits_, block_size_, compute_type_); + M, N, K, batch_count, nbits_, block_size_, zero_points, compute_type_); if (workspace_size > 0) { // Use reserve since no caching is needed workspace = IAllocator::MakeUniquePtr(allocator, workspace_size, true); @@ -310,11 +333,9 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, for (size_t i = 0; i < batch_count; ++i) { data[i].A = a_data + helper.LeftOffsets()[i]; data[i].lda = lda; -#ifdef MLAS_TARGET_AMD64_IX86 if (compute_type_ == SQNBIT_CompInt8) { data[i].QuantBDataWorkspace = packed_b_.get(); } -#endif data[i].PackedQuantBData = static_cast(packed_b_.get()); data[i].QuantBScale = scales_data; data[i].QuantBZeroPoint = zero_points_data; @@ -351,7 +372,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, IAllocatorUniquePtr workspace{}; const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize( - M, N, K, batch_count, nbits_, block_size_, compute_type_); + M, N, K, batch_count, nbits_, block_size_, zero_points, compute_type_); if (workspace_size > 0) { // Use reserve since no caching is needed workspace = IAllocator::MakeUniquePtr(allocator, workspace_size, true); @@ -653,7 +674,7 @@ template Status MatMulNBits::Compute(OpKernelContext* ctx) const { concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool(); const Tensor* a = ctx->Input(InputIndex::A); - const Tensor* scales = ctx->Input(InputIndex::scales); + const Tensor* scales = scales_are_packed_ ? nullptr : ctx->Input(InputIndex::scales); const Tensor* zero_points = ctx->Input(InputIndex::zero_points); const Tensor* reorder_idx = ctx->Input(InputIndex::g_idx); const Tensor* bias = ctx->Input(InputIndex::bias); diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc index e337f41a8688d..469084e7b4491 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -21,16 +21,20 @@ using onnxruntime::concurrency::ThreadPool; namespace onnxruntime { namespace contrib { -ONNX_OPERATOR_TYPED_KERNEL_EX( - SparseAttention, - kMSDomain, - 1, - float, - kCpuExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::GetTensorType()) - .TypeConstraint("M", DataTypeImpl::GetTensorType()), - SparseAttention); +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + SparseAttention, \ + kMSDomain, \ + 1, \ + T, \ + kCpuExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("M", DataTypeImpl::GetTensorType()), \ + SparseAttention); + +REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(MLFloat16) template SparseAttention::SparseAttention(const OpKernelInfo& info) : OpKernel(info), SparseAttentionBase(info) { diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h index 2c719b3724106..cccaec0b16ce5 100644 --- a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h @@ -67,7 +67,10 @@ class SparseAttentionBase { int present_buffer_sequence_length = static_cast(present_key->Shape().GetDims()[2]); // Allocate a buffer to store Softmax(QK) - size_t bytes = 
SafeInt(batch_size) * num_heads_ * sequence_length * parameters.total_sequence_length * sizeof(T); + bool attention_mlas_supported = MlasGQASupported(CblasNoTrans, CblasTrans) && + MlasGQASupported(CblasNoTrans, CblasNoTrans); + size_t bytes = SafeInt(batch_size) * num_heads_ * sequence_length * parameters.total_sequence_length * + (attention_mlas_supported ? sizeof(T) : sizeof(float)); auto attention_probs = allocator->Alloc(bytes); BufferUniquePtr scratch_buffer(attention_probs, BufferDeleter(allocator)); @@ -77,21 +80,37 @@ class SparseAttentionBase { auto* tp = context->GetOperatorThreadPool(); const T* k = packed_qkv ? Q + num_heads_ * sequence_length * head_size : K; - ComputeAttentionProbs( - static_cast(attention_probs), Q, k, total_key_lengths->Data(), - batch_size, sequence_length, parameters.total_sequence_length, - past_buffer_sequence_length, present_buffer_sequence_length, head_size, - past_key->Data(), present_key->MutableData(), past_present_share_buffer, packed_qkv, - block_row_indices->Data(), block_col_indices->Data(), parameters, tp); - - // Compute the attentionScore * Value: out(B, N, S, H_v) = attention_probs(B, N, S, T) x V(B, N, T, H_v) const T* v = packed_qkv ? Q + (num_heads_ + kv_num_heads_) * sequence_length * head_size : V; - ComputeVxAttentionScore( - output->MutableData(), static_cast(attention_probs), v, - total_key_lengths->Data(), - batch_size, sequence_length, parameters.total_sequence_length, - past_buffer_sequence_length, present_buffer_sequence_length, head_size, parameters.hidden_size, - past_value->Data(), present_value->MutableData(), past_present_share_buffer, packed_qkv, tp); + + if (attention_mlas_supported) { + ComputeAttentionProbs( + static_cast(attention_probs), Q, k, total_key_lengths->Data(), + batch_size, sequence_length, parameters.total_sequence_length, + past_buffer_sequence_length, present_buffer_sequence_length, head_size, + past_key->Data(), present_key->MutableData(), past_present_share_buffer, packed_qkv, + block_row_indices->Data(), block_col_indices->Data(), parameters, tp, allocator); + + ComputeVxAttentionScore( + output->MutableData(), static_cast(attention_probs), v, + total_key_lengths->Data(), + batch_size, sequence_length, parameters.total_sequence_length, + past_buffer_sequence_length, present_buffer_sequence_length, head_size, parameters.hidden_size, + past_value->Data(), present_value->MutableData(), past_present_share_buffer, packed_qkv, tp, allocator); + } else { + ComputeAttentionProbs( + static_cast(attention_probs), Q, k, total_key_lengths->Data(), + batch_size, sequence_length, parameters.total_sequence_length, + past_buffer_sequence_length, present_buffer_sequence_length, head_size, + past_key->Data(), present_key->MutableData(), past_present_share_buffer, packed_qkv, + block_row_indices->Data(), block_col_indices->Data(), parameters, tp, allocator); + + ComputeVxAttentionScore( + output->MutableData(), static_cast(attention_probs), v, + total_key_lengths->Data(), + batch_size, sequence_length, parameters.total_sequence_length, + past_buffer_sequence_length, present_buffer_sequence_length, head_size, parameters.hidden_size, + past_value->Data(), present_value->MutableData(), past_present_share_buffer, packed_qkv, tp, allocator); + } return Status::OK(); } @@ -100,9 +119,9 @@ class SparseAttentionBase { // Helper function to compute the attention probs. 
It does 2 things: // attention_probs(B, N, S, T) = 1/sqrt(H) x Q(B, N, S, H) x K'(B, N, T, H -> B, N, H, T) // attention_probs(B, N, S, T) = Softmax(attention_probs) - template + template void ComputeAttentionProbs( - T* attention_probs, // output buffer with size BxNxSxT + U* attention_probs, // output buffer with size BxNxSxT const T* Q, // query start pointer const T* K, // key start pointer const int32_t* total_key_lengths, // total key sequence lengths (past + new) @@ -119,7 +138,8 @@ class SparseAttentionBase { const int32_t* block_row_indices, // block row indices const int32_t* block_col_indices, // block column indices SparseAttentionParameters& parameters, // parameters - ThreadPool* tp) const { // thread pool + ThreadPool* tp, // thread pool + AllocatorPtr allocator) const { const bool is_prompt = (total_sequence_length == sequence_length); const ptrdiff_t packed_batch_stride = packed_qkv ? SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size @@ -173,7 +193,7 @@ class SparseAttentionBase { const int total_seq_len = total_key_lengths[batch_index]; const ptrdiff_t output_offset = SafeInt(i) * sequence_length * total_sequence_length; - T* output = attention_probs + output_offset; + U* output = attention_probs + output_offset; const T* k; if (packed_qkv) { @@ -205,14 +225,34 @@ class SparseAttentionBase { DUMP_CPU_TENSOR("Q", q, sequence_length, head_size); DUMP_CPU_TENSOR("K", k, total_seq_len, head_size); - math::GemmEx(CblasNoTrans, CblasTrans, sequence_length, total_seq_len, head_size, alpha, q, - head_size, k, head_size, 0.0f /*bata*/, output, total_seq_len, - nullptr); + if constexpr (std::is_same::value) { + math::GemmEx(CblasNoTrans, CblasTrans, sequence_length, total_seq_len, head_size, alpha, q, + head_size, k, head_size, 0.0f /*bata*/, output, total_seq_len, + nullptr); + } else if constexpr (std::is_same::value) { + MlasGemm(CblasNoTrans, CblasTrans, sequence_length, total_seq_len, head_size, + q, head_size, k, head_size, output, total_seq_len, + MLFloat16(alpha).val, static_cast(0) /*beta*/, nullptr); + } else { + size_t bytes = static_cast(head_size) * (sequence_length + total_seq_len) * sizeof(float); + auto q_k_fp32 = allocator->Alloc(bytes); + BufferUniquePtr scratch_buffer(q_k_fp32, BufferDeleter(allocator)); + + float* q_fp32 = static_cast(q_k_fp32); + MlasConvertHalfToFloatBuffer(q, q_fp32, static_cast(head_size) * sequence_length); + + float* k_fp32 = q_fp32 + head_size * sequence_length; + MlasConvertHalfToFloatBuffer(k, k_fp32, static_cast(head_size) * total_seq_len); + + math::GemmEx(CblasNoTrans, CblasTrans, sequence_length, total_seq_len, head_size, + alpha, q_fp32, head_size, k_fp32, head_size, 0.0f /*bata*/, + output, total_seq_len, nullptr); + } DUMP_CPU_TENSOR("QK", output, sequence_length, total_seq_len); // Compute Softmax for causal and output result in place. 
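For reference, the hunk above makes the attention-probs element type U independent of the input type T, choosing between a same-type GEMM and a convert-to-wider-then-GEMM fallback. Below is a minimal standalone sketch of that dispatch pattern only, with float/double standing in for MLFloat16/float and a naive loop in place of math::GemmEx / MlasGemm / MlasConvertHalfToFloatBuffer; it is illustrative, not the ORT code.

```cpp
// Minimal sketch (not ORT code) of the T/U dispatch used for ComputeAttentionProbs.
#include <cstdio>
#include <type_traits>
#include <vector>

// Row-major C = A (m x k) * B^T (n x k), i.e. the Q*K' step before softmax.
template <typename In, typename Out>
void gemm_qkT(const In* A, const In* B, Out* C, int m, int n, int k) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      double acc = 0.0;
      for (int p = 0; p < k; ++p) acc += static_cast<double>(A[i * k + p]) * B[j * k + p];
      C[i * n + j] = static_cast<Out>(acc);
    }
}

// T = element type of Q/K, U = element type of the attention-probs scratch buffer.
template <typename T, typename U>
void compute_probs(const T* Q, const T* K, U* probs, int m, int n, int k) {
  if constexpr (std::is_same_v<T, U>) {
    // Same-type GEMM: the float path, or the fp16 path when the platform GEMM supports it.
    gemm_qkT(Q, K, probs, m, n, k);
  } else {
    // Narrow inputs, wide scratch: widen Q and K first, then run the wide GEMM
    // (MlasConvertHalfToFloatBuffer plays this role in the real kernel).
    std::vector<U> wide(static_cast<size_t>(k) * (m + n));
    U* q_wide = wide.data();
    U* k_wide = q_wide + static_cast<size_t>(k) * m;
    for (int i = 0; i < k * m; ++i) q_wide[i] = static_cast<U>(Q[i]);
    for (int i = 0; i < k * n; ++i) k_wide[i] = static_cast<U>(K[i]);
    gemm_qkT(q_wide, k_wide, probs, m, n, k);
  }
}

int main() {
  const float Q[4] = {1, 2, 3, 4};  // 2 x 2
  const float K[4] = {1, 0, 0, 1};  // 2 x 2 identity, so Q*K' == Q
  double probs[4];                  // wide scratch, as in the non-MLAS fallback
  compute_probs(Q, K, probs, 2, 2, 2);
  std::printf("%g %g %g %g\n", probs[0], probs[1], probs[2], probs[3]);
  return 0;
}
```

One note on the fp16 path that follows in this hunk: the masked-position fill value MLFloat16::FromBits(0xFBFF) is the most negative finite half (-65504), i.e. the fp16 counterpart of the std::numeric_limits<float>::lowest() used on the float path.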
- T* output_softmax = output; + U* output_softmax = output; int layout_id = head_index % parameters.num_sparse_layout; bool is_sparse_layout = layout_has_sparse[layout_id]; @@ -224,7 +264,11 @@ class SparseAttentionBase { int causal_length = past_seq_len + q_id + 1; ComputeAttentionSoftmaxInplace(output_softmax, 1, causal_length, nullptr); for (int remain_seq_id = causal_length; remain_seq_id < total_seq_len; remain_seq_id++) { - output_softmax[remain_seq_id] = 0.f; + if constexpr (std::is_same::value) { + output_softmax[remain_seq_id] = 0.f; + } else { + output_softmax[remain_seq_id] = MLFloat16::FromBits(static_cast(0)); + } } output_softmax += total_seq_len; } @@ -278,14 +322,23 @@ class SparseAttentionBase { // Update inline according to attention mask. if (has_sparse) { for (int s = 0; s < causal_length; s++) { - if (mask[s] == 0) - output_softmax[s] = std::numeric_limits::lowest(); + if (mask[s] == 0) { + if constexpr (std::is_same::value) { + output_softmax[s] = std::numeric_limits::lowest(); + } else { + output_softmax[s] = MLFloat16::FromBits(static_cast(0xFBFF)); + } + } } } ComputeAttentionSoftmaxInplace(output_softmax, 1, causal_length, nullptr); for (int remain_seq_id = causal_length; remain_seq_id < total_seq_len; remain_seq_id++) { - output_softmax[remain_seq_id] = 0.f; + if constexpr (std::is_same::value) { + output_softmax[remain_seq_id] = 0.f; + } else { + output_softmax[remain_seq_id] = MLFloat16::FromBits(static_cast(0)); + } } output_softmax += total_seq_len; @@ -299,9 +352,9 @@ class SparseAttentionBase { }); } - template + template void ComputeVxAttentionScore(T* output, // buffer for the result with size BxSxNxH - const T* attention_probs, // Softmax of Q*K' with size BxNxSxT + const U* attention_probs, // Softmax of Q*K' with size BxNxSxT const T* V, // v value with size BxN_kvxSxH const int32_t* total_key_lengths, // total sequence lengths int batch_size, // batch size @@ -315,7 +368,8 @@ class SparseAttentionBase { T* present_value, // present value only bool past_present_share_buffer, // whether past_key and present_key share the buffer bool packed_qkv, // whether Q, K, V are packed - ThreadPool* tp) const { + ThreadPool* tp, + AllocatorPtr allocator) const { const bool is_prompt = sequence_length == total_sequence_length; const ptrdiff_t packed_batch_stride = packed_qkv ? 
SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size @@ -341,6 +395,13 @@ class SparseAttentionBase { unit_cost.bytes_stored += bytes_to_copy_value; } + size_t output_fp32_bytes = 0; + if constexpr (std::is_same::value && std::is_same::value) { + output_fp32_bytes = SafeInt(sequence_length) * batch_size * num_heads_ * head_size * sizeof(float); + } + auto output_fp32 = allocator->Alloc(output_fp32_bytes); + BufferUniquePtr scratch_buffer(output_fp32, BufferDeleter(allocator)); + DUMP_CPU_TENSOR_INIT(); ThreadPool::TryParallelFor( @@ -376,14 +437,42 @@ class SparseAttentionBase { DUMP_CPU_TENSOR("attention_probs", attention_probs + attention_probs_offset, sequence_length, total_seq_len); - math::GemmEx(CblasNoTrans, CblasNoTrans, sequence_length, head_size, total_seq_len, - 1.f, /*alpha*/ - attention_probs + attention_probs_offset, total_seq_len, v, - head_size, 0.0f /*beta*/, output_current, hidden_size, nullptr); + if constexpr (std::is_same::value) { + math::GemmEx(CblasNoTrans, CblasNoTrans, sequence_length, head_size, total_seq_len, + 1.f, /*alpha*/ + attention_probs + attention_probs_offset, total_seq_len, v, + head_size, 0.0f /*beta*/, output_current, hidden_size, nullptr); + } else if constexpr (std::is_same::value) { + MlasGemm(CblasNoTrans, CblasNoTrans, sequence_length, head_size, total_seq_len, + attention_probs + attention_probs_offset, total_seq_len, + v, head_size, output_current, hidden_size, + MLFloat16(1.0f).val, static_cast(0) /*beta*/, nullptr); + } else { + size_t bytes = static_cast(head_size) * total_seq_len * sizeof(float); + auto v_fp32 = allocator->Alloc(bytes); + BufferUniquePtr scratch_buffer(v_fp32, BufferDeleter(allocator)); + + float* v_fp32_ptr = static_cast(v_fp32); + MlasConvertHalfToFloatBuffer(v, v_fp32_ptr, static_cast(head_size) * total_seq_len); + + float* output_fp32_current = static_cast(output_fp32) + + (batch_index * sequence_length * num_heads_ + head_index) * head_size; + math::GemmEx(CblasNoTrans, CblasNoTrans, sequence_length, head_size, total_seq_len, + 1.f, /*alpha*/ attention_probs + attention_probs_offset, + total_seq_len, v_fp32_ptr, + head_size, 0.0f /*beta*/, output_fp32_current, + hidden_size, nullptr); + } DUMP_CPU_TENSOR("out", attention_probs + attention_probs_offset, sequence_length, head_size); } }); + + if constexpr (std::is_same::value && std::is_same::value) { + MlasConvertFloatToHalfBuffer(static_cast(output_fp32), + output, + SafeInt(sequence_length) * batch_size * num_heads_ * head_size); + } } }; diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc index 12fae5ccf0983..885827fb09e7e 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc @@ -68,6 +68,10 @@ REGISTER_KERNEL_TYPED(float) namespace transformers { +constexpr const char* kBeamSearchNotSupportFp16InCpu = + "BeamSearch does not support float16 model on CPU execution provider. 
" + "Use float32 model or CUDA execution provider instead."; + void BeamSearch::Init(const OpKernelInfo& info) { parameters_->ParseFromAttributes(info); @@ -139,13 +143,19 @@ Status BeamSearch::SetupSubgraphExecutionInfo(const SessionState& session_state, ORT_RETURN_IF_ERROR(t5_encoder_subgraph_->Setup(session_state, subgraph_session_state)); encoder_feeds_fetches_manager_ = t5_encoder_subgraph_->GetFeedsFetchesManager(); - if (parameters_->decoder_start_token_id < 0) { - ORT_RETURN_IF(t5_encoder_subgraph_->num_subgraph_inputs != 2, - "Encoder subgraph shall have 2 inputs when decoder_start_token_id attribute is empty"); + if (!t5_encoder_subgraph_->HasLogitsOutput()) { + // New format requires start token id. + ORT_ENFORCE(parameters_->decoder_start_token_id >= 0); } else { - ORT_RETURN_IF(t5_encoder_subgraph_->num_subgraph_inputs != 3, - "Encoder subgraph shall have 3 inputs when decoder_start_token_id attribute is available"); + if (parameters_->decoder_start_token_id < 0) { + ORT_RETURN_IF(t5_encoder_subgraph_->num_subgraph_inputs != 2, + "Encoder subgraph shall have 2 inputs when decoder_start_token_id attribute is empty"); + } else { + ORT_RETURN_IF(t5_encoder_subgraph_->num_subgraph_inputs != 3, + "Encoder subgraph shall have 3 inputs when decoder_start_token_id attribute is available"); + } } + } else if (attribute_name == "decoder") { ORT_ENFORCE(t5_decoder_subgraph_ == nullptr, "SetupSubgraphExecutionInfo should only be called once for each subgraph."); @@ -209,6 +219,8 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const { // Make a copy of parameters since we will update it based on inputs later BeamSearchParameters parameters = *parameters_; + const bool is_cpu_provider = ctx->GetComputeStream() == nullptr; + if (parameters.model_type == IGenerationParameters::kModelTypeGpt) { if (!gpt_subgraph_->IsOutputFloat16()) { // Output float32 BeamSearchGpt impl{ @@ -234,6 +246,10 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const { return impl.Execute(init_run_decoder_feeds_fetches_manager_, *decoder_feeds_fetches_manager_); } else { // Output float16 + if (is_cpu_provider) { + ORT_THROW(kBeamSearchNotSupportFp16InCpu); + } + BeamSearchGpt impl{ *ctx_internal, has_init_decoder_ ? 
init_run_decoder_session_state : nullptr, @@ -250,6 +266,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const { device_copy_int32_func_, update_gpt_feeds_fp16_func_, create_beam_scorer_func_}; + #ifdef USE_CUDA ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_)); #endif @@ -288,6 +305,10 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const { return impl.Execute(*encoder_feeds_fetches_manager_, *decoder_feeds_fetches_manager_); } else { + if (is_cpu_provider) { + ORT_THROW(kBeamSearchNotSupportFp16InCpu); + } + BeamSearchT5 impl{ *ctx_internal, *encoder_session_state, *decoder_session_state, *t5_encoder_subgraph_, *t5_decoder_subgraph_, thread_pool, ctx->GetComputeStream(), dumper_, parameters, @@ -303,6 +324,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const { expand_buffer_float_func_, expand_buffer_float16_func_, create_beam_scorer_func_}; + #ifdef USE_CUDA ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_)); #endif @@ -340,6 +362,10 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const { return impl.Execute(*encoder_feeds_fetches_manager_, *decoder_feeds_fetches_manager_); } else { + if (is_cpu_provider) { + ORT_THROW(kBeamSearchNotSupportFp16InCpu); + } + BeamSearchWhisper impl{ *ctx_internal, *encoder_session_state, *decoder_session_state, *whisper_encoder_subgraph_, *whisper_decoder_subgraph_, thread_pool, ctx->GetComputeStream(), dumper_, parameters, diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h index b67d003eaceeb..c9646cf0fab2e 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h @@ -51,7 +51,13 @@ class BeamSearchT5 : public BeamSearchBase { expand_buffer_int32_func_(expand_buffer_int32_func), expand_buffer_float_func_(expand_buffer_float_func), expand_buffer_float16_func_(expand_buffer_float16_func), - create_beam_scorer_func_(create_beam_scorer_func) {} + create_beam_scorer_func_(create_beam_scorer_func) { + // When decoder uses encoder_hidden_state, make sure the encoder outputs it. + if (decoder_subgraph_.UseEncoderHiddenState()) { + ORT_ENFORCE(encoder_subgraph_.subgraph_output_names[1] == "encoder_hidden_states"); + } + ORT_ENFORCE(encoder_subgraph_.num_layers == decoder_subgraph_.num_layers); + } #ifdef USE_CUDA Status InitializeCuda( @@ -160,7 +166,7 @@ Status BeamSearchT5::Execute(const FeedsFetchesManager& encoder_feeds_fetches this->create_encoder_inputs_func_, this->add_to_feeds_func_, buffer, - decoder_input_ids, + decoder_input_ids, // new format does not use decoder_input_ids in encoder, it is still initialized here when decoder_start_token_id >= 0. this->ort_stream_)); #ifdef DEBUG_NODE_INPUTS_OUTPUTS @@ -233,35 +239,47 @@ Status BeamSearchT5::Execute(const FeedsFetchesManager& encoder_feeds_fetches std::vector decoder_fetches; - if (current_length + 1 < parameters->max_length) { + // When encoder outputs logits (in old format), we need get the next token from logits. 
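The guard added to Compute() rejects float16 subgraphs when running on the CPU execution provider, using a null compute stream as the CPU signal. A hedged sketch of that check follows; Stream and CheckFp16Supported are placeholder names, not ORT types.

```cpp
// Sketch of the fp16-on-CPU guard pattern used in this hunk (placeholder names).
#include <stdexcept>

struct Stream;  // opaque device stream; the CPU execution provider has none (nullptr)

void CheckFp16Supported(const Stream* compute_stream, bool subgraph_output_is_fp16) {
  const bool is_cpu_provider = (compute_stream == nullptr);
  if (is_cpu_provider && subgraph_output_is_fp16) {
    throw std::runtime_error(
        "BeamSearch does not support float16 model on CPU execution provider. "
        "Use float32 model or CUDA execution provider instead.");
  }
}
```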
+ if (current_length + 1 < parameters->max_length && encoder_subgraph_.HasLogitsOutput()) { ++iteration_counter; - ORT_RETURN_IF_ERROR(this->GenerateNextToken(encoder_fetches[0], + const OrtValue& logits = encoder_fetches[0]; + ORT_RETURN_IF_ERROR(this->GenerateNextToken(logits, beam_next_tokens, beam_state, cpu_state, iteration_counter)); ++current_length; // Increase sequence length after a new token is generated. + } - ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds(this->cpu_allocator_, - ReinterpretAsSpan(beam_next_tokens), - this->implicit_inputs_, - encoder_feeds, - encoder_fetches, - decoder_feeds, - this->device_copy_int32_func_, - this->expand_buffer_int32_func_, - this->expand_buffer_float_func_, - this->expand_buffer_float16_func_, - parameters->num_beams, - this->ort_stream_, - decoder_subgraph_.UseSequenceAsInputIds(), - current_length, - cpu_state.sequences, - parameters->max_length, - decoder_subgraph_.has_decoder_masked_attention_, - this->cuda_device_prop_ != nullptr)); + if (current_length < parameters->max_length) { + // when no logits, copy sequence (filled with start token IDs) to input_ids for decoder. + bool copy_sequence_to_input_ids = decoder_subgraph_.UseSequenceAsInputIds() || !encoder_subgraph_.HasLogitsOutput(); + if (copy_sequence_to_input_ids) { + ORT_ENFORCE(current_length == cpu_state.sequences.GetSequenceLength()); + } + + // Generate inputs for next decoder subgraph call. + ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds( + this->cpu_allocator_, + ReinterpretAsSpan(beam_next_tokens), + this->implicit_inputs_, + encoder_feeds, + encoder_fetches, + decoder_feeds, + this->device_copy_int32_func_, + this->expand_buffer_int32_func_, + this->expand_buffer_float_func_, + this->expand_buffer_float16_func_, + parameters->num_beams, + this->ort_stream_, + copy_sequence_to_input_ids, + cpu_state.sequences, + parameters->max_length, + decoder_subgraph_.has_decoder_masked_attention_, + this->cuda_device_prop_ != nullptr)); if (decoder_subgraph_.past_present_share_buffer_) { + // Configure buffer sharing of past and present kv cache. 
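As a quick reference for the control flow just introduced: the first decoder step either consumes the encoder's logits (old format) or copies the start-token-filled sequences into input_ids (new format). A self-contained sketch under assumed names follows; PlanFirstDecoderStep is illustrative, not an ORT function.

```cpp
// Sketch of the first-iteration decision logic added in this hunk (hypothetical names).
#include <cassert>

struct FirstStepPlan {
  bool generate_token_from_encoder_logits;  // old format only
  bool copy_sequence_to_input_ids;          // new format, or decoders that take the full sequence
};

FirstStepPlan PlanFirstDecoderStep(bool encoder_has_logits,
                                   bool decoder_uses_sequence_as_input_ids,
                                   int current_length, int max_length) {
  FirstStepPlan plan{};
  plan.generate_token_from_encoder_logits =
      encoder_has_logits && (current_length + 1 < max_length);
  // When the encoder produced no logits, the sequences are pre-filled with
  // decoder_start_token_id, so the decoder must receive the whole sequence.
  plan.copy_sequence_to_input_ids =
      decoder_uses_sequence_as_input_ids || !encoder_has_logits;
  return plan;
}

int main() {
  FirstStepPlan old_fmt = PlanFirstDecoderStep(true, false, 1, 64);
  FirstStepPlan new_fmt = PlanFirstDecoderStep(false, false, 1, 64);
  assert(old_fmt.generate_token_from_encoder_logits && !old_fmt.copy_sequence_to_input_ids);
  assert(!new_fmt.generate_token_from_encoder_logits && new_fmt.copy_sequence_to_input_ids);
  return 0;
}
```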
decoder_fetches.reserve(static_cast(decoder_subgraph_.GetFirstPresentOutputIndex()) + 2 * static_cast(decoder_subgraph_.num_layers)); decoder_fetches.resize(decoder_subgraph_.GetFirstPresentOutputIndex(), OrtValue()); @@ -299,14 +317,19 @@ Status BeamSearchT5::Execute(const FeedsFetchesManager& encoder_feeds_fetches while (current_length < parameters->max_length) { iteration_counter++; + #ifdef DEBUG_GENERATION - auto cur_len = std::to_string(current_length); - dumper->Print("***CurrentLength", cur_len, true); + dumper->Print(::onnxruntime::MakeString("Iteration=", iteration_counter, + ", CurrentLength=", current_length, + ", num_layers=", decoder_subgraph_.num_layers, + ", decoder_feeds=", decoder_feeds.size(), + ", start_token_id=", parameters->decoder_start_token_id)); for (int i = 0; i < decoder_subgraph_.GetFirstPastInputIndex(); i++) { dumper->Print("decoder_feeds", i, true); dumper->Print("", decoder_feeds[i]); } + for (int i = 0; i < decoder_subgraph_.num_layers; i++) { int self_key_idx = decoder_subgraph_.GetFirstPastInputIndex() + 2 * i; int self_value_idx = self_key_idx + 1; diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc index 7757435990a65..537d066b264a1 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc @@ -36,12 +36,9 @@ Subgraph::Subgraph( auto& subgraph_inputs = subgraph.GetInputs(); auto& subgraph_outputs = subgraph.GetOutputs(); - // inputs: input_ids, position_ids, attention_mask, past_0, past_1, ... - // outputs: logits, present_0, present_1, ... num_subgraph_inputs = static_cast(subgraph_inputs.size()); num_subgraph_outputs = static_cast(subgraph_outputs.size()); - // CheckSubgraph will verify inputs and outputs later. subgraph_input_names.reserve(num_subgraph_inputs); for (int i = 0; i < num_subgraph_inputs; ++i) { subgraph_input_names.push_back(subgraph_inputs[i]->Name()); @@ -68,10 +65,9 @@ Status Subgraph::Setup(const SessionState& session_state, InlinedVector feed_names; feed_names.reserve(static_cast(num_subgraph_inputs) + static_cast(num_implicit_inputs)); - // Use the first output (logits) to find device location. + // Use the first output to find device location. const OrtDevice& default_location = utils::FindDeviceForValue(subgraph_session_state, subgraph_output_names[0]); - // The position_ids, attention_mask, past_0, ... are created by this operator so the name doesn't matter. 
feed_names.insert(feed_names.end(), subgraph_input_names.begin(), subgraph_input_names.end()); const auto& subgraph_map = subgraph_session_state.GetOrtValueNameIdxMap(); @@ -174,13 +170,15 @@ Status Subgraph::GetParameters(const ONNX_NAMESPACE::TensorShapeProto* past_shap } // Logits shape is like (batch_size, seq_len, vocabulary_size) - ORT_RETURN_IF(logits_shape->dim_size() != 3, - "subgraph logits output is expected to have 3 dimension, got ", logits_shape->dim_size()); + if (logits_shape != nullptr) { + ORT_RETURN_IF(logits_shape->dim_size() != 3, + "subgraph logits output is expected to have 3 dimension, got ", logits_shape->dim_size()); - ORT_RETURN_IF(!logits_shape->dim(2).has_dim_value() || logits_shape->dim(2).dim_value() <= 0, - "subgraph past state dimension 2 shall have a positive value for vocabulary size"); + ORT_RETURN_IF(!logits_shape->dim(2).has_dim_value() || logits_shape->dim(2).dim_value() <= 0, + "subgraph past state dimension 2 shall have a positive value for vocabulary size"); - this->vocab_size = static_cast(logits_shape->dim(2).dim_value()); + this->vocab_size = static_cast(logits_shape->dim(2).dim_value()); + } return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc index 997beb198f450..09bce9828aa33 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc @@ -6,11 +6,12 @@ #include "core/framework/tensorprotoutils.h" #include "core/framework/utils.h" #include "core/providers/cpu/tensor/utils.h" -#include #include "contrib_ops/cpu/transformers/subgraph_t5_decoder.h" #include "contrib_ops/cpu/utils/dump_tensor.h" #include "contrib_ops/cpu/transformers/generation_device_helper.h" #include "contrib_ops/cpu/transformers/sequences.h" +#include +#include namespace onnxruntime { namespace contrib { @@ -20,9 +21,9 @@ namespace transformers { Inputs: input_ids: int32 (B, 1) - encoder_input_ids: int32 (B, encode_sequence_length) (optional) + encoder_input_ids: int32 (B, encode_sequence_length) (optional for old format; removed in new format) encoder_attention_mask: int32 (B, encode_sequence_length) - encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) (optional) + encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) (optional for old format; removed in new format) past_key_self_0: (B, num_heads, past_decode_sequence_length, head_size) past_value_self_0: (B, num_heads, past_decode_sequence_length, head_size) @@ -141,14 +142,23 @@ Status T5DecoderSubgraph::Validate(const std::vector& subgraph_i } // Create inputs for decoder from the following data sources: -// encoder feeds: encoder_input_ids, encoder_attention_mask, decoder_input_ids (with start tokens) -// encoder fetches: logits, -// encoder_hidden_states, -// present_key_self_0, present_value_self_0, ..., present_key_cross_0, present_value_cross_0, ... -// decoder_feeds: input_ids, -// encoder_attention_mask, -// encoder_hidden_states, -// present_key_self_0, present_value_self_0, ..., present_key_cross_0, present_value_cross_0, ... +// New format: +// encoder feeds: encoder_input_ids, encoder_attention_mask +// encoder fetches: present_key_cross_0, present_value_cross_0, ... +// decoder_feeds: input_ids, encoder_attention_mask, +// present_key_self_0, present_value_self_0, ..., +// present_key_cross_0, present_value_cross_0, ... 
+// past_seq_len (optional), num_beams (optional), cache_indirection (optional) +// +// Old format: +// encoder feeds: encoder_input_ids, encoder_attention_mask, decoder_input_ids (with start tokens) +// encoder fetches: logits, encoder_hidden_states, +// present_key_self_0, present_value_self_0, ..., +// present_key_cross_0, present_value_cross_0, ... +// decoder_feeds: input_ids, encoder_input_ids (optional), encoder_attention_mask, encoder_hidden_states (optional), +// present_key_self_0, present_value_self_0, ..., +// present_key_cross_0, present_value_cross_0, ... +// past_seq_len (optional), num_beams (optional), cache_indirection (optional) Status T5DecoderSubgraph::CreateInitialFeeds( AllocatorPtr cpu_allocator, gsl::span beam_next_tokens, @@ -162,8 +172,7 @@ Status T5DecoderSubgraph::CreateInitialFeeds( const GenerationDeviceHelper::ExpandBufferFunc& expand_buffer_float16_func, int num_beam, Stream* stream, - bool use_sequence_as_input_ids, - int cur_len, + bool copy_sequence_to_input_ids, transformers::Sequences& sequences, int past_present_share_buffer_max_seq_len, bool need_cache_indir, @@ -173,34 +182,30 @@ Status T5DecoderSubgraph::CreateInitialFeeds( // Allocate subgraph inputs from same device as inputs of encoder subgraph. AllocatorPtr allocator = session_state_->GetAllocator(encoder_feeds[0].Get().Location()); + int batch_beam_size = static_cast(encoder_fetches[0].Get().Shape()[0]) * num_beam; + // Copy beam next tokens in CPU to input_ids in provider device (CPU for CPU EP, or GPU for CUDA EP). - int batch_beam_size = static_cast(beam_next_tokens.size()); - int sequence_length = !use_sequence_as_input_ids ? 1 : cur_len; + int sequence_length = !copy_sequence_to_input_ids ? 1 : sequences.GetSequenceLength(); int64_t dims[] = {batch_beam_size, sequence_length}; TensorShape input_ids_shape(&dims[0], 2); OrtValue input_ids; Tensor::InitOrtValue(DataTypeImpl::GetType(), input_ids_shape, allocator, input_ids); - int32_t* input_ids_data = input_ids.GetMutable()->MutableData(); - AllocatorPtr buffer_allocator = std::make_shared(); - size_t total_size = static_cast(cur_len) * static_cast(batch_beam_size); - size_t total_size_bytes = total_size * sizeof(int); - auto seq_copy = IAllocator::MakeUniquePtr(buffer_allocator, total_size_bytes, false, stream); - int* seq_copy_ptr = seq_copy.get(); - - if (!use_sequence_as_input_ids_) { + + // Prepare data for input_ids. + if (!copy_sequence_to_input_ids) { // use next tokens for input_ids. ORT_RETURN_IF_ERROR(device_copy_int32_func( input_ids.GetMutable()->MutableDataAsSpan(), beam_next_tokens, stream, DeviceCopyDirection::hostToDevice)); - } else { + } else { // use whole sequences for input_ids. 
+ int32_t* input_ids_data = input_ids.GetMutable()->MutableData(); if (use_cuda) { auto sequences_buffer = sequences.GetCurrentDeviceSequences(); for (int i = 0; i < batch_beam_size; i++) { - size_t batch_beam_stride = static_cast(i) * static_cast(sequences.GetMaxLength()); - int seq_size = sequences.GetSequenceLength(); - gsl::span sequence = sequences_buffer.subspan(batch_beam_stride, seq_size); - gsl::span temp_input(input_ids_data + static_cast(i) * seq_size, seq_size); + size_t offset = static_cast(i) * static_cast(sequences.GetMaxLength()); + gsl::span sequence = sequences_buffer.subspan(offset, sequence_length); + gsl::span temp_input(input_ids_data + static_cast(i) * sequence_length, sequence_length); ORT_RETURN_IF_ERROR(device_copy_int32_func( temp_input, sequence, @@ -208,12 +213,19 @@ Status T5DecoderSubgraph::CreateInitialFeeds( DeviceCopyDirection::deviceToDevice)); } } else { - const size_t cur_len_bytes = cur_len * sizeof(int); + size_t total_size = static_cast(sequence_length) * static_cast(batch_beam_size); + size_t total_size_bytes = total_size * sizeof(int); + AllocatorPtr buffer_allocator = std::make_shared(); + // TODO: not need extra buffer. Copy directly to input_ids_data instead like the user_cuda above. + auto seq_copy = IAllocator::MakeUniquePtr(buffer_allocator, total_size_bytes, false, stream); + int* seq_copy_ptr = seq_copy.get(); + + const size_t sequence_bytes = sequence_length * sizeof(int); for (int i = 0; i < batch_beam_size; i++) { gsl::span sequence = sequences.GetSequence(i); const int32_t* sequence_data = sequence.data(); - ptrdiff_t seq_index = static_cast(i) * cur_len; - memcpy(seq_copy_ptr + seq_index, sequence_data, cur_len_bytes); + ptrdiff_t seq_index = static_cast(i) * sequence_length; + memcpy(seq_copy_ptr + seq_index, sequence_data, sequence_bytes); } gsl::span temp_input(input_ids_data, total_size); gsl::span temp_sequence(seq_copy_ptr, total_size); @@ -227,9 +239,11 @@ Status T5DecoderSubgraph::CreateInitialFeeds( // The ordering is the same as used in Setup. decoder_feeds.reserve(static_cast(num_subgraph_inputs) + static_cast(num_implicit_inputs)); + + // input 0: input_ids decoder_feeds.push_back(input_ids); - if (has_encoder_input_ids_) { + if (has_encoder_input_ids_) { // encoder_input_ids is optional // The encoder_input_ids is copied from the first input of encoder. OrtValue expanded_encoder_input_ids; ORT_RETURN_IF_ERROR(expand_buffer_int32_func(stream, @@ -251,70 +265,66 @@ Status T5DecoderSubgraph::CreateInitialFeeds( expanded_decoder_attention_masks, false, 0 /*max_sequence_length*/)); - decoder_feeds.push_back(expanded_decoder_attention_masks); if (!past_present_share_buffer_) { past_present_share_buffer_max_seq_len = 0; } - // When first_past_input_index_ == 3, the encoder_hidden_states and past states are copied from the second output - // of encoder. - // When first_past_input_index_ == 2, the past states are copied from the second output of encoder. - // TODO - probably more robust to introduce a encoder_out/decoder_in mapping instead of relying on positions. - // What happens if encoder_hidden_states is present in the encoder_fetches but not in the decoder_feeds? 
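The CPU branch above flattens one sequence per beam into a contiguous (batch_beam_size x sequence_length) buffer before copying it into input_ids. A standalone sketch of that copy, with std::vector standing in for the IAllocator scratch buffer:

```cpp
// Sketch of the per-beam sequence flattening done on the CPU path (not ORT code).
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<int32_t> FlattenSequences(const std::vector<std::vector<int32_t>>& sequences,
                                      int sequence_length) {
  const size_t batch_beam_size = sequences.size();
  std::vector<int32_t> input_ids(batch_beam_size * static_cast<size_t>(sequence_length));
  for (size_t i = 0; i < batch_beam_size; ++i) {
    assert(sequences[i].size() == static_cast<size_t>(sequence_length));
    std::memcpy(input_ids.data() + i * sequence_length,
                sequences[i].data(),
                sequence_length * sizeof(int32_t));
  }
  return input_ids;
}

int main() {
  // Two beams; in the new format each sequence initially holds only the start token.
  std::vector<std::vector<int32_t>> sequences = {{0}, {0}};
  auto input_ids = FlattenSequences(sequences, /*sequence_length=*/1);
  assert(input_ids.size() == 2 && input_ids[0] == 0 && input_ids[1] == 0);
  return 0;
}
```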
- for (size_t j = static_cast(2) - has_hidden_state_; j < encoder_fetches.size(); j++) { - if (j == 1) { - ORT_RETURN_IF(has_hidden_state_ == false, "Invalid hidden_states expension: has_hidden_state_ == false"); - OrtValue expanded_hidden_states; - if (is_output_float16_) { - ORT_RETURN_IF_ERROR(expand_buffer_float16_func(stream, - encoder_fetches[j], - num_beam, - allocator, - expanded_hidden_states, - false, - 0 /*max_sequence_length*/)); - } else { - ORT_RETURN_IF_ERROR(expand_buffer_float_func(stream, - encoder_fetches[j], - num_beam, - allocator, - expanded_hidden_states, - false, - 0 /*max_sequence_length*/)); - } - decoder_feeds.push_back(expanded_hidden_states); - } else { +// macro to expand encoder outputs and append to decoder feeds. +#define ADD_DECODER_FEED(encoder_output, is_dynamic_kv_cache) \ + OrtValue expanded; \ + if (is_output_float16_) { \ + ORT_RETURN_IF_ERROR(expand_buffer_float16_func(stream, encoder_output, num_beam, allocator, expanded, false, \ + is_dynamic_kv_cache ? past_present_share_buffer_max_seq_len : 0)); \ + } else { \ + ORT_RETURN_IF_ERROR(expand_buffer_float_func(stream, encoder_output, num_beam, allocator, expanded, false, \ + is_dynamic_kv_cache ? past_present_share_buffer_max_seq_len : 0)); \ + } \ + decoder_feeds.push_back(expanded); + + // The encoder_hidden_states is copied from the second output of encoder. + if (has_hidden_state_) { + ADD_DECODER_FEED(encoder_fetches[1], false); + } + + // New format of encoder has only cross outputs. + bool is_new_format = (static_cast(encoder_fetches.size()) == 2 * num_layers); + if (is_new_format) { + for (int i = 0; i < 2 * num_layers; i++) { + // cross shape is (batch_size, num_heads, encode_sequence_length, head_size) + const TensorShape& cross_shape = encoder_fetches[0].Get().Shape(); + ORT_ENFORCE(cross_shape.NumDimensions() == 4); + + // Shape for kv cache: (batch_size * num_beam, num_heads, max_seq_len, head_size) + int64_t cache_dims[4] = {0}; + cross_shape.CopyDims(cache_dims, cross_shape.NumDimensions()); + cache_dims[0] *= num_beam; + cache_dims[2] = past_present_share_buffer_max_seq_len; + TensorShape expanded_shape(&cache_dims[0], cross_shape.NumDimensions()); + + MLDataType element_type = encoder_fetches[0].Get().DataType(); + OrtValue past; + Tensor::InitOrtValue(element_type, expanded_shape, allocator, past); + decoder_feeds.push_back(past); + } + + // Add cross inputs from encoder output. + for (size_t j = 0; j < encoder_fetches.size(); j++) { + ADD_DECODER_FEED(encoder_fetches[j], false); + } + } else { + // present_* output of encoder are added as decoder inputs. + for (size_t j = 2; j < encoder_fetches.size(); j++) { // past key/value for cross attention does not need to be initialized with max_seq_len since they are static. - bool use_max_seq_len = (j - first_past_input_index_) < 2 * static_cast(num_layers); - - OrtValue expanded_cache; - if (is_output_float16_) { - ORT_RETURN_IF_ERROR(expand_buffer_float16_func(stream, - encoder_fetches[j], - num_beam, - allocator, - expanded_cache, - false, - use_max_seq_len ? past_present_share_buffer_max_seq_len : 0)); - } else { - ORT_RETURN_IF_ERROR(expand_buffer_float_func(stream, - encoder_fetches[j], - num_beam, - allocator, - expanded_cache, - false, - use_max_seq_len ? 
past_present_share_buffer_max_seq_len : 0)); - } - decoder_feeds.push_back(expanded_cache); + bool is_dynamic_kv_cache = (j - first_past_input_index_) < 2 * static_cast(num_layers); + ADD_DECODER_FEED(encoder_fetches[j], is_dynamic_kv_cache); } } - // TODO: This part shares the similar logic with CreateInitialFeeds() in subgraph_gpt.cc. We should refactor it. if (past_present_share_buffer_) { - // Past sequence length feed - ORT_RETURN_IF_ERROR(AppendPastSequenceLength(decoder_feeds, cpu_allocator, 1)); + // Past sequence length set to 0 + ORT_RETURN_IF_ERROR(AppendPastSequenceLength(decoder_feeds, cpu_allocator, is_new_format ? 0 : 1)); // Add beam search specific inputs if (need_cache_indir) { const int64_t batch_size = static_cast(batch_beam_size / num_beam); diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h index b5d727b67924c..87782d47cdbe1 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h @@ -45,7 +45,6 @@ class T5DecoderSubgraph : public Subgraph { int num_beam, Stream* stream, bool use_sequence_as_input_ids, - int cur_len, transformers::Sequences& sequences, int past_present_share_buffer_max_seq_len = -1, bool need_cache_indir = false, @@ -72,6 +71,10 @@ class T5DecoderSubgraph : public Subgraph { return use_sequence_as_input_ids_; } + inline bool UseEncoderHiddenState() const { + return has_hidden_state_; + } + protected: int first_past_input_index_; int first_present_output_index_; diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc index d59db4afac2c2..a54c0d960980c 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc @@ -15,70 +15,97 @@ namespace transformers { /* T5 Encoder Subgraph (It also contains decoder initialization where decoder_input_ids are filled with start token ID). - Inputs: + New format: + Inputs: encoder_input_ids: int32 (B, encode_sequence_length) encoder_attention_mask: int32 (B, encode_sequence_length) - decoder_input_ids: int32 (B, 1) Outputs: - logits: (B, 1, vocab_size) - encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) - - present_key_self_0: (B, num_heads, 1, head_size) - present_value_self_0: (B, num_heads, 1, head_size) - ... (for each self attention layer) - present_key_cross_0: (B, num_heads, encode_sequence_length, head_size) present_value_cross_0: (B, num_heads, encode_sequence_length, head_size) ... (for each cross attention layer) - Note: - Here, B = batch_size * num_beams since we expand the inputs. - Ideally, we could use B=batch_size and expand the outputs with a factor of num_beams. - Data type of input or output is float or float16 if not specified. + Old format: + Inputs: + encoder_input_ids: int32 (B, encode_sequence_length) + encoder_attention_mask: int32 (B, encode_sequence_length) + decoder_input_ids: int32 (B, 1) + + Outputs: + logits: (B, 1, vocab_size) + encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) + + present_key_self_0: (B, num_heads, 1, head_size) + present_value_self_0: (B, num_heads, 1, head_size) + ... (for each self attention layer) + + present_key_cross_0: (B, num_heads, encode_sequence_length, head_size) + present_value_cross_0: (B, num_heads, encode_sequence_length, head_size) + ... 
(for each cross attention layer) + + Note: + Here, B = batch_size * num_beams since we expand the inputs. + Ideally, we could use B=batch_size and expand the outputs with a factor of num_beams. + Data type of input or output is float or float16 if not specified. */ Status T5EncoderSubgraph::Validate(const std::vector& subgraph_inputs, const std::vector& subgraph_outputs) { - ORT_RETURN_IF(num_subgraph_inputs != 3, "expect 3 inputs, got:", num_subgraph_inputs); - - ORT_RETURN_IF(num_subgraph_outputs < 6, "expect >=6 outputs, got:", num_subgraph_outputs); - ORT_RETURN_IF((static_cast(subgraph_outputs.size()) - first_present_output_index_) % 4 != 0, - "number of outputs expected to be 2 + 4 * layers, got:", num_subgraph_outputs); + constexpr auto int32_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32; + constexpr auto float32_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT; + constexpr auto float16_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16; + ORT_RETURN_IF(num_subgraph_inputs != 2 && num_subgraph_inputs != 3, "expect 2 or 3 inputs, got:", num_subgraph_inputs); ORT_RETURN_IF(subgraph_inputs[0]->Name() != "encoder_input_ids", "encoder subgraph input 0 shall be named as encoder_input_ids, got: ", subgraph_inputs[0]->Name()); ORT_RETURN_IF(subgraph_inputs[1]->Name() != "encoder_attention_mask", "encoder subgraph input 1 shall be named as encoder_attention_mask, got: ", subgraph_inputs[1]->Name()); - ORT_RETURN_IF(subgraph_inputs[2]->Name() != "decoder_input_ids", - "encoder subgraph input 2 shall be named as decoder_input_ids, got: ", subgraph_inputs[2]->Name()); - - ORT_RETURN_IF(subgraph_outputs[0]->Name() != "logits", - "encoder subgraph output 0 shall be named as logits, got: ", subgraph_outputs[0]->Name()); - ORT_RETURN_IF(subgraph_outputs[1]->Name() != "encoder_hidden_states", - "encoder subgraph output 1 shall be named encoder_hidden_states, got: ", subgraph_outputs[1]->Name()); - ORT_RETURN_IF(subgraph_outputs[2]->Name() != "present_key_self_0", - "encoder subgraph output 2 shall be named as present_key_self_0, got: ", subgraph_outputs[2]->Name()); - ORT_RETURN_IF(subgraph_outputs[3]->Name() != "present_value_self_0", - "encoder subgraph output 3 shall be named as present_value_self_0, got: ", subgraph_outputs[3]->Name()); - - const ONNX_NAMESPACE::TensorShapeProto* past_shape = subgraph_outputs[2]->Shape(); - const ONNX_NAMESPACE::TensorShapeProto* logits_shape = subgraph_outputs[0]->Shape(); - - // Save parameters related to the subgraph. 
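The layer-count arithmetic behind the two validation branches above, written out as a small function (assumed from the checks in this hunk, not copied from ORT): new-format encoders expose only the cross K/V outputs, while old-format encoders prepend logits and encoder_hidden_states and emit four present_* tensors per layer.

```cpp
// Sketch of the num_layers deduction for the two encoder export formats.
#include <cassert>
#include <stdexcept>

int DeduceNumLayers(int num_inputs, int num_outputs) {
  if (num_inputs == 2) {
    // New format: outputs are present_key_cross_i / present_value_cross_i only.
    if (num_outputs < 2 || num_outputs % 2 != 0)
      throw std::invalid_argument("expected 2 * layers outputs");
    return num_outputs / 2;
  }
  // Old format: logits, encoder_hidden_states, then four present_* outputs per layer
  // (key/value for self attention plus key/value for cross attention).
  constexpr int first_present_output_index = 2;
  if (num_outputs < 6 || (num_outputs - first_present_output_index) % 4 != 0)
    throw std::invalid_argument("expected 2 + 4 * layers outputs");
  return (num_outputs - first_present_output_index) / 4;
}

int main() {
  assert(DeduceNumLayers(2, 12) == 6);  // new format: 6 layers of cross K/V
  assert(DeduceNumLayers(3, 26) == 6);  // old format: 2 + 4 * 6
  return 0;
}
```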
- ORT_RETURN_IF_ERROR(GetParameters(past_shape, logits_shape, false)); - num_layers = (static_cast(subgraph_outputs.size()) - first_present_output_index_) / 4; - - constexpr auto int32_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32; - constexpr auto float32_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT; - constexpr auto float16_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16; ORT_RETURN_IF(subgraph_inputs[0]->TypeAsProto()->tensor_type().elem_type() != int32_type, "encoder subgraph input 0 (encoder_input_ids) shall have int32 type"); ORT_RETURN_IF(subgraph_inputs[1]->TypeAsProto()->tensor_type().elem_type() != int32_type, "encoder subgraph input 1 (encoder_attention_mask) shall have int32 type"); - ORT_RETURN_IF(subgraph_inputs[2]->TypeAsProto()->tensor_type().elem_type() != int32_type, - "encoder subgraph input 2 (decoder_input_ids) shall have int32 type"); + + if (num_subgraph_inputs == 2) { + ORT_RETURN_IF(num_subgraph_outputs < 2 || num_subgraph_outputs % 2 != 0, + "number of outputs expected to be 2 * layers, got:", num_subgraph_outputs); + + ORT_RETURN_IF(subgraph_outputs[0]->Name() != "present_key_cross_0", + "encoder subgraph output 0 shall be named as present_key_cross_0, got: ", subgraph_outputs[0]->Name()); + ORT_RETURN_IF(subgraph_outputs[1]->Name() != "present_value_cross_0", + "encoder subgraph output 1 shall be named as present_value_cross_0, got: ", subgraph_outputs[1]->Name()); + + // Deduce num_heads and head_size parameters from shape of graph outputs + const ONNX_NAMESPACE::TensorShapeProto* past_shape = subgraph_outputs[0]->Shape(); + const ONNX_NAMESPACE::TensorShapeProto* logits_shape = nullptr; + ORT_RETURN_IF_ERROR(GetParameters(past_shape, logits_shape, false)); + + num_layers = num_subgraph_outputs / 2; + } else { + ORT_RETURN_IF(num_subgraph_outputs < 6 || (num_subgraph_outputs - first_present_output_index_) % 4 != 0, + "number of outputs expected to be 2 + 4 * layers, got:", num_subgraph_outputs); + + ORT_RETURN_IF(subgraph_inputs[2]->Name() != "decoder_input_ids", + "encoder subgraph input 2 shall be named as decoder_input_ids, got: ", subgraph_inputs[2]->Name()); + ORT_RETURN_IF(subgraph_inputs[2]->TypeAsProto()->tensor_type().elem_type() != int32_type, + "encoder subgraph input 2 (decoder_input_ids) shall have int32 type"); + + ORT_RETURN_IF(subgraph_outputs[0]->Name() != "logits", + "encoder subgraph output 0 shall be named as logits, got: ", subgraph_outputs[0]->Name()); + ORT_RETURN_IF(subgraph_outputs[1]->Name() != "encoder_hidden_states", + "encoder subgraph output 1 shall be named encoder_hidden_states, got: ", subgraph_outputs[1]->Name()); + ORT_RETURN_IF(subgraph_outputs[2]->Name() != "present_key_self_0", + "encoder subgraph output 2 shall be named as present_key_self_0, got: ", subgraph_outputs[2]->Name()); + ORT_RETURN_IF(subgraph_outputs[3]->Name() != "present_value_self_0", + "encoder subgraph output 3 shall be named as present_value_self_0, got: ", subgraph_outputs[3]->Name()); + + // Deduce num_heads, head_size and vocab_size from shape of graph outputs + const ONNX_NAMESPACE::TensorShapeProto* past_shape = subgraph_outputs[2]->Shape(); + const ONNX_NAMESPACE::TensorShapeProto* logits_shape = subgraph_outputs[0]->Shape(); + ORT_RETURN_IF_ERROR(GetParameters(past_shape, logits_shape, false)); + + num_layers = (num_subgraph_outputs - first_present_output_index_) / 4; + } auto output_type = subgraph_outputs[0]->TypeAsProto()->tensor_type().elem_type(); 
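GetParameters itself is not part of this diff; based on the shapes documented above (past_*: (B, num_heads, seq_len, head_size); logits, when present: (B, seq_len, vocab_size)), the following is a hedged sketch of the deduction it presumably performs, with a null logits shape standing for the new format.

```cpp
// Hedged sketch only: the real GetParameters body is not shown in this patch.
#include <array>
#include <cassert>
#include <cstdint>
#include <optional>

struct SubgraphParams {
  int64_t num_heads = 0;
  int64_t head_size = 0;
  int64_t vocab_size = 0;  // stays 0 for the new encoder format (no logits output)
};

SubgraphParams DeduceParams(const std::array<int64_t, 4>& past_shape,
                            const std::optional<std::array<int64_t, 3>>& logits_shape) {
  SubgraphParams p;
  p.num_heads = past_shape[1];  // (B, num_heads, seq_len, head_size)
  p.head_size = past_shape[3];
  if (logits_shape) {
    assert((*logits_shape)[2] > 0 && "vocabulary size must be positive");
    p.vocab_size = (*logits_shape)[2];
  }
  return p;
}

int main() {
  SubgraphParams p = DeduceParams({1, 8, 128, 64}, std::nullopt);  // new format
  assert(p.num_heads == 8 && p.head_size == 64 && p.vocab_size == 0);
  return 0;
}
```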
ORT_RETURN_IF(output_type != float32_type && output_type != float16_type, @@ -86,7 +113,7 @@ Status T5EncoderSubgraph::Validate(const std::vector& subgraph_i for (int i = 1; i < num_subgraph_outputs; i++) { ORT_RETURN_IF(subgraph_outputs[i]->TypeAsProto()->tensor_type().elem_type() != output_type, - "encoder subgraph outputs 1, 2, ... shall have same data type"); + "encoder subgraph outputs shall have same data type"); } is_output_float16_ = (output_type == float16_type); @@ -120,7 +147,6 @@ Status T5EncoderSubgraph::CreateInitialFeeds( } ORT_RETURN_IF(cpu_allocator == nullptr, "cpu_allocator shouldn't be nullptr"); - // TODO(tianleiwu): expand the outputs instead of inputs to save computation. OrtValue encoder_input_ids; OrtValue encoder_attention_mask; ORT_RETURN_IF_ERROR(create_encoder_inputs_func(&original_encoder_input_ids, @@ -136,9 +162,10 @@ Status T5EncoderSubgraph::CreateInitialFeeds( AllocatorPtr default_allocator = session_state_->GetAllocator(provider->GetOrtDeviceByMemType(OrtMemTypeDefault)); AllocatorPtr pinned_allocator = session_state_->GetAllocator(provider->GetOrtDeviceByMemType(OrtMemTypeCPU)); const OrtMemoryInfo& location = default_allocator->Info(); + ORT_RETURN_IF_ERROR(add_to_feeds_func( ort_stream, - {encoder_input_ids, encoder_attention_mask, decoder_input_ids}, + num_subgraph_inputs == 2 ? std::initializer_list{encoder_input_ids, encoder_attention_mask} : std::initializer_list{encoder_input_ids, encoder_attention_mask, decoder_input_ids}, feeds, buffer, default_allocator, diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.h b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.h index a79f677f5a043..33fd522bdfd82 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.h +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.h @@ -16,7 +16,11 @@ class T5EncoderSubgraph : public Subgraph { const onnxruntime::Node& node_in, const std::string& attribute_name, const GraphViewer& subgraph_in) : Subgraph(node_in, attribute_name, subgraph_in) { - first_present_output_index_ = 2; + has_logits_output_ = num_subgraph_outputs > 0 && subgraph_output_names[0] == "logits"; + + // Old format: The first output is logits, the second one is encoder_hidden_states. + // New format: No logits and encoder_hidden_states. All outputs are cross. + first_present_output_index_ = HasLogitsOutput() ? 2 : 0; } // Create inputs for first inference of subgraph. 
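The header change above detects the export format from the first output name: the old export puts "logits" first, the new export starts directly with the cross K/V outputs. A tiny self-contained sketch of that detection (names chosen to mirror the diff; not the ORT class):

```cpp
// Sketch of the encoder-format detection added to subgraph_t5_encoder.h.
#include <cassert>
#include <string>
#include <vector>

struct EncoderFormat {
  bool has_logits_output;
  int first_present_output_index;  // 2 after logits + encoder_hidden_states, else 0
};

EncoderFormat DetectEncoderFormat(const std::vector<std::string>& output_names) {
  EncoderFormat f{};
  f.has_logits_output = !output_names.empty() && output_names[0] == "logits";
  f.first_present_output_index = f.has_logits_output ? 2 : 0;
  return f;
}

int main() {
  auto old_fmt = DetectEncoderFormat({"logits", "encoder_hidden_states", "present_key_self_0"});
  auto new_fmt = DetectEncoderFormat({"present_key_cross_0", "present_value_cross_0"});
  assert(old_fmt.has_logits_output && old_fmt.first_present_output_index == 2);
  assert(!new_fmt.has_logits_output && new_fmt.first_present_output_index == 0);
  return 0;
}
```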
@@ -36,11 +40,18 @@ class T5EncoderSubgraph : public Subgraph { Status Validate(const std::vector& subgraph_inputs, const std::vector& subgraph_outputs) override; +#ifdef DEBUG_GENERATION int GetFirstPresentOutputIndex() const { return first_present_output_index_; } +#endif + + bool HasLogitsOutput() const { + return has_logits_output_; + } protected: + bool has_logits_output_; int first_present_output_index_; }; diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 0209183f46425..51311715d3b2a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -512,7 +512,7 @@ Status EfficientAttention( p.seqstart_q_ptr = nullptr; p.seqstart_k_ptr = nullptr; } else { - p.seqlen_k_ptr = const_cast(reinterpret_cast(data.mask_index)); + p.seqlen_k_ptr = reinterpret_cast(data.mask_index); p.seqstart_q_ptr = p.seqlen_k_ptr + parameters.batch_size; p.seqstart_k_ptr = p.seqlen_k_ptr + 2 * parameters.batch_size + 1; } @@ -762,12 +762,8 @@ Status UnfusedAttention( } else { // no mask if (nullptr != data.output_qk) { int64_t qk_size = (int64_t)batch_size * num_heads * sequence_length * total_sequence_length; - if (std::is_same::value) { - cudaMemcpyAsync(data.output_qk, data.scratch, qk_size * sizeof(QK), cudaMemcpyDeviceToDevice, stream); - } else { - ORT_RETURN_IF_ERROR( - (CopyQK(stream, static_cast(qk_size), data.scratch, reinterpret_cast(data.output_qk)))); - } + ORT_RETURN_IF_ERROR( + (CopyQK(stream, static_cast(qk_size), data.scratch, reinterpret_cast(data.output_qk)))); } ORT_RETURN_IF_ERROR( ComputeSoftmax( diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_qk.cu b/onnxruntime/contrib_ops/cuda/bert/attention_qk.cu index 78c407fd3bb3b..3f02a441da73e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_qk.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_qk.cu @@ -24,16 +24,24 @@ __global__ void ConvertAndCopyQK(const int count, const half* input, float* outp } } +template +__global__ void ConvertAndCopyQK(const int count, const T* input, T* output) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < count) { + output[idx] = input[idx]; + } +} + template Status CopyQK(cudaStream_t stream, const int qk_size, const T* input, QK* output) { - const bool half2float = std::is_same::value && std::is_same::value; - const bool float2half = std::is_same::value && std::is_same::value; - ORT_ENFORCE(half2float || float2half); + constexpr const bool half2float = std::is_same::value && std::is_same::value; + constexpr const bool float2half = std::is_same::value && std::is_same::value; + static_assert(half2float || float2half, "This function supports either or "); - int block_size = 256; + constexpr const int block_size = 256; int num_blocks = (qk_size + block_size - 1) / block_size; ConvertAndCopyQK<<>>(qk_size, input, output); @@ -50,6 +58,24 @@ template Status CopyQK(cudaStream_t stream, const half* input, float* output); +template <> +Status CopyQK(cudaStream_t stream, + const int qk_size, + const float* input, + float* output) { + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, qk_size * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + return Status::OK(); +} + +template <> +Status CopyQK(cudaStream_t stream, + const int qk_size, + const half* input, + half* output) { + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, qk_size * sizeof(half), cudaMemcpyDeviceToDevice, stream)); + return Status::OK(); +} + } // namespace cuda } // namespace 
contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 9b3ba73254d73..8d8f735e3ed34 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -222,6 +222,9 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { } p.use_smooth_softmax = params.use_smooth_softmax; + + // local_windows_size in GQA does not include current query token, while windows_size in this kernel includes it. + p.window_size = params.local_window_size + 1; } auto kernel_fn = attention_kernel_batched_impl; diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h index 8dff521da48d1..f35d6c2e6c8dc 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h @@ -174,10 +174,9 @@ struct AttentionKernel { scalar_t* key_ptr = nullptr; // [num_keys, num_heads, head_dim] scalar_t* value_ptr = nullptr; // [num_keys, num_heads, head_dim_value] scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys] - int32_t* seqstart_q_ptr = nullptr; - int32_t* seqstart_k_ptr = nullptr; - - int32_t* seqlen_k_ptr = nullptr; + const int32_t* seqstart_q_ptr = nullptr; + const int32_t* seqstart_k_ptr = nullptr; + const int32_t* seqlen_k_ptr = nullptr; uint32_t causal_diagonal_offset = 0; // Output tensors @@ -187,6 +186,8 @@ struct AttentionKernel { // [num_heads, num_queries] - can be null lse_scalar_t* logsumexp_ptr = nullptr; + int32_t window_size = -1; + // Scale accum_t scale = 0.0; @@ -651,6 +652,12 @@ struct AttentionKernel { XFORMERS_CHECK( p.custom_mask_type < NumCustomMaskTypes, "invalid value for `custom_mask_type`"); + if (p.window_size > 0) { + XFORMERS_CHECK( + p.custom_mask_type == CausalFromTopLeft || + p.custom_mask_type == CausalFromBottomRight, + "invalid value for custom_mask_type"); + } return true; } @@ -726,6 +733,13 @@ struct AttentionKernel { // Iterate through keys for (int32_t iter_key_start = 0; iter_key_start < p.num_keys; iter_key_start += kKeysPerBlock) { + if (p.window_size > 0) { + // don't compute anything if below attention band + if (iter_key_start + kKeysPerBlock < + static_cast(query_start + p.causal_diagonal_offset) - p.window_size) { + continue; + } + } int32_t problem_size_0_m = cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries); int32_t problem_size_0_n = cutlass::fast_min( @@ -894,6 +908,38 @@ struct AttentionKernel { }, [&](int accum_m) {}); } + + // Mask out lower left corner of block if window_size > 0 + // only required if current block intersects with the lower left corner + // block starts at x_lowerleft = iter_key_start // y = query_start + + // kQueriesPerBlock first non masked value at this y is : x_first = + // query_start + kQueriesPerBlock - window_size mask if x_fist > + // x_lowerleft + + if (p.window_size > 0 && + (query_start + p.causal_diagonal_offset + + cutlass::fast_min( + static_cast(kQueriesPerBlock), static_cast(p.num_queries)) - + p.window_size >= + iter_key_start)) { + auto query_start = blockIdx.x * kQueriesPerBlock; + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + int32_t first_col; + const int32_t offset = query_start + p.causal_diagonal_offset - + p.window_size 
- iter_key_start; + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { first_col = accum_m + offset; }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n <= first_col) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + // Update `mi` from accum stored in registers // Also does accum[i] <- exp(accum[i] - mi) iterative_softmax( @@ -1036,9 +1082,18 @@ struct AttentionKernel { } if (!kKeepOutputInRF) { + int first_key = 0; + if (p.window_size > 0) { + first_key = (cutlass::fast_max( + static_cast(query_start + p.causal_diagonal_offset) - + p.window_size + 1, + 0) / + kKeysPerBlock) * + kKeysPerBlock; + } MM1::Mma::drain_cp_asyncs(); DISPATCH_BOOL( - iter_key_start == 0, kIsFirst, ([&] { + iter_key_start == first_key, kIsFirst, ([&] { DISPATCH_BOOL( (iter_key_start + kKeysPerBlock) >= p.num_keys, kIsLast, diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h index 9fe66c6fe992e..287413bf5acde 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h @@ -14,42 +14,39 @@ namespace cuda { constexpr int kEfficientAttentionMaxHeadSize = 1024; struct MemoryEfficientAttentionParams { - int32_t sm; - bool is_half; + int32_t sm = 50; + bool is_half = false; bool is_kv_bsnh = true; - int32_t batch_size; - int32_t num_heads; - int32_t sequence_length; - int32_t kv_sequence_length; - int32_t max_sequence_length; - int32_t qk_head_size; - int32_t v_head_size; - bool causal; - bool use_smooth_softmax; - - float scale; + int32_t batch_size = 0; + int32_t num_heads = 0; + int32_t sequence_length = 0; + int32_t kv_sequence_length = 0; + int32_t max_sequence_length = 0; + int32_t qk_head_size = 0; + int32_t v_head_size = 0; + int32_t local_window_size = -1; + bool causal = false; + bool use_smooth_softmax = false; + bool broadcast_attn_bias_dim_0 = false; + bool broadcast_attn_bias_dim_1 = false; + bool has_custom_right_padding = false; + float scale = 1.0f; float softcap = 0.0; - int32_t* seqstart_q_ptr; - int32_t* seqstart_k_ptr; - int32_t* seqlen_k_ptr; - - const void* query; // [B, S, N, H] - const void* key; // [B, L, N, H], where L is kv_sequence_length - const void* value; // [B, L, N, H_v] - const void* attn_bias; // [B or 1, N or 1, S, L] or null - bool broadcast_attn_bias_dim_0; - bool broadcast_attn_bias_dim_1; - - void* output; // [B, S, N, H_v] - void* workspace; // [B, S, N, H_v] when kNeedsOutputAccumulatorBuffer, nullptr otherwise - cudaStream_t stream; + cudaStream_t stream = nullptr; + const int32_t* seqstart_q_ptr = nullptr; // [B + 1], cumulated sequence lengths of queries + const int32_t* seqstart_k_ptr = nullptr; // [B + 1], cumulated sequence lengths of keys + const int32_t* seqlen_k_ptr = nullptr; // [B], sequence lengths of keys + const void* query = nullptr; // [B, S, N, H] + const void* key = nullptr; // [B, L, N, H], where L is kv_sequence_length + const void* value = nullptr; // [B, L, N, H_v] + const void* attn_bias = nullptr; // [B or 1, N or 1, S, L] or null + void* workspace = nullptr; // [B, S, N, H_v] when kNeedsOutputAccumulatorBuffer, nullptr otherwise + void* output = nullptr; // [B, S, N, H_v] static bool need_workspace(size_t v_head_size, bool is_float) { return (v_head_size > 128 && !is_float); } - - bool has_custom_right_padding = false; }; void 
run_memory_efficient_attention(const MemoryEfficientAttentionParams& params); diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc index 8b63b363d8863..9f1bc46ee297d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc @@ -156,13 +156,8 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { bool use_memory_efficient_attention = !use_flash_attention && !disable_memory_efficient_attention_ && - local_window_size_ == -1 && - (sizeof(T) == 2 || parameters.sequence_length >= this->kernel_options_->MinSeqLenForEfficientAttentionFp32()) && has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.head_size); - if (!use_flash_attention && !use_memory_efficient_attention && local_window_size_ != -1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Local attention UNSUPPORTED for sm < 80 on CUDA."); - } + // allocate buffers size_t kv_buffer_bytes = 0; // need a buffer if we must ungroup kv diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu index dbbee87238d0c..2d1b49033003d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu @@ -635,6 +635,7 @@ Status EfficientAttention( p.stream = stream; p.has_custom_right_padding = true; p.use_smooth_softmax = parameters.use_smooth_softmax; + p.local_window_size = parameters.local_window_size; run_memory_efficient_attention(p); DUMP_TENSOR("efficient attention output", data.output, batch_size, sequence_length, num_heads, head_size); diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu index f3b9fd310f46f..846d2be7bf2e1 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu @@ -698,8 +698,8 @@ Status FusedAttentionCutlass( p.scale = parameters.scale == 0.0f ? 1.f / sqrt(static_cast(qk_head_size)) : parameters.scale; p.seqlen_k_ptr = nullptr; - p.seqstart_q_ptr = const_cast(data.cumulative_sequence_length); - p.seqstart_k_ptr = const_cast(data.cumulative_sequence_length); + p.seqstart_q_ptr = data.cumulative_sequence_length; + p.seqstart_k_ptr = data.cumulative_sequence_length; p.query = data.no_qkv_workspace ? data.query : data.workspace; p.key = data.no_qkv_workspace ? data.key : (data.workspace + elements_qk); p.value = data.no_qkv_workspace ? data.value : (data.workspace + elements_qk + elements_qk); diff --git a/onnxruntime/contrib_ops/webgpu/bert/bias_add.cc b/onnxruntime/contrib_ops/webgpu/bert/bias_add.cc index 65c14e8cb0bdd..e822f8764b63f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/bias_add.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/bias_add.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
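As a reading aid for the window-masking hunks above (not part of the diff): the cutlass kernel's `window_size` counts the current query token, which is why GQA's `local_window_size` is forwarded as `local_window_size + 1`. The following is a minimal C++ restatement of the combined causal + sliding-window predicate those hunks implement; the helper name `IsKeyVisible` and its standalone form are illustrative, not code from the diff.

```cpp
// Sketch of the masking rule applied per (query, key) pair in the hunks above.
// window_size includes the current token (GQA's local_window_size + 1);
// a non-positive window_size disables the band entirely.
inline bool IsKeyVisible(int query, int key, int causal_diagonal_offset, int window_size) {
  const int diagonal = query + causal_diagonal_offset;  // rightmost key this query may attend to
  if (key > diagonal) {
    return false;  // masked by the causal constraint
  }
  if (window_size > 0 && key <= diagonal - window_size) {
    return false;  // below the attention band (the "lower left corner" masking)
  }
  return true;
}
```

In the kernel itself, key blocks that fall entirely below the band are skipped outright (the added `continue` in the key loop), and `first_key` is advanced so the `kIsFirst` dispatch still initializes the accumulator on the first block that actually contains a visible key.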
#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_utils.h" #include "core/providers/webgpu/webgpu_supported_types.h" #include "contrib_ops/webgpu/bert/bias_add.h" #include "contrib_ops/webgpu/webgpu_contrib_kernels.h" @@ -34,15 +35,6 @@ Status BiasAddProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } -static int64_t GetMaxComponents(int64_t size) { - if (size % 4 == 0) { - return 4; - } else if (size % 2 == 0) { - return 2; - } - return 1; -} - Status BiasAdd::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { const auto* input = context.Input(0); const auto* bias = context.Input(1); diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc index b33084d60cec3..8f316cfae80e9 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -100,7 +100,7 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con program .CacheHint(interleaved_) - .AddInputs({{input, ProgramTensorMetadataDependency::Rank}, + .AddInputs({{input, ProgramTensorMetadataDependency::TypeAndRank}, {position_ids, ProgramTensorMetadataDependency::Rank}, {cos_cache, ProgramTensorMetadataDependency::Rank}, {sin_cache, ProgramTensorMetadataDependency::Rank}}) diff --git a/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc index d5d4632c01e2a..61f701f7911a7 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_utils.h" #include "core/providers/webgpu/webgpu_supported_types.h" #include "contrib_ops/webgpu/webgpu_contrib_kernels.h" #include "contrib_ops/webgpu/bert/skip_layer_norm.h" @@ -10,28 +11,6 @@ namespace onnxruntime { namespace contrib { namespace webgpu { -static uint32_t GetMaxComponents(int size) { - if (size % 4 == 0) { - return 4; - } else if (size % 2 == 0) { - return 2; - } - return 1; -} - -static std::string SumVector(std::string x, int components) { - switch (components) { - case 1: - return x; - case 2: - return "(" + x + ".x + " + x + ".y" + ")"; - case 4: - return "(" + x + ".x + " + x + ".y + " + x + ".w + " + x + ".z" + ")"; - default: - ORT_THROW("Unsupported number of components: ", components); - } -} - Status SkipLayerNormProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& x = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("skip", ShaderUsage::UseUniform); diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index a25d8e68f11cd..6d2370db853ee 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -7,6 +7,39 @@ namespace onnxruntime { namespace contrib { namespace webgpu { +namespace { + +constexpr std::string_view commonFunctions = R"ADDNL_FN( + fn DequantizedFrom4BitsTo8Bits(in: vec2) -> vec4 + { + var out = vec4(0); + var value_lower = vec4(unpack4xU8(in[0] & 0x0F0F0F0Fu)) - vec4(8); + var value_upper = vec4(unpack4xU8((in[0] >> 4) & 0x0F0F0F0Fu)) - vec4(8); + out[0] = pack4xI8(vec4(value_lower[0], value_upper[0], 
value_lower[1], value_upper[1])); + out[1] = pack4xI8(vec4(value_lower[2], value_upper[2], value_lower[3], value_upper[3])); + value_lower = vec4(unpack4xU8(in[1] & 0x0F0F0F0Fu)) - vec4(8); + value_upper = vec4(unpack4xU8((in[1] >> 4) & 0x0F0F0F0Fu)) - vec4(8); + out[2] = pack4xI8(vec4(value_lower[0], value_upper[0], value_lower[1], value_upper[1])); + out[3] = pack4xI8(vec4(value_lower[2], value_upper[2], value_lower[3], value_upper[3])); + return out; + } + + // Scaled dot product of 8 packed unsigned integers. + fn SDP8AI(a1:vec4, b1:vec4, a2:vec4, b2:vec4, scale:output_element_t) -> output_element_t + { + var local_sum = dot4I8Packed(a1[0], b1[0]); + local_sum += dot4I8Packed(a1[1], b1[1]); + local_sum += dot4I8Packed(a1[2], b1[2]); + local_sum += dot4I8Packed(a1[3], b1[3]); + local_sum += dot4I8Packed(a2[0], b2[0]); + local_sum += dot4I8Packed(a2[1], b2[1]); + local_sum += dot4I8Packed(a2[2], b2[2]); + local_sum += dot4I8Packed(a2[3], b2[3]); + return output_element_t(local_sum) * scale; + } + )ADDNL_FN"; + +} // namespace Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); @@ -65,7 +98,8 @@ Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { // this shader require A to be int8 quantized with block size 64. B is regular // matmulnbits input with block size 32. - shader.AdditionalImplementation() << " const block_size = " << block_size_ << ";"; + shader.AdditionalImplementation() << commonFunctions + << " const block_size = " << block_size_ << ";"; shader.AdditionalImplementation() << R"ADDNL_FN( const tile_size = 64; @@ -105,34 +139,13 @@ Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { } let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; - var b_value_lower = vec4(unpack4xU8(b_value[0] & 0x0F0F0F0Fu)) - vec4(8); - var b_value_upper = vec4(unpack4xU8((b_value[0] >> 4) & 0x0F0F0F0Fu)) - vec4(8); - tile_B[col][row][0] = pack4xI8(vec4(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1])); - tile_B[col][row][1] = pack4xI8(vec4(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3])); - b_value_lower = vec4(unpack4xU8(b_value[1] & 0x0F0F0F0Fu)) - vec4(8); - b_value_upper = vec4(unpack4xU8((b_value[1] >> 4) & 0x0F0F0F0Fu)) - vec4(8); - tile_B[col][row][2] = pack4xI8(vec4(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1])); - tile_B[col][row][3] = pack4xI8(vec4(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3])); + tile_B[col][row] = DequantizedFrom4BitsTo8Bits(b_value); if (col == 0) { // kidx_v - each kidx_v covers 16 values of k scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + kidx_v/(block_size/16)]; } } - - // Scaled dot product of 8 packed unsigned integers. 
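For clarity, a scalar C++ restatement (illustrative only) of the two WGSL helpers hoisted into `commonFunctions` above. The function names `DequantizeU32` and `ScaledDotProduct32` are hypothetical; the shader versions operate on packed `vec4<u32>` values and use `dot4I8Packed` in hardware, while this sketch only pins down the arithmetic.

```cpp
#include <array>
#include <cstdint>

// Eight 4-bit weights packed into one u32 are widened to signed int8 by
// subtracting the implicit zero point of 8; lower and upper nibbles of each
// byte are interleaved, matching unpack4xU8 + pack4xI8 in the shader above.
inline std::array<int8_t, 8> DequantizeU32(uint32_t packed) {
  std::array<int8_t, 8> out{};
  for (int byte = 0; byte < 4; ++byte) {
    const uint32_t b = (packed >> (8 * byte)) & 0xFFu;
    out[2 * byte] = static_cast<int8_t>(static_cast<int>(b & 0x0Fu) - 8);   // lower nibble
    out[2 * byte + 1] = static_cast<int8_t>(static_cast<int>(b >> 4) - 8);  // upper nibble
  }
  return out;
}

// SDP8AI: dot product of 32 int8 pairs (two vec4<u32> per operand, four
// packed int8 values per u32), scaled by the dequantization scale.
inline float ScaledDotProduct32(const std::array<int8_t, 32>& a,
                                const std::array<int8_t, 32>& b,
                                float scale) {
  int32_t sum = 0;
  for (int i = 0; i < 32; ++i) {
    sum += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
  }
  return static_cast<float>(sum) * scale;
}
```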
- fn SDP8AI(a1:vec4, b1:vec4, a2:vec4, b2:vec4, scale:output_element_t) -> output_element_t - { - var local_sum = dot4I8Packed(a1[0], b1[0]); - local_sum += dot4I8Packed(a1[1], b1[1]); - local_sum += dot4I8Packed(a1[2], b1[2]); - local_sum += dot4I8Packed(a1[3], b1[3]); - local_sum += dot4I8Packed(a2[0], b2[0]); - local_sum += dot4I8Packed(a2[1], b2[1]); - local_sum += dot4I8Packed(a2[2], b2[2]); - local_sum += dot4I8Packed(a2[3], b2[3]); - return output_element_t(local_sum) * scale; - } )ADDNL_FN"; shader.MainFunctionBody() << R"MAIN_FN( @@ -249,11 +262,122 @@ Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } +// scale_A components = 1, b components = 4, output components = 1 +Status DP4AMatMulNBitsSmallMProgram::GenerateShaderCode(ShaderHelper& shader) const { + shader.AddInput("input_a", ShaderUsage::UseUniform); + shader.AddInput("scales_a", ShaderUsage::UseUniform); + shader.AddInput("input_b", ShaderUsage::UseUniform); + shader.AddInput("scales_b", ShaderUsage::UseUniform); + shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + // This algorithm works to compute dot product of k parallelly, by processing k at each step amongst tile_size_k_vec threads, + // and utilizing the remaining threads in the workgroup to process additional rows of b in parallel (such that the values in shared memory for A can be reused). + // For each load of k, the tile_size_k_vec threads also reload B tile_size/num_concurrent_b_rows times to compute partial dot products of other B rows + // in order to complete all tile_size b rows in this workgroup and also reusing the loaded in register values of a. + + // 1. Each workgroup handles tile_size_k_vec (16) * k_vectorization_in_b (32) columns (total 512) and num_concurrent_b_rows of matrix B at a time, + // iterating over the columns to compute a partial dot product. + // 2. Uses vec4 vectorization where each K represents 32 elements of matrix B + constexpr uint32_t tile_size_k_vec = 16; + + // 1. Workgroup Responsibility: + // - Processes one row of matrix A + // - Handles tile_size rows of matrix B + // + // 2. Computation Process: + // - Reads [tile_size][tile_size_k_vec] block of B data at a time + // - Each thread within workgroup computes dot products of 32 A*B elements since each K represents 32 elements of matrix B + // - Stores intermediate results in shared memory (inter_results) + // - Iterates through columns accumulating results in inter_results + // - Performs final reduction sum in inter_results for output + shader.AdditionalImplementation() << "const tile_size = " << tile_size_ << "u;\n" + << "const tile_size_k_vec = " << tile_size_k_vec << "u;\n" + // sub_tile_size is the number of concurrent b rows processed by the workgroup. + << "const sub_tile_size = " << WorkgroupSizeX() / tile_size_k_vec << "u;\n"; + shader.AdditionalImplementation() << commonFunctions + << R"ADDNL_FN( + // Shared memory + // Need 2 * tile_size_k_vec (32) to store a tile_A since b is quantized as 4 bits and a is quantized as 8 bits. + var tile_A : array, 32>; + // Need 4 scales value since each tile_A includes 512 (4x4x32) scalars and the block_size is 128. 
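The tile constants described in the comments above work out as follows. This is a small compile-time sketch, not shader code: the values are taken from this diff (workgroup size 128, `tile_size_k_vec` 16, `tile_size` 32 at dispatch), and the A-scale block of 128 is inferred from the `scales_a` indexing in `loadSHMA`.

```cpp
#include <cstdint>

constexpr uint32_t kWorkgroupSize = 128;  // threads per workgroup (set at dispatch)
constexpr uint32_t kTileSizeKVec = 16;    // tile_size_k_vec
constexpr uint32_t kTileSize = 32;        // rows of B handled per workgroup
constexpr uint32_t kSubTileSize = kWorkgroupSize / kTileSizeKVec;  // concurrent B rows

static_assert(kSubTileSize == 8, "8 rows of B are processed concurrently");
static_assert(kTileSize / kSubTileSize == 4, "the row_offset loop runs 4 times per K step");
// Each kidx_v step is one vec4<u32> of B = 32 weights, so one K iteration
// across tile_size_k_vec threads spans 16 * 32 = 512 scalars of K.
static_assert(kTileSizeKVec * 32 == 512, "one K iteration spans 512 scalars");
// tile_A holds 2 * tile_size_k_vec vec4<u32> entries, i.e. 32 * 16 int8 values,
// covering the same 512 scalars of A.
static_assert(2 * kTileSizeKVec * 16 == 512, "tile_A covers the same 512 scalars");
// With an A-quantization block of 128, those 512 scalars need 4 scales (scale_A).
static_assert(512 / 128 == 4, "scale_A needs 4 entries per tile");
```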
+ var scale_A : array; + var inter_results: array, tile_size>; + fn loadSHMA(a_global: u32, kidx_v: u32, col: u32) + { + let k_offset = kidx_v + col; + if (k_offset >= uniforms.K16) { + return; + } + + tile_A[col] = input_a[a_global*uniforms.K16+k_offset]; + if (col < 4) + { + // kidx_v - covers 16 values of k in input_a + scale_A[col] = scales_a[a_global*(uniforms.K/128) + kidx_v/8 + col]; + } + } + )ADDNL_FN"; + + shader.MainFunctionBody() << R"MAIN_FN( + let a_global = u32(workgroup_idx / uniforms.num_N_tile); + let b_global_base = (workgroup_idx % uniforms.num_N_tile) * tile_size; + // Handle each workgroup threads as a block of [sub_tile_size][tile_size_k_vec] + let local_col = local_idx % tile_size_k_vec; + let local_row = local_idx / tile_size_k_vec; + for (var kidx_v:u32 = 0; kidx_v < uniforms.K32; kidx_v += tile_size_k_vec) + { + // Load Phase: Populate shared memory for the workgroup. + if (local_idx < 32) + { + loadSHMA(a_global, kidx_v * 2, local_idx); + } + workgroupBarrier(); + var own_a: vec4 = tile_A[local_col * 2]; + var own_a1: vec4 = tile_A[local_col * 2 + 1]; + var own_scale_a = scale_A[local_col / 4]; + var own_b = vec4(0); + var own_b1 = vec4(0); + let k_offset = kidx_v + local_col; + // calculate intermediate results into inter_results. + for (var row_offset = 0u; row_offset < tile_size; row_offset += sub_tile_size) { + let b_global = b_global_base + row_offset + local_row; + if (b_global < uniforms.N && k_offset < uniforms.K32) + { + let b_offset = b_global * uniforms.K32 + k_offset; + let b_value = input_b[b_offset]; + own_b = DequantizedFrom4BitsTo8Bits(b_value.xy); + own_b1 = DequantizedFrom4BitsTo8Bits(b_value.zw); + + // k_offset - covers 32 values of k in input_b + let own_scale_b = scales_b[b_global * uniforms.K / uniforms.block_size + k_offset * 32 / uniforms.block_size]; + inter_results[row_offset + local_row][local_col] += SDP8AI(own_a, own_b, own_a1, own_b1, own_scale_a * own_scale_b); + } + } + workgroupBarrier(); + } + + if (local_idx < tile_size) { + // Do reduce sum to get final output. 
+ var output_value = output_element_t(0); + for (var b = 0u; b < tile_size_k_vec; b++) { + output_value += inter_results[local_idx][b]; + } + let b_global = b_global_base + local_idx; + let output_idx = a_global * uniforms.N + b_global; + if (b_global < uniforms.N) { + output[output_idx] = output_value; + } + } + )MAIN_FN"; + + return Status::OK(); +} + Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, uint32_t M, uint32_t N, uint32_t K, uint32_t block_size, + uint32_t min_M_for_tile_optimization, onnxruntime::webgpu::ComputeContext& context, Tensor* y) { constexpr uint32_t kVec4Components = 4; @@ -273,6 +397,21 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}}); ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); + if (M < min_M_for_tile_optimization) { + constexpr uint32_t kTileSize = 32; + DP4AMatMulNBitsSmallMProgram mul_program{kTileSize}; + uint32_t num_N_tile = (N + kTileSize - 1) / kTileSize; + mul_program.SetWorkgroupSize(128); + mul_program.SetDispatchGroupSize(M * num_N_tile); + mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}, + {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1}, + {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components * kU32Components)}, + {scales, ProgramTensorMetadataDependency::TypeAndRank, 1}}) + .AddUniformVariables({M, N, K, K / 16, K / 32, block_size, num_N_tile}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, 1}); + return context.RunProgram(mul_program); + } + constexpr uint32_t kTileSize = 64; TensorShape reshaped_y_shape{1, M, N / kVec4Components}; uint32_t num_M_tile = (M + kTileSize - 1) / kTileSize; diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h index f0157ca3e8c97..67e2e7d66e83a 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h @@ -34,11 +34,29 @@ class DP4AMatMulNBitsProgram final : public Program { uint32_t block_size_; }; +class DP4AMatMulNBitsSmallMProgram final : public Program { + public: + DP4AMatMulNBitsSmallMProgram(uint32_t tile_size) : Program{"DP4AMatMulNBitsSmallMProgram"}, tile_size_(tile_size) {} + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"M", ProgramUniformVariableDataType::Uint32}, + {"N", ProgramUniformVariableDataType::Uint32}, + {"K", ProgramUniformVariableDataType::Uint32}, + {"K16", ProgramUniformVariableDataType::Uint32}, + {"K32", ProgramUniformVariableDataType::Uint32}, + {"block_size", ProgramUniformVariableDataType::Uint32}, + {"num_N_tile", ProgramUniformVariableDataType::Uint32}); + + private: + uint32_t tile_size_; +}; + Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, uint32_t M, uint32_t N, uint32_t K, uint32_t block_size, + uint32_t min_M_for_tile_optimization, onnxruntime::webgpu::ComputeContext& context, Tensor* y); diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index cce10a59fbd4b..be105a0fd4374 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -10,23 +10,13 @@ #include 
"core/providers/cpu/math/matmul_helper.h" #include "core/providers/webgpu/shader_helper.h" #include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/webgpu_utils.h" namespace onnxruntime { namespace contrib { namespace webgpu { namespace { -// Put it to a common place? -uint32_t GetMaxComponents(uint32_t size) { - // we cannot use vec3 type since it has alignment of 16 bytes - if (size % 4 == 0) { - return 4; - } else if (size % 2 == 0) { - return 2; - } - - return 1; -} std::string QuantizedDataType(int components) { switch (components) { @@ -574,9 +564,9 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context return ApplySubgroupMatrixMatMulNBits(a, b, scales, M, N, K, context, y); } - if (M >= kMinMForTileOptimization && - CanApplyDP4AMatrixMatMulNBits(context, accuracy_level_, block_size, batch_count, N, K, components_a, has_zero_points)) { - return ApplyDP4AMatrixMatMulNBits(a, b, scales, M, N, K, block_size, context, y); + // On FP32 only GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M. + if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType()) && CanApplyDP4AMatrixMatMulNBits(context, accuracy_level_, block_size, batch_count, N, K, components_a, has_zero_points)) { + return ApplyDP4AMatrixMatMulNBits(a, b, scales, M, N, K, block_size, kMinMForTileOptimization, context, y); } // TODO: Support output_number > 1. Some cases are failed when output_number > 1. diff --git a/onnxruntime/core/framework/model_metadef_id_generator.cc b/onnxruntime/core/framework/model_metadef_id_generator.cc index 4a35052d159a0..2d55aa8360bd2 100644 --- a/onnxruntime/core/framework/model_metadef_id_generator.cc +++ b/onnxruntime/core/framework/model_metadef_id_generator.cc @@ -28,7 +28,7 @@ int ModelMetadefIdGenerator::GenerateId(const onnxruntime::GraphViewer& graph_vi // hash the bytes in the Graph instance. we can't just use the address as a new Graph instance may use // the same memory (unit tests prove this can occur). the raw bytes of the Graph instance should be a unique // fingerprint for the instance that can use used as the key to the hash of the model path/contents. 
- MurmurHash3::x86_128(&main_graph, gsl::narrow_cast(sizeof(Graph)), instance_hash[0], &instance_hash); + MurmurHash3::x86_128(&main_graph, sizeof(Graph), instance_hash[0], &instance_hash); HashValue graph_instance_hash = instance_hash[0] | (uint64_t(instance_hash[1]) << 32); // if we've already hashed this main graph instance use the cached value @@ -42,10 +42,10 @@ int ModelMetadefIdGenerator::GenerateId(const onnxruntime::GraphViewer& graph_vi // this may not be available if the model was loaded from a stream or in-memory bytes const auto model_path_str = main_graph.ModelPath().string(); if (!model_path_str.empty()) { - MurmurHash3::x86_128(model_path_str.data(), gsl::narrow_cast(model_path_str.size()), hash[0], &hash); + MurmurHash3::x86_128(model_path_str.data(), model_path_str.size(), hash[0], &hash); } else { auto hash_str = [&hash](const std::string& str) { - MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); + MurmurHash3::x86_128(str.data(), str.size(), hash[0], &hash); }; // fingerprint the main graph by hashing graph inputs and the ordered outputs from each node diff --git a/onnxruntime/core/framework/murmurhash3.cc b/onnxruntime/core/framework/murmurhash3.cc index 802f0a4c58a6d..c984767932a3b 100644 --- a/onnxruntime/core/framework/murmurhash3.cc +++ b/onnxruntime/core/framework/murmurhash3.cc @@ -59,7 +59,7 @@ inline uint64_t rotl64(uint64_t x, int8_t r) { // // Changes to support big-endian from https://github.com/explosion/murmurhash/pull/27/ // were manually applied to original murmurhash3 source code. -ORT_FORCEINLINE uint32_t getblock32(const uint32_t* p, int i) { +ORT_FORCEINLINE uint32_t getblock32(const uint32_t* p, ptrdiff_t i) { if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { return p[i]; } else { @@ -71,7 +71,7 @@ ORT_FORCEINLINE uint32_t getblock32(const uint32_t* p, int i) { } } -ORT_FORCEINLINE uint64_t getblock64(const uint64_t* p, int i) { +ORT_FORCEINLINE uint64_t getblock64(const uint64_t* p, ptrdiff_t i) { if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { return p[i]; } else { @@ -115,10 +115,10 @@ ORT_FORCEINLINE constexpr uint64_t fmix64(uint64_t k) { //----------------------------------------------------------------------------- namespace onnxruntime { -void MurmurHash3::x86_32(const void* key, int len, +void MurmurHash3::x86_32(const void* key, size_t len, uint32_t seed, void* out) { const uint8_t* data = (const uint8_t*)key; - const int nblocks = len / 4; + const auto nblocks = static_cast(len / 4U); uint32_t h1 = seed; @@ -128,9 +128,9 @@ void MurmurHash3::x86_32(const void* key, int len, //---------- // body - const uint32_t* blocks = (const uint32_t*)(data + static_cast(nblocks) * 4); + const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4); - for (int i = -nblocks; i; i++) { + for (auto i = -nblocks; i; i++) { uint32_t k1 = getblock32(blocks, i); k1 *= c1; @@ -145,7 +145,7 @@ void MurmurHash3::x86_32(const void* key, int len, //---------- // tail - const uint8_t* tail = (const uint8_t*)(data + static_cast(nblocks) * 4); + const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); uint32_t k1 = 0; @@ -176,9 +176,9 @@ void MurmurHash3::x86_32(const void* key, int len, //----------------------------------------------------------------------------- -void MurmurHash3::x86_128(const void* key, int len, uint32_t seed, void* out) { +void MurmurHash3::x86_128(const void* key, size_t len, uint32_t seed, void* out) { const uint8_t* data = (const uint8_t*)key; - 
const int nblocks = len / 16; + const auto nblocks = static_cast(len / 16U); uint32_t h1 = seed; uint32_t h2 = seed; @@ -193,9 +193,9 @@ void MurmurHash3::x86_128(const void* key, int len, uint32_t seed, void* out) { //---------- // body - const uint32_t* blocks = (const uint32_t*)(data + static_cast(nblocks) * 16); + const uint32_t* blocks = (const uint32_t*)(data + nblocks * 16); - for (int i = -nblocks; i; i++) { + for (auto i = -nblocks; i; i++) { uint32_t k1 = getblock32(blocks, i * 4 + 0); uint32_t k2 = getblock32(blocks, i * 4 + 1); uint32_t k3 = getblock32(blocks, i * 4 + 2); @@ -241,7 +241,7 @@ void MurmurHash3::x86_128(const void* key, int len, uint32_t seed, void* out) { //---------- // tail - const uint8_t* tail = (const uint8_t*)(data + static_cast(nblocks) * 16); + const uint8_t* tail = (const uint8_t*)(data + nblocks * 16); uint32_t k1 = 0; uint32_t k2 = 0; diff --git a/onnxruntime/core/framework/murmurhash3.h b/onnxruntime/core/framework/murmurhash3.h index ab86a3e591adf..ddba725bb2a37 100644 --- a/onnxruntime/core/framework/murmurhash3.h +++ b/onnxruntime/core/framework/murmurhash3.h @@ -4,13 +4,14 @@ #pragma once #include +#include namespace onnxruntime { struct MurmurHash3 { // generate 32-bit hash from input and write to 'out' - static void x86_32(const void* key, int len, uint32_t seed, void* out); + static void x86_32(const void* key, size_t len, uint32_t seed, void* out); // generate 128-bit hash from input and write to 'out'. - static void x86_128(const void* key, int len, uint32_t seed, void* out); + static void x86_128(const void* key, size_t len, uint32_t seed, void* out); }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights.cc b/onnxruntime/core/framework/prepacked_weights.cc index 6aee164dcf104..460b43f2888ab 100644 --- a/onnxruntime/core/framework/prepacked_weights.cc +++ b/onnxruntime/core/framework/prepacked_weights.cc @@ -11,14 +11,14 @@ HashValue PrePackedWeights::GetHash() const { uint32_t hash[4] = {0, 0, 0, 0}; - auto hash_int8_t_buffer = [&hash](void* data, int len) { MurmurHash3::x86_128(data, len, hash[0], &hash); }; + auto hash_int8_t_buffer = [&hash](void* data, size_t len) { MurmurHash3::x86_128(data, len, hash[0], &hash); }; ORT_ENFORCE(buffers_.size() == buffer_sizes_.size()); for (size_t iter = 0; iter < buffers_.size(); ++iter) { // some pre-packed buffers may be null if they were just "place-holders" occupying an index // in the "buffers_" vector if (buffers_[iter].get() != nullptr) { - hash_int8_t_buffer(buffers_[iter].get(), static_cast(buffer_sizes_[iter])); + hash_int8_t_buffer(buffers_[iter].get(), buffer_sizes_[iter]); } } diff --git a/onnxruntime/core/framework/resource_accountant.cc b/onnxruntime/core/framework/resource_accountant.cc index b1bf9aa6d120b..0665cc1951e60 100644 --- a/onnxruntime/core/framework/resource_accountant.cc +++ b/onnxruntime/core/framework/resource_accountant.cc @@ -204,7 +204,7 @@ std::string IResourceAccountant::MakeUniqueNodeName(const Node& node) { uint32_t hash[4] = {0, 0, 0, 0}; auto hash_str = [&hash](const std::string& str) { - MurmurHash3::x86_128(str.data(), narrow(str.size()), hash[0], &hash); + MurmurHash3::x86_128(str.data(), str.size(), hash[0], &hash); }; const auto& node_name = (node.Name().empty()) ? 
node.OpType() : node.Name(); diff --git a/onnxruntime/core/mlas/inc/mlas_qnbit.h b/onnxruntime/core/mlas/inc/mlas_qnbit.h index 9608644a22523..3627989609737 100644 --- a/onnxruntime/core/mlas/inc/mlas_qnbit.h +++ b/onnxruntime/core/mlas/inc/mlas_qnbit.h @@ -123,6 +123,7 @@ MlasIsQNBitGemmAvailable( * @param[in] BatchN number of batches * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) * @param[in] BlkLen number of quantized values per block + * @param[in] HasZeroPoint whether zero points are provided * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ size_t MLASCALL @@ -133,6 +134,7 @@ MlasQNBitGemmBatchWorkspaceSize( size_t BatchN, size_t BlkBitWidth, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); @@ -147,6 +149,7 @@ MlasQNBitGemmBatchWorkspaceSize( * @param[in] K column size of matrix A and row size of matrix B * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) * @param[in] BlkLen number of quantized values per block + * @param[in] HasZeroPoint whether zero points are provided * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ size_t MLASCALL @@ -155,6 +158,7 @@ MlasQNBitGemmPackQuantBDataSize( size_t K, size_t BlkBitWidth, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); @@ -181,7 +185,7 @@ MlasQNBitGemmPackQuantBDataSize( * @param[in] QuantBData quantized B data * @param[in] PackedQuantBDataAndOrBlkSum buffer to store packed quantized B data and/or BlkSum * @param[in] QuantBScale quantized B scale - * @param[in] has_zp_input whether QuantBZeroPoint is provided + * @param[in] HasZeroPoint whether QuantBZeroPoint is provided * @param[in] QuantBZeroPoint quantized B zero point * @param[in] ThreadPool thread pool to use (no parallel if nullptr) */ @@ -195,7 +199,25 @@ MlasQNBitGemmPackQuantBData( const void* QuantBData, void* PackedQuantBDataAndOrBlkSum, const void* QuantBScale, - bool has_zp_input, + bool HasZeroPoint, const void* QuantBZeroPoint, MLAS_THREADPOOL* ThreadPool ); + +/** + * @brief Returns true if scales are packed when calling MlasQNBitGemmPackQuantBData the first time. 
+ * + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) + * @param[in] BlkLen number of quantized values per block + * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) + * @param[in] HasZeroPoint whether QuantBZeroPoint is provided + */ +bool MLASCALL +MlasQNBitGemmScalesPacked( + size_t K, + size_t BlkBitWidth, + size_t BlkLen, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + bool HasZeroPoint +); diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp new file mode 100644 index 0000000000000..fdada83cc6582 --- /dev/null +++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp @@ -0,0 +1,81 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: MIT +// + +#include "kai_ukernel_interface.h" +#include "mlasi.h" + +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm.h" + +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod = + {kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod}; + +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod = + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod}; + +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod = + 
{kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod}; + +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm = + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm, + kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm}; + +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() { + if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) { + return kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm; + } else { + return kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod; + } +} + +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemvUKernel() { + if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) { + return kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod; + } else { + return kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod; + } +} diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.h b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h new file mode 100644 index 0000000000000..1a6f111d1c794 --- /dev/null +++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h @@ -0,0 +1,12 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp_qsi4c32p_interface.h" + +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel(); +const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemvUKernel(); diff --git a/onnxruntime/core/mlas/lib/qnbitgemm.cpp b/onnxruntime/core/mlas/lib/qnbitgemm.cpp index f064a8e1d6a78..eafe91575c528 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm.cpp +++ b/onnxruntime/core/mlas/lib/qnbitgemm.cpp @@ -91,6 +91,7 @@ MlasIsQNBitGemmAvailable( } case SQNBitGemmVariant_BitWidth4_CompInt8: { // SQ4BitGemmKernel_BlkSum_CompInt8 return + (Dispatch->SQ4BitGemmKernel_Packed_CompInt8 != nullptr && 
Dispatch->QuantizeA_Packed_CompInt8 != nullptr) || (Dispatch->SQ4BitGemmKernel_CompInt8 != nullptr && Dispatch->QuantizeARow_CompInt8 != nullptr) || (Dispatch->SQ4BitGemmKernel_BlkSum_CompInt8 != nullptr && Dispatch->QuantizeARowComputeBlkSum_CompInt8 != nullptr); } @@ -110,6 +111,7 @@ QNBitGemmPerGemmWorkspaceSize( size_t K, size_t BlkBitWidth, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { @@ -119,7 +121,7 @@ QNBitGemmPerGemmWorkspaceSize( } if (BlkBitWidth == 4 && Dispatch->Q4BitGemmPerGemmWorkspaceSize != nullptr) { - return Dispatch->Q4BitGemmPerGemmWorkspaceSize(M, N, K, BlkLen, ComputeType); + return Dispatch->Q4BitGemmPerGemmWorkspaceSize(M, N, K, BlkLen, HasZeroPoint, ComputeType); } return 0; @@ -151,10 +153,11 @@ QNBitGemmPerGemmWorkspaceStride( size_t K, size_t BlkBitWidth, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const auto Size = QNBitGemmPerGemmWorkspaceSize(M, N, K, BlkBitWidth, BlkLen, ComputeType); + const auto Size = QNBitGemmPerGemmWorkspaceSize(M, N, K, BlkBitWidth, BlkLen, HasZeroPoint, ComputeType); const auto Alignment = QNBitGemmPerGemmWorkspaceAlignment(BlkBitWidth, BlkLen, ComputeType); return MlasDivRoundup(Size, Alignment) * Alignment; } @@ -169,10 +172,12 @@ MlasQNBitGemmBatchWorkspaceSize( size_t BatchN, size_t BlkBitWidth, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const size_t PerGemmWorkspaceStride = QNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, ComputeType); + const size_t PerGemmWorkspaceStride = + QNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, HasZeroPoint, ComputeType); if (PerGemmWorkspaceStride == 0) { return 0; } @@ -190,6 +195,7 @@ MlasQNBitGemmPackQuantBDataSize( size_t K, size_t BlkBitWidth, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { @@ -200,7 +206,7 @@ MlasQNBitGemmPackQuantBDataSize( if (BlkBitWidth == 4 && Dispatch->Q4BitGemmPackQuantBDataSize != nullptr) { return Dispatch->Q4BitGemmPackQuantBDataSize( - N, K, BlkLen, ComputeType + N, K, BlkLen, HasZeroPoint, ComputeType ); } @@ -232,7 +238,7 @@ MlasQNBitGemmPackQuantBData( const void* QuantBData, void* PackedQuantBDataAndOrBlkSumWorkspace, const void* QuantBScale, - bool has_zp_input, + bool HasZeroPoint, const void* QuantBZeroPoint, MLAS_THREADPOOL* ThreadPool ) @@ -253,7 +259,7 @@ MlasQNBitGemmPackQuantBData( ComputeType, static_cast(QuantBData), static_cast(QuantBScale), - has_zp_input, + HasZeroPoint, static_cast(QuantBZeroPoint), packed_quant_b, ThreadPool @@ -286,6 +292,29 @@ MlasQNBitGemmPackQuantBData( } } +bool MLASCALL +MlasQNBitGemmScalesPacked( + size_t K, + size_t BlkBitWidth, + size_t BlkLen, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + bool HasZeroPoint +) { +#ifdef MLAS_TARGET_ARM64 + if (BlkBitWidth == 4 && ComputeType == SQNBIT_CompInt8) { + const auto UsePacked = GetMlasPlatform().QNBitGemmDispatch->UsePacked_CompInt8; + return UsePacked && UsePacked(K, BlkLen, HasZeroPoint); + } +#else + MLAS_UNREFERENCED_PARAMETER(K); + MLAS_UNREFERENCED_PARAMETER(BlkBitWidth); + MLAS_UNREFERENCED_PARAMETER(BlkLen); + MLAS_UNREFERENCED_PARAMETER(ComputeType); + MLAS_UNREFERENCED_PARAMETER(HasZeroPoint); +#endif // MLAS_TARGET_ARM64 + return false; +} + namespace { @@ -519,6 +548,16 @@ SQ4BitGemm_CompInt8( const size_t RangeCountN ) { + const auto UsePacked = GetMlasPlatform().QNBitGemmDispatch->UsePacked_CompInt8; + const auto SQ4BitGemm = 
GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_Packed_CompInt8; + if (UsePacked && SQ4BitGemm && UsePacked(K, BlkLen, DataParams->QuantBZeroPoint)) { + const std::byte* QuantA = static_cast(PerGemmWorkspace); + SQ4BitGemm(BlkLen, QuantA, DataParams->PackedQuantBData, + DataParams->C, RangeStartM, RangeCountM, RangeStartN, RangeCountN, K, + DataParams->ldc, DataParams->Bias); + return; + } + #ifdef MLAS_TARGET_AMD64_IX86 PerGemmQuantAWorkspace* const per_gemm_quant_a_workspace = static_cast(PerGemmWorkspace); constexpr size_t BlkBitWidth = 4; @@ -666,6 +705,8 @@ InitializeWorkspace_CompInt8( { MLAS_UNREFERENCED_PARAMETER(N); + const auto UsePacked = GetMlasPlatform().QNBitGemmDispatch->UsePacked_CompInt8; + const auto QuantizeA_Packed = GetMlasPlatform().QNBitGemmDispatch->QuantizeA_Packed_CompInt8; const auto QuantizeARow = GetMlasPlatform().QNBitGemmDispatch->QuantizeARow_CompInt8; const auto QuantizeARow2 = GetMlasPlatform().QNBitGemmDispatch->QuantizeARowComputeBlkSum_CompInt8; @@ -673,7 +714,15 @@ InitializeWorkspace_CompInt8( const size_t QuantAStride = BlockCountK * Q8BlkSize(BlkLen); // TODO: try parallel on BatchN * M threads because BatchN is usually 1. - if (QuantizeARow) { + if (UsePacked && QuantizeA_Packed && UsePacked(K, BlkLen, DataParams->QuantBZeroPoint)) { + MlasTrySimpleParallel(ThreadPool, BatchN, [&](ptrdiff_t gemm_idx) { + const auto& data = DataParams[gemm_idx]; + + const float* ARowPtr = data.A; + std::byte* QuantARowPtr = static_cast(Workspace) + gemm_idx * PerGemmWorkspaceStride; + QuantizeA_Packed(BlkLen, ARowPtr, M, K, QuantARowPtr); + }); + } else if (QuantizeARow) { MlasTrySimpleParallel(ThreadPool, BatchN, [&](ptrdiff_t gemm_idx) { const auto& data = DataParams[gemm_idx]; @@ -844,7 +893,9 @@ MlasQNBitGemmBatch( ); } - const size_t PerGemmWorkspaceStride = QNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, ComputeType); + const bool has_zp_input = DataParams->QuantBZeroPoint; + const size_t PerGemmWorkspaceStride = + QNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, has_zp_input, ComputeType); if (const auto InitializeWorkspaceOperation = GetInitializeWorkspace(Variant); InitializeWorkspaceOperation != nullptr) { @@ -862,7 +913,7 @@ MlasQNBitGemmBatch( const auto* Data = &DataParams[gemm_i]; void* PerGemmWorkspace = reinterpret_cast(Workspace) + gemm_i * PerGemmWorkspaceStride; - if (ComputeType == SQNBIT_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { + if (ComputeType == SQNBIT_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_BlkSum_CompInt8 != nullptr) { PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen); const_cast*>(Data)->PackedQuantBData = packed_quant_b.PackedQuantBData; const_cast*>(Data)->QuantBBlkSum = packed_quant_b.QuantBBlkSum; @@ -933,7 +984,7 @@ MlasQNBitGemmBatch( void* PerGemmWorkspace = reinterpret_cast(Workspace) + gemm_i * PerGemmWorkspaceStride; - if (ComputeType == SQNBIT_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { + if (ComputeType == SQNBIT_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_BlkSum_CompInt8 != nullptr) { PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen); const_cast*>(Data)->PackedQuantBData = packed_quant_b.PackedQuantBData; const_cast*>(Data)->QuantBBlkSum = packed_quant_b.QuantBBlkSum; diff --git a/onnxruntime/core/mlas/lib/qnbitgemm.h 
b/onnxruntime/core/mlas/lib/qnbitgemm.h index eb3d0b44ae3de..e25455cbfa217 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm.h +++ b/onnxruntime/core/mlas/lib/qnbitgemm.h @@ -55,9 +55,12 @@ struct PackedQuantBDataStruct { constexpr size_t BlkBitWidth = 4; const size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); size_t BlkSumSize = MlasDivRoundup(N, 16) * BlockCountK * 16 * sizeof(T); - +#if defined(MLAS_TARGET_AMD64_IX86) // _mm256_load_si256 requires alignment on a 32-byte boundary PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 32); +#else + PackedQuantBData = (std::byte*)PackedQuantBWorkspace; +#endif QuantBBlkSum = (T*)(PackedQuantBData + PackedQuantBDataSize); QuantBBlkSum = (T*)MlasAlignAddress(QuantBBlkSum, MlasQNBitQuantBBlkSumAlignment()); PackedQuantBScale = (T*)((std::byte*)QuantBBlkSum + BlkSumSize); @@ -95,6 +98,7 @@ struct MLAS_QNBIT_GEMM_DISPATCH { size_t N, size_t K, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); @@ -121,9 +125,9 @@ struct MLAS_QNBIT_GEMM_DISPATCH { MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, - bool has_zp_input, + bool HasZeroPoint, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& PackedQuantB, MLAS_THREADPOOL* ThreadPool ); @@ -141,6 +145,7 @@ struct MLAS_QNBIT_GEMM_DISPATCH { * @param[in] N column size of matrix B and C * @param[in] K column size of matrix A and row size of matrix B * @param[in] BlkLen number of quantized values per block + * @param[in] HasZeroPoint whether zero points are provided * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ typedef size_t(Q4BitGemmPerGemmWorkspaceSize_Fn)( @@ -148,6 +153,7 @@ struct MLAS_QNBIT_GEMM_DISPATCH { size_t N, size_t K, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); @@ -267,6 +273,39 @@ struct MLAS_QNBIT_GEMM_DISPATCH { // SQNBIT_CompInt8 kernel function prototypes. // + /** + * @brief Multiply quantized 8-bit integer matrix A with quantized 4-bit integer matrix B. + * A and B are block quantized and B is column major. + * A should be packed using QuantizeA_Packed_CompInt8. + * + * @param BlkLen Number of values in a block. + * @param QuantA Supplies the quantized A matrix. + Binary data containing block quantized int8 data and scale values. + * @param PackedQuantBData Supplies the packed quantized B matrix data. + * @param[out] C Supplies the output C matrix. + * @param RangeStartM Start of M range. + * @param RangeCountM Number of rows of A and C. + * @param RangeStartN Start of N range. + * @param RangeCountN Number of columns of B and C. + * @param CountK Number of columns of A and rows of B. + * @param ldc Number of elements between adjacent rows of C. + */ + typedef void(SQ4BitGemmKernel_Packed_CompInt8_Fn)( + size_t BlkLen, + const std::byte* QuantA, + const std::byte* PackedQuantBData, + float* C, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN, + size_t CountK, + size_t ldc, + const float* Bias + ); + + SQ4BitGemmKernel_Packed_CompInt8_Fn* SQ4BitGemmKernel_Packed_CompInt8 = nullptr; + /** * @brief Multiply quantized 8-bit integer matrix A with quantized 4-bit integer matrix B. * A and B are block quantized and B is column major. 
@@ -343,6 +382,38 @@ struct MLAS_QNBIT_GEMM_DISPATCH { SQ4BitGemmKernel_CompInt8_Fn* SQ4BitGemmKernel_CompInt8 = nullptr; + /** + * @brief Whether to use SQ4BitGemmKernel_Packed_CompInt8 for this problem. + */ + typedef bool(UsePacked_CompInt8_Fn)( + size_t K, + size_t BlkLen, + bool HasZp + ); + + UsePacked_CompInt8_Fn* UsePacked_CompInt8 = nullptr; + + /** + * @brief Block quantize values from matrix A from floats to quantized 8-bit integers. + * Used in conjunction with SQ4BitGemmKernel_Packed_CompInt8. + * + * @param BlkLen Number of values in a block. + * @param A Supplies the A matrix. + * @param CountM Number of rows of A. + * @param CountK Number of columns of A. + * @param[out] QuantA Supplies the output quantized A matrix. + * Binary data containing block quantized int8 data and scale values. + */ + typedef void(QuantizeA_Packed_CompInt8_Fn)( + size_t BlkLen, + const float* A, + size_t CountM, + size_t CountK, + std::byte* QuantA + ); + + QuantizeA_Packed_CompInt8_Fn* QuantizeA_Packed_CompInt8 = nullptr; + /** * @brief Block quantize values from one row of matrix A from floats to quantized 8-bit integers. * diff --git a/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp index 748ad8b9ba1a0..ab71492805e9c 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp @@ -20,10 +20,18 @@ Module Name: #include #include +#include #include "qnbitgemm.h" #include "sqnbitgemm_q8_block.h" +#ifdef USE_KLEIDIAI +#include "kai/kai_common.h" +#include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h" +#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" +#include "kai_ukernel_interface.h" +#endif + namespace sqnbitgemm_neon { @@ -39,16 +47,31 @@ Q4BitGemmPackQuantBDataSize( size_t N, size_t K, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { +#ifndef USE_KLEIDIAI + MLAS_UNREFERENCED_PARAMETER(HasZeroPoint); MLAS_UNREFERENCED_PARAMETER(ComputeType); // same size regardless of ComputeType - - constexpr size_t BlkBitWidth = 4; - - const size_t BlockCountK = MlasDivRoundup(K, BlkLen); - const size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); - return PackedQuantBDataSize; +#endif + +#ifdef USE_KLEIDIAI + if (ComputeType == SQNBIT_CompInt8 && UseKleidiAI(K, BlkLen, HasZeroPoint)) { + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& ukernel = GetKleidiAIGemmUKernel(); + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + return kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, BlkLen, kai_dt_bf16); + } else +#endif + { + constexpr size_t BlkBitWidth = 4; + + const size_t BlockCountK = MlasDivRoundup(K, BlkLen); + const size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + return PackedQuantBDataSize; + } } void @@ -122,6 +145,60 @@ SQ4BitGemmPackQuantBData( ); } +void +SQ4BitGemmPackQuantBDataAndBlkSum( + size_t N, + size_t K, + size_t BlkLen, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + const std::byte* QuantBDataBegin, + const float* QuantBScaleBegin, + bool HasZeroPoint, + const std::byte*, + PackedQuantBDataStruct& PackedQuantB, + MLAS_THREADPOOL* ThreadPool +) +{ +#ifndef USE_KLEIDIAI + MLAS_UNREFERENCED_PARAMETER(QuantBScaleBegin); + MLAS_UNREFERENCED_PARAMETER(HasZeroPoint); +#endif + assert(BlkLen >= 16 && BlkLen % 16 == 0); + 
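The three hooks declared above are consumed by qnbitgemm.cpp earlier in this diff: `UsePacked_CompInt8` gates the path, `QuantizeA_Packed_CompInt8` packs the whole A matrix into the per-GEMM workspace, and `SQ4BitGemmKernel_Packed_CompInt8` then runs against that packed A and the prepacked B. Below is a stripped-down sketch of the gate only; the function name `TakePackedInt8Path` is hypothetical, and the real checks (with workspace, batching, and fallback plumbing) live in `InitializeWorkspace_CompInt8` and `SQ4BitGemm_CompInt8`.

```cpp
#include <cstddef>

using UsePackedFn = bool (*)(size_t K, size_t BlkLen, bool HasZeroPoint);

// Illustrative only: both the gate and the packed kernel must be registered
// by the platform dispatch, and the problem must qualify, before the packed
// int8 path replaces the per-row QuantizeARow/SQ4BitGemmKernel fallbacks.
bool TakePackedInt8Path(UsePackedFn use_packed, bool have_packed_kernel,
                        size_t K, size_t BlkLen, bool HasZeroPoint) {
  return use_packed != nullptr && have_packed_kernel &&
         use_packed(K, BlkLen, HasZeroPoint);
}
```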
+#ifdef USE_KLEIDIAI + if (UseKleidiAI(K, BlkLen, HasZeroPoint)) { + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& ukernel = GetKleidiAIGemmUKernel(); + std::byte* PackedQuantBDataBegin = PackedQuantB.PackedQuantBData; + + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + + kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params; + params.lhs_zero_point = 1; + params.rhs_zero_point = 8; + params.scale_dt = kai_dt_bf16; + + const size_t BlockCountK = MlasDivRoundup(K, BlkLen); + const size_t scales_len = N * BlockCountK; + std::vector scales(scales_len); + for (size_t i = 0; i < scales_len; i++) { + const uint32_t* i32 = reinterpret_cast(&QuantBScaleBegin[i]); + scales[i] = *i32 >> 16; + } + + kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(1, N, K, nr, kr, sr, BlkLen, + reinterpret_cast(QuantBDataBegin), BlockCountK * BlkLen / 2, + nullptr, scales.data(), BlockCountK * sizeof(uint16_t), + PackedQuantBDataBegin, 0, ¶ms); + } else +#endif + { + std::byte* PackedQuantBDataBegin = reinterpret_cast(PackedQuantB.QuantBWorkspace_); + SQ4BitGemmPackQuantBData(N, K, BlkLen, ComputeType, QuantBDataBegin, PackedQuantBDataBegin, ThreadPool); + } +} + // // Workspace size calculation function implementation. // @@ -132,17 +209,34 @@ Q4BitGemmPerGemmWorkspaceSize( size_t N, size_t K, size_t BlkLen, + bool HasZeroPoint, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { MLAS_UNREFERENCED_PARAMETER(N); +#ifndef USE_KLEIDIAI + MLAS_UNREFERENCED_PARAMETER(HasZeroPoint); +#endif switch (ComputeType) { case SQNBIT_CompInt8: { // workspace buffer is used for block quantization of A to int8 - const size_t BlockCountK = MlasDivRoundup(K, BlkLen); - const size_t PerGemmWorkspaceSize = M * BlockCountK * Q8BlkSize(BlkLen); - return PerGemmWorkspaceSize; +#ifdef USE_KLEIDIAI + if (UseKleidiAI(K, BlkLen, HasZeroPoint)) { + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& ukernel = + M == 1? 
GetKleidiAIGemvUKernel() : GetKleidiAIGemmUKernel(); + + const size_t mr = ukernel.get_mr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + return kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); + } else +#endif + { + const size_t BlockCountK = MlasDivRoundup(K, BlkLen); + const size_t PerGemmWorkspaceSize = M * BlockCountK * Q8BlkSize(BlkLen); + return PerGemmWorkspaceSize; + } } default: { return 0; @@ -170,6 +264,20 @@ Q4BitGemmPerGemmWorkspaceAlignment( } // namespace +bool +UseKleidiAI(size_t K, size_t BlkLen, bool HasZp) +{ +#ifdef USE_KLEIDIAI + bool has_dotprod = MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeonDot(); + return (BlkLen % 32) == 0 && (K % BlkLen) == 0 && !HasZp && has_dotprod; +#else + MLAS_UNREFERENCED_PARAMETER(K); + MLAS_UNREFERENCED_PARAMETER(BlkLen); + MLAS_UNREFERENCED_PARAMETER(HasZp); + return false; +#endif +} + } // namespace sqnbitgemm_neon // @@ -189,16 +297,24 @@ GetMlasQNBitGemmDispatchNeon( d.Q4BitGemmPackQuantBDataSize = sqnbitgemm_neon::Q4BitGemmPackQuantBDataSize; d.SQ4BitGemmPackQuantBData = sqnbitgemm_neon::SQ4BitGemmPackQuantBData; + d.SQ4BitGemmPackQuantBDataAndBlkSum = sqnbitgemm_neon::SQ4BitGemmPackQuantBDataAndBlkSum; d.Q4BitGemmPerGemmWorkspaceSize = sqnbitgemm_neon::Q4BitGemmPerGemmWorkspaceSize; d.Q4BitGemmPerGemmWorkspaceAlignment = sqnbitgemm_neon::Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = sqnbitgemm_neon::SQ4BitGemmM1Kernel_CompFp32; d.SQ4BitBlkDequantBForSgemm_CompFp32 = sqnbitgemm_neon::SQ4BitBlkDequantBForSgemm_CompFp32; + if (InitializeWithDotSupport) { d.SQ4BitGemmKernel_CompInt8 = sqnbitgemm_neon::SQ4BitGemmKernel_CompInt8; + d.QuantizeARow_CompInt8 = sqnbitgemm_neon::QuantizeARow_CompInt8; + d.UsePacked_CompInt8 = sqnbitgemm_neon::UsePacked_CompInt8; + +#ifdef USE_KLEIDIAI + d.SQ4BitGemmKernel_Packed_CompInt8 = sqnbitgemm_neon::SQ4BitGemmKernel_Packed_CompInt8; + d.QuantizeA_Packed_CompInt8 = sqnbitgemm_neon::QuantizeA_Packed_CompInt8; +#endif } - d.QuantizeARow_CompInt8 = sqnbitgemm_neon::QuantizeARow_CompInt8; #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) d.HQ4BitGemmPackQuantBData = sqnbitgemm_neon::HQ4BitGemmPackQuantBData_CompFp16; diff --git a/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.h b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.h index ce9f0fe6d939d..a254ec9f92596 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.h +++ b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.h @@ -108,6 +108,13 @@ HQ4BitGemmKernel_CompFp16( // SQNBIT_CompInt8 declarations +bool +UsePacked_CompInt8( + size_t K, + size_t BlkLen, + bool HasZp +); + void QuantizeARow_CompInt8( size_t BlkLen, @@ -132,6 +139,35 @@ SQ4BitGemmKernel_CompInt8( const float* Bias ); +#ifdef USE_KLEIDIAI +void +QuantizeA_Packed_CompInt8( + size_t BlkLen, + const float* A, + size_t CountM, + size_t CountK, + std::byte* QuantA +); + +void +SQ4BitGemmKernel_Packed_CompInt8( + size_t BlkLen, + const std::byte* QuantA, + const std::byte* PackedQuantBData, + float* C, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN, + size_t CountK, + size_t ldc, + const float *Bias +); +#endif + +bool +UseKleidiAI(size_t K, size_t BlkLen, bool HasZp); + // // General helpers. 
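Two details of the NEON/KleidiAI hunks above, restated as a hedged C++ sketch (helper names are illustrative): the scales handed to the KleidiAI RHS packer are bf16 produced by truncating the float's top 16 bits (the `*i32 >> 16` above), and the packed path is only eligible when the block layout, zero-point usage, and hardware allow it, mirroring `UseKleidiAI`.

```cpp
#include <cstdint>
#include <cstring>

// Float scale -> bf16 by truncation (keep the high 16 bits of the IEEE-754
// bit pattern), matching the scale conversion done before kai_run_rhs_pack...
inline uint16_t FloatToBf16Truncate(float scale) {
  uint32_t bits;
  std::memcpy(&bits, &scale, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

// Eligibility for the packed KleidiAI path: block length a multiple of 32,
// K an exact multiple of the block length, no zero points, and NEON
// dot-product support on the running CPU.
inline bool UseKleidiAISketch(size_t K, size_t BlkLen, bool HasZeroPoint, bool HasNeonDot) {
  return (BlkLen % 32) == 0 && (K % BlkLen) == 0 && !HasZeroPoint && HasNeonDot;
}
```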
// diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp index 81615da46aa2e..79893eea85eca 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp @@ -1309,9 +1309,9 @@ SQ4BitGemmPackQuantBDataAndBlkSum( MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, - bool has_zp_input, + bool HasZeroPoint, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& PackedQuantB, MLAS_THREADPOOL* ThreadPool ) { @@ -1324,7 +1324,8 @@ SQ4BitGemmPackQuantBDataAndBlkSum( if (BlkLen == 32 && ComputeType == SQNBIT_CompInt8) { SubBlkLen = 64; } - PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, has_zp_input, QuantBZPBegin, packed_quant_b, ThreadPool); + PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, + HasZeroPoint, QuantBZPBegin, PackedQuantB, ThreadPool); } // diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp index b4e25d4e4040a..ea06f954c854a 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp @@ -335,9 +335,9 @@ SQ4BitGemmPackQuantBDataAndBlkSum512( MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, - bool has_zp_input, + bool HasZeroPoint, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& PackedQuantB, MLAS_THREADPOOL* ThreadPool ) { @@ -349,7 +349,8 @@ SQ4BitGemmPackQuantBDataAndBlkSum512( if (ComputeType == SQNBIT_CompInt8) { SubBlkLen = 128; } - PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, has_zp_input, QuantBZPBegin, packed_quant_b, ThreadPool); + PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, + HasZeroPoint, QuantBZPBegin, PackedQuantB, ThreadPool); } const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512 = []() { diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp index a4468bb906bbc..c2fcd92be2364 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp @@ -317,9 +317,9 @@ SQ4BitGemmPackQuantBDataAndBlkSum512vnni( MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, - bool has_zp_input, + bool HasZeroPoint, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& PackedQuantB, MLAS_THREADPOOL* ThreadPool ) { @@ -331,7 +331,8 @@ SQ4BitGemmPackQuantBDataAndBlkSum512vnni( if (ComputeType == SQNBIT_CompInt8) { SubBlkLen = 128; } - PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, has_zp_input, QuantBZPBegin, packed_quant_b, ThreadPool); + PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, + HasZeroPoint, QuantBZPBegin, PackedQuantB, ThreadPool); } // diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h index b0367b7fb9a15..02429a0c64f8e 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h +++ 
b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h @@ -11,6 +11,7 @@ Q4BitGemmPackQuantBDataSize( size_t N, size_t K, size_t BlkLen, + bool /* HasZeroPoint */, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { @@ -302,22 +303,22 @@ PackQuantBDataAndBlkSum( size_t SubBlkLen, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, - bool has_zp_input, + bool HasZeroPoint, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& PackedQuantB, MLAS_THREADPOOL* ThreadPool ) { if (QuantBDataBegin) { - PackQuantB(QuantBDataBegin, packed_quant_b.PackedQuantBData, ThreadPool, N, BlockCountK, BlkLen, SubBlkLen); + PackQuantB(QuantBDataBegin, PackedQuantB.PackedQuantBData, ThreadPool, N, BlockCountK, BlkLen, SubBlkLen); } if (QuantBScaleBegin) { - std::copy(QuantBScaleBegin, QuantBScaleBegin + N * BlockCountK, packed_quant_b.PackedQuantBScale); + std::copy(QuantBScaleBegin, QuantBScaleBegin + N * BlockCountK, PackedQuantB.PackedQuantBScale); } - if ((QuantBScaleBegin && !has_zp_input) || QuantBZPBegin) { - ComputePackBlkSum(BlkLen, SubBlkLen, N, packed_quant_b.PackedQuantBScale, QuantBZPBegin, packed_quant_b.QuantBBlkSum, ThreadPool, BlockCountK); + if ((QuantBScaleBegin && !HasZeroPoint) || QuantBZPBegin) { + ComputePackBlkSum(BlkLen, SubBlkLen, N, PackedQuantB.PackedQuantBScale, QuantBZPBegin, PackedQuantB.QuantBBlkSum, ThreadPool, BlockCountK); } } @@ -331,6 +332,7 @@ Q4BitGemmPerGemmWorkspaceSize( size_t N, size_t K, size_t BlkLen, + bool /* HasZeroPoint */, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp index 73beb06a3cfad..8dbd339468930 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp @@ -1,7 +1,6 @@ /*++ Copyright (c) Microsoft Corporation. All rights reserved. - Licensed under the MIT License. Module Name: @@ -20,11 +19,17 @@ Module Name: #include #include +#include #include "qnbitgemm.h" #include "qnbitgemm_kernel_neon.h" #include "sqnbitgemm_q8_block.h" +#ifdef USE_KLEIDIAI +#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" +#include "kai_ukernel_interface.h" +#endif + namespace sqnbitgemm_neon { @@ -126,6 +131,41 @@ QuantizeBlock( } // namespace +bool +UsePacked_CompInt8(size_t K, size_t BlkLen, bool HasZp) +{ + return UseKleidiAI(K, BlkLen, HasZp); +} + +#ifdef USE_KLEIDIAI +void +QuantizeA_Packed_CompInt8( + size_t, + const float* A, + size_t CountM, + size_t CountK, + std::byte* QuantA +) +{ + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& ukernel = + CountM == 1? 
GetKleidiAIGemvUKernel() : GetKleidiAIGemmUKernel(); + + const size_t mr = ukernel.get_mr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + + const size_t src_stride = CountK * sizeof(float); + const size_t lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(0, src_stride); + const size_t lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32( + 0, CountK, mr, kr, sr); + + const float* src_ptr = reinterpret_cast(reinterpret_cast(A) + lhs_offset); + void* dst_ptr = QuantA + lhs_packed_offset; + + kai_run_lhs_quant_pack_qai8dxp_f32(CountM, CountK, mr, kr, sr, 0, src_ptr, src_stride, dst_ptr); +} +#endif + void QuantizeARow_CompInt8( size_t BlkLen, @@ -1399,4 +1439,47 @@ SQ4BitGemmKernel_CompInt8( return CountM; } +#ifdef USE_KLEIDIAI +void +SQ4BitGemmKernel_Packed_CompInt8( + size_t BlkLen, + const std::byte* QuantA, + const std::byte* PackedQuantBData, + float* C, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN, + size_t CountK, + size_t ldc, + const float* Bias +) +{ + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel ukernel = + RangeCountM == 1 && RangeStartM == 0? GetKleidiAIGemvUKernel() : GetKleidiAIGemmUKernel(); + + const size_t dst_stride = ldc * sizeof(float); + + const size_t lhs_packed_offset = ukernel.get_lhs_packed_offset(RangeStartM, CountK); + const size_t rhs_packed_offset = ukernel.get_rhs_packed_offset(RangeStartN, CountK, BlkLen); + const size_t dst_offset = ukernel.get_dst_offset(RangeStartM, RangeStartN, dst_stride); + + const void* lhs_ptr = QuantA + lhs_packed_offset; + const void* rhs_ptr = PackedQuantBData + rhs_packed_offset; + float* dst_ptr = reinterpret_cast(reinterpret_cast(C) + dst_offset); + + ukernel.run_matmul( + RangeCountM, RangeCountN, CountK, BlkLen, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), + -std::numeric_limits::max(), std::numeric_limits::max()); + + if (Bias != nullptr) { + for (size_t m = RangeStartM; m < RangeStartM + RangeCountM; m++) { + for (size_t n = RangeStartN; n < RangeStartN + RangeCountN; n++) { + C[m * ldc + n] += Bias[n]; + } + } + } +} +#endif + } // namespace sqnbitgemm_neon diff --git a/onnxruntime/core/providers/cann/cann_utils.cc b/onnxruntime/core/providers/cann/cann_utils.cc index 95d7a462ca9d9..5b3f9e6731b34 100644 --- a/onnxruntime/core/providers/cann/cann_utils.cc +++ b/onnxruntime/core/providers/cann/cann_utils.cc @@ -220,7 +220,7 @@ bool FileExist(const std::string& file_name) { void GenerateHashValue(const std::string string, HashValue& hash_value) { uint32_t hash[4] = {0, 0, 0, 0}; - MurmurHash3::x86_128(string.data(), gsl::narrow_cast(string.size()), hash[0], &hash); + MurmurHash3::x86_128(string.data(), string.size(), hash[0], &hash); hash_value = hash[0] | (uint64_t(hash[1]) << 32); } diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc index 3d9ae2bf7e6ff..cf9f44f4cd8f0 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc @@ -68,7 +68,7 @@ void* MIGraphXExternalAllocator::Reserve(size_t size) { return p; } -void* HIPPinnedAllocator::Alloc(size_t size) { +void* MIGraphXPinnedAllocator::Alloc(size_t size) { void* p = nullptr; if (size > 0) { HIP_CALL_THROW(hipHostMalloc((void**)&p, size)); @@ -76,7 +76,7 @@ void* HIPPinnedAllocator::Alloc(size_t size) { return p; } -void HIPPinnedAllocator::Free(void* p) { +void 
MIGraphXPinnedAllocator::Free(void* p) { HIP_CALL_THROW(hipHostFree(p)); } diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.h b/onnxruntime/core/providers/migraphx/migraphx_allocator.h index c8c935eba44ab..2a84445897391 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_allocator.h +++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.h @@ -49,17 +49,16 @@ class MIGraphXExternalAllocator : public MIGraphXAllocator { std::unordered_set reserved_; }; -// TODO: add a default constructor -class HIPPinnedAllocator : public IAllocator { +class MIGraphXPinnedAllocator final : public IAllocator { public: - HIPPinnedAllocator(int device_id, const char* name) + MIGraphXPinnedAllocator(const int device_id, const char* name) : IAllocator( - OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, + OrtMemoryInfo(name, OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, static_cast(device_id)), device_id, OrtMemTypeCPUOutput)) {} - virtual void* Alloc(size_t size) override; - virtual void Free(void* p) override; + void* Alloc(size_t size) override; + void Free(void* p) override; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc index 7b192b657b7cc..2904c17bb4aa0 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc @@ -39,7 +39,7 @@ struct ProviderInfo_MIGraphX_Impl final : ProviderInfo_MIGraphX { } std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) override { - return std::make_unique(device_id, name); + return std::make_unique(device_id, name); } } g_info; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 0328f6c2014fa..6b9f6a5e73e0f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -52,6 +52,70 @@ static const char* DlError() { #endif } +Status ReadBinaryFromFile(const std::string& file_path, uint8_t* buffer, size_t buffer_size) { + ORT_RETURN_IF(nullptr == buffer, "Binary buffer is nullptr"); + std::ifstream in(file_path, std::ifstream::binary); + ORT_RETURN_IF(!in, "Failed to open input file: ", file_path.c_str()); + ORT_RETURN_IF(!in.read(reinterpret_cast(buffer), buffer_size), "Failed to read the contents of: ", file_path.c_str()); + return Status::OK(); +} + +Status QnnBackendManager::ParseLoraConfig(std::string lora_config_path) { + LOGS_DEFAULT(INFO) << "Acquiring the QnnInterface " << lora_config_path; + + // QNN Lora Config file format should be a single line, with the graph name first, + // followed by the qnn lora context binary path, separated by a semicolon (;) + // Example: ; + LOGS_DEFAULT(INFO) << "Loading Lora Config " << lora_config_path; + std::ifstream file(lora_config_path); + std::string line; + + if (file.is_open()) { + if (std::getline(file, line)) { + std::istringstream ss(line); + std::string graph_name; + std::string lora_adapter_bin_path; + + if (std::getline(ss, graph_name, ';') && std::getline(ss, lora_adapter_bin_path)) { + size_t buffer_size = std::filesystem::file_size(lora_adapter_bin_path.c_str()); + + ORT_RETURN_IF(0 == buffer_size, "Received path to an empty file. 
Nothing to deserialize."); + std::unique_ptr buffer = std::make_unique(buffer_size); + void* voidBufferPtr = static_cast(buffer.get()); + QnnContext_Buffer_t contextBuffer{QNN_CONTEXT_BUFFER_VERSION_1, + {QNN_CONTEXTMEMTYPE_RAW, {{voidBufferPtr, buffer_size}}}}; + + auto status = ReadBinaryFromFile(lora_adapter_bin_path, + reinterpret_cast(buffer.get()), + buffer_size); + + ORT_RETURN_IF(status != Status::OK(), "Failed to read binary data."); + Qnn_GraphHandle_t graph; + bool graph_retrieve_success = false; + for (size_t cIdx = 0; cIdx < contexts_.size(); cIdx++) { + auto graph_retrieve_rt = qnn_interface_.graphRetrieve(contexts_[cIdx], graph_name.c_str(), &graph); + if (QNN_SUCCESS != graph_retrieve_rt) { + continue; + } + + graph_retrieve_success = true; + + auto context_apply_binary_section_rt = qnn_interface_.contextApplyBinarySection( + contexts_[cIdx], graph, QNN_CONTEXT_SECTION_UPDATABLE, &contextBuffer, profile_backend_handle_, nullptr); + ORT_RETURN_IF(QNN_SUCCESS != context_apply_binary_section_rt, "Failed to apply binary section."); + break; + } + ORT_RETURN_IF_NOT(graph_retrieve_success, "Failed to retrieve graph: ", graph_name, " and apply binary section."); + } + } + file.close(); + } else { + LOGS_DEFAULT(ERROR) << "Unable to load Lora Config " << lora_config_path; + } + + return Status::OK(); +} + template Status QnnBackendManager::GetQnnInterfaceProvider(const char* lib_path, const char* interface_provider_name, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index bd451d9ba9c1d..137b3856d431d 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -139,6 +139,8 @@ class QnnBackendManager : public std::enable_shared_from_this const Qnn_Tensor_t& qnn_tensor, Qnn_MemHandle_t& mem_handle); + Status ParseLoraConfig(std::string lora_config); + private: Status LoadBackend(); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index e5b88a77b334c..2606ace8127d3 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -1194,6 +1194,12 @@ Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_optio } } + std::string lora_config = ""; + if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnLoraConfig, lora_config)) { + LOGS_DEFAULT(VERBOSE) << "lora_config: " << lora_config; + ORT_RETURN_IF_ERROR(qnn_backend_manager_->ParseLoraConfig(lora_config)); + } + return Status::OK(); } diff --git a/onnxruntime/core/providers/shared/utils/utils.cc b/onnxruntime/core/providers/shared/utils/utils.cc index 5b2f2c1fa1b2e..4281b5e53c5fd 100644 --- a/onnxruntime/core/providers/shared/utils/utils.cc +++ b/onnxruntime/core/providers/shared/utils/utils.cc @@ -100,17 +100,6 @@ bool GetClipMinMax(const GraphViewer& graph_viewer, const Node& node, float& min node, min, max, logger); } -// deprecated version that is not able to check if the initializer is constant -bool GetClipMinMax(const InitializedTensorSet& initializers, const Node& node, float& min, float& max, - const logging::Logger& logger) { - return GetClipMinMaxImpl( - [&initializers](const std::string& name) -> const ONNX_NAMESPACE::TensorProto* { - auto entry = initializers.find(name); - return entry == initializers.end() ? 
nullptr : entry->second; - }, - node, min, max, logger); -} - NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node) : node_attributes_(node.GetAttributes()) {} diff --git a/onnxruntime/core/providers/shared/utils/utils.h b/onnxruntime/core/providers/shared/utils/utils.h index ddbae42534711..78da6c76047bd 100644 --- a/onnxruntime/core/providers/shared/utils/utils.h +++ b/onnxruntime/core/providers/shared/utils/utils.h @@ -27,12 +27,6 @@ class NodeUnit; bool GetClipMinMax(const GraphViewer& graph_viewer, const Node& node, float& min, float& max, const logging::Logger& logger); -/// GraphViewer GetConstantInitializer/IsConstantInitializer should be used to ensure the initializer is -/// constant. Low risk for Clip min/max but in general the infrastructure to check if an operator is supported needs -/// to be updated to not use InitializedTensorSet which may contain non-constant initializers. -bool GetClipMinMax(const InitializedTensorSet& initializers, const Node& node, - float& min, float& max, const logging::Logger& logger); - // Get the type of the given NodeArg // Will return false if the given NodeArg has no type bool GetType(const NodeArg& node_arg, int32_t& type, const logging::Logger& logger); diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 90fd36ea29956..0d01215efaa14 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -775,7 +775,7 @@ std::unique_ptr CreateModel(const GraphViewer& graph_viewer, const loggin } // namespace cann #endif -void MurmurHash3::x86_128(const void* key, int len, uint32_t seed, void* out) { +void MurmurHash3::x86_128(const void* key, size_t len, uint32_t seed, void* out) { return g_host->MurmurHash3__x86_128(key, len, seed, out); } diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 83d615c1bde0a..9d5e16caa361d 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -1288,7 +1288,7 @@ struct ProviderHost { virtual std::unique_ptr cann__CreateModel(const GraphViewer& graph_viewer, const logging::Logger& logger) = 0; #endif - virtual void MurmurHash3__x86_128(const void* key, int len, uint32_t seed, void* out) = 0; + virtual void MurmurHash3__x86_128(const void* key, size_t len, uint32_t seed, void* out) = 0; #ifdef _WIN32 virtual std::string ToUTF8String(const std::wstring& s) = 0; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc index 71674f7c9c557..b99cb4f52ed59 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc @@ -15,7 +15,7 @@ std::string GetUniqueGraphName(const Graph& graph) { uint32_t hash[4] = {0, 0, 0, 0}; auto hash_str = [&hash](const std::string& str) { - MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); + MurmurHash3::x86_128(str.data(), str.size(), hash[0], &hash); }; // Hash all nodes' name diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h index 
5a7b135fd92cd..dcf3673a004e4 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h @@ -533,7 +533,7 @@ HashValue TRTGenerateId(const GraphViewer& graph_viewer, std::string trt_version uint32_t hash[4] = {0, 0, 0, 0}; auto hash_str = [&hash](const std::string& str) { - MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); + MurmurHash3::x86_128(str.data(), str.size(), hash[0], &hash); }; // Use the model's file name instead of the entire path to avoid cache regeneration if path changes diff --git a/onnxruntime/core/providers/webgpu/math/matmul.cc b/onnxruntime/core/providers/webgpu/math/matmul.cc new file mode 100644 index 0000000000000..9b447d5fdb59a --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/matmul.cc @@ -0,0 +1,228 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/webgpu/math/matmul.h" +#include "core/common/inlined_containers.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +#include "core/providers/webgpu/data_transfer.h" +namespace onnxruntime { +namespace webgpu { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + MatMul, + kOnnxDomain, + 1, 12, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedFloatTypes()), + MatMul); + +ONNX_OPERATOR_KERNEL_EX( + MatMul, + kOnnxDomain, + 13, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedFloatTypes()), + MatMul); + +static std::string CalcResult(int64_t components, int64_t a_components, int64_t output_number) { + std::ostringstream oss; + oss << "var a_data: a_value_t;\n"; + for (int i = 0; i < a_components; ++i) { + oss << "let b_data" << i << " = b[(b_offset + (k + " << i << ") * uniforms.N + col) / " << components << "];\n"; + } + for (int i = 0; i < output_number; ++i) { + oss << "a_data = a[(a_offset + (row + " << i << ") * uniforms.K + k) / " << a_components << "];\n"; + + for (int j = 0; j < a_components; j++) { + oss << "values[" << i << "] = fma(b_value_t(a_data" << (a_components == 1 ? 
"" : "[" + std::to_string(j) + "]") << "), b_data" << j << ", values[" << i << "]);\n"; + } + } + return oss.str(); +} + +Status MatMulNaiveProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& a = shader.AddInput("a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | + ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& b = shader.AddInput("b", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | + ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + + std::string process_bias; + if (has_bias_) { + shader.AddInput("bias", ShaderUsage::UseUniform); + process_bias = "value += output_value_t(bias[row + i]);"; + } + + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | + ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& batch_dims = shader.AddIndices("batch_dims"); + + int a_components = a.NumComponents(); + int components = b.NumComponents(); // components of N + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << "let col = (global_idx % (uniforms.N / " << components << ")) * " << components << ";\n" + << "var index1 = global_idx / (uniforms.N / " << components << ");\n" + << "let stride1 = uniforms.M / " << output_number_ << ";\n" + << "let row = (index1 % stride1) * " << output_number_ << ";\n" + << "let batch = index1 / stride1;\n"; + if (output_rank_ != 2) { + shader.MainFunctionBody() << "let batch_indices = " << batch_dims.OffsetToIndices("batch") << ";\n"; + } + shader.MainFunctionBody() << "var a_indices: a_indices_t;\n" + << ConvertOutputBatchIndicesToInputBatchIndices("a", a, a.Rank() - 2, batch_dims.Rank(), "batch_indices") + << a.IndicesSet("a_indices", a.Rank() - 2, 0) << "\n" + << a.IndicesSet("a_indices", a.Rank() - 1, 0) << "\n" + << "let a_offset = " << a.IndicesToOffset("a_indices") << "*" << a_components << ";\n" + << "var b_indices: b_indices_t;\n" + << ConvertOutputBatchIndicesToInputBatchIndices("b", b, b.Rank() - 2, batch_dims.Rank(), "batch_indices") + << b.IndicesSet("b_indices", b.Rank() - 2, 0) << "\n" + << b.IndicesSet("b_indices", b.Rank() - 1, 0) << "\n" + << "let b_offset = " << b.IndicesToOffset("b_indices") << " * " << components << ";\n" + << "var values: array;\n" + << "for (var k: u32 = 0u; k < uniforms.K; k = k + " << a_components << ") {\n" + << CalcResult(components, a_components, output_number_) << "\n" + << "}\n" + << "for (var i = 0u; i < " << output_number_ << "u; i++) {\n" + << " var value = values[i];\n" + << process_bias << "\n" + << " let cur_indices = output_indices_t(batch, row + i, col/ " << components << ");\n" + << " let offset = " << output.IndicesToOffset("cur_indices") << ";\n" + << output.SetByOffset("offset", "value") + << "}\n"; + + return Status::OK(); +} + +Status MatMul::ComputeInternal(ComputeContext& context) const { + // calculate output shape + MatMulComputeHelper helper; + const auto* a = context.Input(0); + const auto* b = context.Input(1); + + ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape())); + auto* output_tensor = context.Output(0, helper.OutputShape()); + bool has_bias = context.InputCount() > 2; + + if (helper.N() < 8 && helper.K() < 8) { // call MatMulNaiveProgram + + const uint32_t m = narrow(helper.M()); // left matrix first dimension + const uint32_t n = narrow(helper.N()); // right matrix second dimension + const uint32_t k = narrow(helper.K()); // right matrix first dimension + + const auto components = 
GetMaxComponents(n); + const auto a_components = GetMaxComponents(k); + + const auto output_number = GetMaxComponents(m); + uint32_t output_size = narrow(helper.OutputShape().Size() / components / output_number); + + const size_t output_rank = helper.OutputShape().NumDimensions(); + TensorShape outer_dims = output_rank > 2 ? helper.OutputShape().Slice(0, output_rank - 2) : TensorShape({}); + const int64_t batch_size = outer_dims.Size(); + + const int64_t a_rows = a->Shape().NumDimensions() > 1 ? a->Shape()[a->Shape().NumDimensions() - 2] : 1; + TensorShape output_shape_shader({batch_size, a_rows, helper.N() / components}); + + MatMulNaiveProgram program{output_rank, output_number, has_bias}; + + program + .CacheHint(std::to_string(components), std::to_string(a_components), std::to_string(output_number)) + .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, a_components}, + {b, ProgramTensorMetadataDependency::TypeAndRank, components}}); + + if (has_bias) { + const auto* bias = context.Input(2); + program.AddInput({bias, ProgramTensorMetadataDependency::Rank, 1}); + } + program + .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::None, output_shape_shader, components}}) + .SetDispatchGroupSize((output_size + 63) / 64) // Integer ceiling division + .AddIndices(outer_dims) + .AddUniformVariables({{output_size}, {m}, {n}, {k}}); + + return context.RunProgram(program); + } + + int64_t batchA = a->Shape().SizeToDimension(a->Shape().NumDimensions() - 2); + int64_t batchB = b->Shape().SizeToDimension(b->Shape().NumDimensions() - 2); + + TensorShape a_shape = a->Shape(); + TensorShape b_shape = b->Shape(); + TensorShape output_shape = helper.OutputShape(); + + const int64_t dim_output_outer = output_shape[output_shape.NumDimensions() - 2]; + // check if A is batch of vector (bach is not 1, M is 1) and B is a matrix (batch is 1) + if (batchA != 1 && dim_output_outer == 1 && batchB == 1) { + // optimization for batched vector matrix multiplication + // dimensions of A: [1,`batchA`,K] + TensorShapeVector dims_a = {1, batchA, helper.K()}; + // dimensions of B: [1,K,N] + TensorShapeVector dims_b = {1, helper.K(), helper.N()}; + + a_shape = TensorShape(dims_a); + b_shape = TensorShape(dims_b); + output_shape = {1, batchA, helper.N()}; + } + + // helpful dimension variables + TensorShape outer_dims_a = a_shape.NumDimensions() > 2 + ? a_shape.Slice(0, a_shape.NumDimensions() - 2) + : TensorShape({}); + + TensorShape outer_dims_b = b_shape.NumDimensions() > 2 + ? b_shape.Slice(0, b_shape.NumDimensions() - 2) + : TensorShape({}); + + TensorShape outer_dims = output_shape.NumDimensions() > 2 + ? output_shape.Slice(0, output_shape.NumDimensions() - 2) + : TensorShape({}); + + const int64_t batch_size = outer_dims.Size(); + + // Get dimensions for matrix multiplication from TensorShape + const int32_t dim_a_outer = narrow(a_shape[a_shape.NumDimensions() - 2]); // left matrix second dimension + const int32_t dim_inner = narrow(a_shape[a_shape.NumDimensions() - 1]); // left matrix first dimension + const int32_t dim_b_outer = narrow(b_shape[b_shape.NumDimensions() - 1]); // right matrix first dimension + + const bool is_vec4 = dim_inner % 4 == 0 && dim_b_outer % 4 == 0; + + InlinedVector elements_per_thread = dim_a_outer <= 8 + ? 
InlinedVector({4, 1, 1}) + : InlinedVector({4, 4, 1}); + + const uint32_t dispatch_x = narrow((dim_b_outer + MATMUL_PACKED_WORKGROUP_SIZE_X * elements_per_thread[0] - 1) / + (MATMUL_PACKED_WORKGROUP_SIZE_X * elements_per_thread[0])); + const uint32_t dispatch_y = narrow((dim_a_outer + MATMUL_PACKED_WORKGROUP_SIZE_Y * elements_per_thread[1] - 1) / + (MATMUL_PACKED_WORKGROUP_SIZE_Y * elements_per_thread[1])); + const uint32_t dispatch_z = narrow((static_cast(batch_size) + MATMUL_PACKED_WORKGROUP_SIZE_Z * elements_per_thread[2] - 1) / + (MATMUL_PACKED_WORKGROUP_SIZE_Z * elements_per_thread[2])); + + const int components = is_vec4 ? 4 : 1; + const TensorShape a_shape_temp = CreateMatMulIntermediateShape(outer_dims_a, dim_a_outer, dim_inner, components); + const TensorShape b_shape_temp = CreateMatMulIntermediateShape(outer_dims_b, dim_inner, dim_b_outer, components); + const TensorShape output_shape_temp = TensorShape({batch_size, dim_a_outer, dim_b_outer / components}); + + MatMulProgram program{has_bias, is_vec4, elements_per_thread}; + program + .CacheHint(absl::StrJoin(elements_per_thread, "-"), std::to_string(is_vec4)) + .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, a_shape_temp, components}, + {b, ProgramTensorMetadataDependency::TypeAndRank, b_shape_temp, components}}) + .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::Rank, output_shape_temp, components}}) + .AddUniformVariables({{dim_a_outer}, {dim_b_outer}, {dim_inner}}) + .AddIndices(outer_dims) + .SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z) + .SetWorkgroupSize(MATMUL_PACKED_WORKGROUP_SIZE_X, MATMUL_PACKED_WORKGROUP_SIZE_Y, MATMUL_PACKED_WORKGROUP_SIZE_Z); + + if (has_bias) { + const auto* bias = context.Input(2); + program.AddInput({bias, ProgramTensorMetadataDependency::Rank, 1}); + } + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/math/matmul.h b/onnxruntime/core/providers/webgpu/math/matmul.h new file mode 100644 index 0000000000000..789e824383189 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/matmul.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
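// Aside (not part of the original patch): a standalone sketch of the dispatch sizing used by the
// packed MatMul path in matmul.cc above. Each workgroup covers workgroup_size * elements_per_thread
// output elements along a dimension, so the dispatch count is the ceiling division of the output
// extent by that coverage; elements_per_thread mirrors the {4, 1, 1} vs. {4, 4, 1} choice made for
// small vs. large dim_a_outer, and the 8x8x1 workgroup mirrors MATMUL_PACKED_WORKGROUP_SIZE_* below.
// All names in this sketch are illustrative only.
#include <cstdint>

namespace matmul_dispatch_sketch {

constexpr uint32_t kWgX = 8, kWgY = 8, kWgZ = 1;

constexpr uint32_t CeilDiv(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

struct Dispatch { uint32_t x, y, z; };

constexpr Dispatch ComputeDispatch(uint32_t dim_a_outer, uint32_t dim_b_outer, uint32_t batch_size) {
    const uint32_t ept_x = 4;                          // elements per thread along N
    const uint32_t ept_y = dim_a_outer <= 8 ? 1 : 4;   // elements per thread along M
    const uint32_t ept_z = 1;                          // elements per thread along the batch
    return {CeilDiv(dim_b_outer, kWgX * ept_x),
            CeilDiv(dim_a_outer, kWgY * ept_y),
            CeilDiv(batch_size, kWgZ * ept_z)};
}

// Example: a batched GEMM with M=100, N=64 and 3 batches is launched as (2, 4, 3) workgroups.
static_assert(ComputeDispatch(100, 64, 3).x == 2 &&
              ComputeDispatch(100, 64, 3).y == 4 &&
              ComputeDispatch(100, 64, 3).z == 3,
              "dispatch sketch sanity check");

}  // namespace matmul_dispatch_sketch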
+ +#pragma once + +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" +#include "core/providers/cpu/math/matmul_helper.h" +#include "core/providers/webgpu/math/matmul_utils.h" +#include "core/providers/webgpu/math/matmul_packed.h" +#include "core/providers/webgpu/webgpu_utils.h" + +namespace onnxruntime { +namespace webgpu { + +class MatMul final : public WebGpuKernel { + public: + MatMul(const OpKernelInfo& info) : WebGpuKernel{info} {} + + Status ComputeInternal(ComputeContext& context) const override; + + constexpr static uint32_t MATMUL_PACKED_WORKGROUP_SIZE_X = 8; + constexpr static uint32_t MATMUL_PACKED_WORKGROUP_SIZE_Y = 8; + constexpr static uint32_t MATMUL_PACKED_WORKGROUP_SIZE_Z = 1; +}; + +class MatMulNaiveProgram final : public Program { + public: + MatMulNaiveProgram(const size_t output_rank, int64_t output_number, bool has_bias) + : Program{"MatMulNaive"}, output_rank_(output_rank), output_number_(output_number), has_bias_{has_bias} { + } + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}, + {"M", ProgramUniformVariableDataType::Uint32}, + {"N", ProgramUniformVariableDataType::Uint32}, + {"K", ProgramUniformVariableDataType::Uint32}); + + private: + const size_t output_rank_; + const int64_t output_number_; + const bool has_bias_; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/math/matmul_packed.cc b/onnxruntime/core/providers/webgpu/math/matmul_packed.cc new file mode 100644 index 0000000000000..2e5cff923f442 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/matmul_packed.cc @@ -0,0 +1,303 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/webgpu/math/matmul_packed.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/webgpu_utils.h" + +namespace onnxruntime { +namespace webgpu { + +void MatMulProgram::MatMulReadWriteFnSource(ShaderHelper& shader, + const ShaderVariableHelper& a, + const ShaderVariableHelper& b, + const ShaderVariableHelper& output, + const ShaderIndicesHelper& batch_dims) const { + int components = is_vec4_ ? 
4 : 1; + const std::string data_type = "a_element_t"; + const std::string type_string = MakeScalarOrVectorType(components, data_type); + + // Add the mm_readA function + shader.AdditionalImplementation() + << "fn mm_readA(batch: i32, row: i32, colIn: i32, batch_indices: batch_dims_indices_t) -> " << type_string << " {\n" + << " var value = " << type_string << "(0.0);\n" + << " let col = colIn * " << components << ";\n" + << " if(row < uniforms.dim_a_outer && col < uniforms.dim_inner) {\n" + << " var a_indices: a_indices_t;\n" + << ConvertOutputBatchIndicesToInputBatchIndices("a", a, a.Rank() - 2, batch_dims.Rank(), "batch_indices") + << a.IndicesSet("a_indices", a.Rank() - 2, "u32(row)") << "\n" + << a.IndicesSet("a_indices", a.Rank() - 1, "u32(colIn)") << "\n" + << " value = " << a.GetByIndices("a_indices") << ";\n" + << " }\n" + << " return value;\n" + << "}\n\n"; + + // Add the mm_readB function + shader.AdditionalImplementation() + << "fn mm_readB(batch: i32, row: i32, colIn: i32, batch_indices: batch_dims_indices_t) -> " << type_string << " {\n" + << " var value = " << type_string << "(0.0);\n" + << " let col = colIn * " << components << ";\n" + << " if(row < uniforms.dim_inner && col < uniforms.dim_b_outer) {\n" + << " var b_indices: b_indices_t;\n" + << ConvertOutputBatchIndicesToInputBatchIndices("b", b, b.Rank() - 2, batch_dims.Rank(), "batch_indices") + << b.IndicesSet("b_indices", b.Rank() - 2, "u32(row)") << "\n" + << b.IndicesSet("b_indices", b.Rank() - 1, "u32(colIn)") << "\n" + << " value = " << b.GetByIndices("b_indices") << ";\n" + << " }\n" + << " return value;\n" + << "}\n\n"; + + // Add the mm_write function + shader.AdditionalImplementation() + << "fn mm_write(batch: i32, row: i32, colIn: i32, valueIn: " << type_string << ") {\n" + << " let col = colIn * " << components << ";\n" + << " if (row < uniforms.dim_a_outer && col < uniforms.dim_b_outer) {\n" + << " var value = valueIn;\n" + << " let coords = vec3(batch, row, colIn);\n"; + + if (has_bias_) { + shader.AdditionalImplementation() << " value = value + " << type_string << "(bias[row]);\n"; + } + + shader.AdditionalImplementation() + << output.SetByIndices("vec3(coords)", "value") << "\n" + << " }\n" + << "}\n\n"; +} + +Status MatMulProgram::MakeMatMulPackedVec4Source(ShaderHelper& shader, + const ShaderIndicesHelper& batch_dims, + const InlinedVector& elements_per_thread, + uint32_t workgroup_size_x, + uint32_t workgroup_size_y) { + // elements per thread + const auto elements_per_thread_x = elements_per_thread[0]; + const auto elements_per_thread_y = elements_per_thread[1]; + const decltype(elements_per_thread_x) tile_inner = 32; + + const auto tile_a_outer = workgroup_size_y * elements_per_thread_y; + const auto tile_b_outer = workgroup_size_x * elements_per_thread_x; + const auto tile_a_width = tile_inner; + + const auto tile_a_height = tile_a_outer; + const auto inner_elements_size = tile_a_width / workgroup_size_x; + const auto row_per_thread_b = tile_inner / workgroup_size_y; + + const std::string data_type = "a_element_t"; + + if (!((inner_elements_size == 3 || inner_elements_size == 4) && + tile_a_width % workgroup_size_x == 0 && + tile_inner % workgroup_size_y == 0 && + elements_per_thread_x == 4)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Invalid matrix multiplication configuration inner_elements_size: ", inner_elements_size, + " must be 3 or 4. tile_a_width: ", tile_a_width, " must be divisible by WorkgroupSizeX: ", + workgroup_size_x, ". 
tile_inner: ", tile_inner, " must be divisible by WorkgroupSizeY: ", + workgroup_size_y, ". elements_per_thread_x: ", elements_per_thread_x, " must be 4."); + } + + shader.AdditionalImplementation() + << "var mm_Asub: array, " << tile_a_width / inner_elements_size << ">, " << tile_a_height << ">;\n" + << "var mm_Bsub: array, " << tile_b_outer / elements_per_thread_x << ">, " << tile_inner << ">;\n" + << "const rowPerThread = " << elements_per_thread_y << ";\n" + << "const colPerThread = " << elements_per_thread_x << ";\n" + << "const innerElementSize = " << inner_elements_size << ";\n" + << "const tileInner = " << tile_inner << ";\n"; + + shader.MainFunctionBody() + << " let localRow = i32(local_id.y);\n" + << " let tileRow = localRow * rowPerThread;\n" + << " let tileCol = i32(local_id.x);\n" + << " let globalRow = i32(global_id.y) * rowPerThread;\n" + << " let globalCol = i32(global_id.x);\n" + << " let batch = i32(global_id.z);\n" + << " let batchIndices = " << batch_dims.OffsetToIndices("u32(batch)") << ";\n" + << " let globalRowStart = i32(workgroup_id.y) * " << tile_a_outer << ";\n" + << " let num_tiles = (uniforms.dim_inner - 1) / tileInner + 1;\n" + << " var kStart = 0;\n" + << " var acc: array, rowPerThread>;\n"; + + // Loop over shared dimension. + shader.MainFunctionBody() + << " let tileRowB = localRow * " << row_per_thread_b << ";\n" + << " for (var t = 0; t < num_tiles; t = t + 1) {\n"; + + // Load one tile of A into local memory. + shader.MainFunctionBody() + << " for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {\n" + << " let inputRow = tileRow + innerRow;\n" + << " let inputCol = tileCol;\n" + << " mm_Asub[inputRow][inputCol] = mm_readA(batch, globalRow + innerRow, kStart / innerElementSize + inputCol, batchIndices);\n" + << " }\n"; + + // Load one tile of B into local memory. + shader.MainFunctionBody() + << " for (var innerRow = 0; innerRow < " << row_per_thread_b << "; innerRow = innerRow + 1) {\n" + << " let inputRow = tileRowB + innerRow;\n" + << " let inputCol = tileCol;\n" + << " mm_Bsub[inputRow][inputCol] = mm_readB(batch, kStart + inputRow, globalCol, batchIndices);\n" + << " }\n" + << " kStart = kStart + tileInner;\n" + << " workgroupBarrier();\n"; + + // Compute acc values for a single thread. + shader.MainFunctionBody() + << " for (var k = 0; k < tileInner / innerElementSize; k = k + 1) {\n" + << " let BCached0 = mm_Bsub[k * innerElementSize][tileCol];\n" + << " let BCached1 = mm_Bsub[k * innerElementSize + 1][tileCol];\n" + << " let BCached2 = mm_Bsub[k * innerElementSize + 2][tileCol];\n"; + + if (inner_elements_size != 3) { + shader.MainFunctionBody() << " let BCached3 = mm_Bsub[k * innerElementSize + 3][tileCol];\n"; + } + + shader.MainFunctionBody() + << " for (var i = 0; i < rowPerThread; i = i + 1) {\n" + << " let ACached = mm_Asub[tileRow + i][k];\n" + << " acc[i] = BCached0 * ACached.x + acc[i];\n" + << " acc[i] = BCached1 * ACached.y + acc[i];\n" + << " acc[i] = BCached2 * ACached.z + acc[i];\n" + << " " << (inner_elements_size == 3 ? 
"" : "acc[i] = BCached3 * ACached.w + acc[i];") << "\n" + << " }\n"; + + shader.MainFunctionBody() << " workgroupBarrier();\n" + << " }\n"; // main for loop + + // Write the results to the output buffer + shader.MainFunctionBody() + << " for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {\n" + << " mm_write(batch, globalRow + innerRow, globalCol, acc[innerRow]);\n" + << " }\n" + << "}\n"; + + return Status::OK(); +} + +Status MatMulProgram::MakeMatMulPackedSource(ShaderHelper& shader, const ShaderIndicesHelper& batch_dims, + const InlinedVector& elements_per_thread, + uint32_t workgroup_size_x, + uint32_t workgroup_size_y) { + const auto elements_per_thread_x = elements_per_thread[0]; + const auto elements_per_thread_y = elements_per_thread[1]; + const decltype(elements_per_thread_x) tile_inner = 32; + + const auto tile_a_outer = workgroup_size_y * elements_per_thread_y; + const auto tile_b_outer = workgroup_size_x * elements_per_thread_x; + const auto tile_a_width = tile_inner; + const auto tile_a_height = tile_a_outer; + + if (!(tile_a_height % workgroup_size_y == 0 && tile_a_width % workgroup_size_x == 0 && tile_inner % workgroup_size_y == 0)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "tile_a_height: ", tile_a_height, " must be divisible by WorkgroupSizeY: ", workgroup_size_y, + ", tile_a_width: ", tile_a_width, " must be divisible by WorkgroupSizeX: ", workgroup_size_x, + ", tile_inner: ", tile_inner, " must be divisible by WorkgroupSizeY: ", workgroup_size_y); + } + + const std::string data_type = "a_element_t"; + + const auto row_per_thread_a = tile_a_height / workgroup_size_y; + const auto col_per_thread_a = tile_a_width / workgroup_size_x; + const auto row_per_thread_b = tile_inner / workgroup_size_y; + + shader.AdditionalImplementation() + << "var mm_Asub: array, " << tile_a_height << ">;\n" + << "var mm_Bsub: array, " << tile_inner << ">;\n" + << "const rowPerThread = " << elements_per_thread_y << ";\n" + << "const colPerThread = " << elements_per_thread_x << ";\n" + << "const tileInner = " << tile_inner << ";\n"; + + shader.MainFunctionBody() << " let batch = i32(global_id.z);\n" + << " let batchIndices = " << batch_dims.OffsetToIndices("u32(batch)") << ";\n" + << " let num_tiles = (uniforms.dim_inner - 1) / tileInner + 1;\n" + << " var kStart = 0;\n" + << " var acc: array, rowPerThread>;\n"; + + shader.MainFunctionBody() + << "let tileRow = i32(local_id.y) * rowPerThread;\n" + << "let tileCol = i32(local_id.x) * colPerThread;\n" + << "let globalRow = i32(global_id.y) * rowPerThread;\n" + << "let globalCol = i32(global_id.x) * colPerThread;\n" + << "let globalRowStart = i32(workgroup_id.y) * " << tile_a_outer << ";\n" + << "let tileRowA = i32(local_id.y) * " << row_per_thread_a << ";\n" + << "let tileColA = i32(local_id.x) * " << col_per_thread_a << ";\n" + << "let tileRowB = i32(local_id.y) * " << row_per_thread_b << ";\n"; + + // Loop over shared dimension. + shader.MainFunctionBody() + << "for (var t = 0; t < num_tiles; t = t + 1) {\n"; + + // Load one tile of A into local memory. 
+ shader.MainFunctionBody() + << " for (var innerRow = 0; innerRow < " << row_per_thread_a << "; innerRow = innerRow + 1) {\n" + << " for (var innerCol = 0; innerCol < " << col_per_thread_a << "; innerCol = innerCol + 1) {\n" + << " let inputRow = tileRowA + innerRow;\n" + << " let inputCol = tileColA + innerCol;\n" + << " mm_Asub[inputRow][inputCol] = mm_readA(batch, globalRowStart + inputRow, kStart + inputCol, batchIndices);\n" + << " }\n" + << " }\n"; + + // Load one tile of B into local memory. + shader.MainFunctionBody() + << " for (var innerRow = 0; innerRow < " << row_per_thread_b << "; innerRow = innerRow + 1) {\n" + << " for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {\n" + << " let inputRow = tileRowB + innerRow;\n" + << " let inputCol = tileCol + innerCol;\n" + << " mm_Bsub[inputRow][inputCol] = mm_readB(batch, kStart + inputRow, globalCol + innerCol, batchIndices);\n" + << " }\n" + << " }\n" + << " kStart = kStart + tileInner;\n" + << " workgroupBarrier();\n"; + + // Compute acc values for a single thread. + shader.MainFunctionBody() + << "var BCached: array<" << data_type << ", colPerThread>;\n" + << " for (var k = 0; k < tileInner; k = k + 1) {\n" + << " for (var inner = 0; inner < colPerThread; inner = inner + 1) {\n" + << " BCached[inner] = mm_Bsub[k][tileCol + inner];\n" + << " }\n" + << " for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {\n" + << " let ACached = mm_Asub[tileRow + innerRow][k];\n" + << " for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {\n" + << " acc[innerRow][innerCol] = acc[innerRow][innerCol] + ACached * BCached[innerCol];\n" + << " }\n" + << " }\n" + << " }\n" + << " workgroupBarrier();\n" + << "}\n"; + + // Write the results to the output buffer + shader.MainFunctionBody() + << "for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) {\n" + << " for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) {\n" + << " mm_write(batch, globalRow + innerRow, globalCol + innerCol, acc[innerRow][innerCol]);\n" + << " }\n" + << "}\n"; + + return Status::OK(); +} + +Status MatMulProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& a = shader.AddInput("a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& b = shader.AddInput("b", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& batch_dims = shader.AddIndices("batch_dims", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); + + if (has_bias_) { + shader.AddInput("bias", ShaderUsage::UseUniform); + } + + // declare the read and write functions + MatMulReadWriteFnSource(shader, a, b, output, batch_dims); + + // generate the main function + if (is_vec4_) { + ORT_RETURN_IF_ERROR(MakeMatMulPackedVec4Source(shader, batch_dims, elements_per_thread_, WorkgroupSizeX(), WorkgroupSizeY())); + } else { + ORT_RETURN_IF_ERROR(MakeMatMulPackedSource(shader, batch_dims, elements_per_thread_, WorkgroupSizeX(), WorkgroupSizeY())); + } + return Status::OK(); +} + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/math/matmul_packed.h b/onnxruntime/core/providers/webgpu/math/matmul_packed.h new file mode 100644 index 0000000000000..ea76468944066 --- /dev/null +++ 
b/onnxruntime/core/providers/webgpu/math/matmul_packed.h @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/math/matmul_utils.h" + +namespace onnxruntime { +namespace webgpu { +class MatMulProgram final : public Program { + public: + MatMulProgram(bool bias, bool is_vec4, const gsl::span& elements_per_thread) : Program{"MatMul"}, + has_bias_{bias}, + is_vec4_{is_vec4}, + elements_per_thread_(elements_per_thread.begin(), elements_per_thread.end()) {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"dim_a_outer", ProgramUniformVariableDataType::Int32}, + {"dim_b_outer", ProgramUniformVariableDataType::Int32}, + {"dim_inner", ProgramUniformVariableDataType::Int32}); + + static Status MakeMatMulPackedVec4Source(ShaderHelper& shader, + const ShaderIndicesHelper& batch_dims, + const InlinedVector& elements_per_thread, + uint32_t workgroup_size_x, + uint32_t workgroup_size_y); + static Status MakeMatMulPackedSource(ShaderHelper& shader, + const ShaderIndicesHelper& batch_dims, + const InlinedVector& elements_per_thread, + uint32_t workgroup_size_x, + uint32_t workgroup_size_y); + + private: + const bool has_bias_; + const bool is_vec4_; + const InlinedVector elements_per_thread_; + + void MatMulReadWriteFnSource(ShaderHelper& shader, const ShaderVariableHelper& a, const ShaderVariableHelper& b, const ShaderVariableHelper& output, const ShaderIndicesHelper& batch_dims) const; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/math/matmul_utils.h b/onnxruntime/core/providers/webgpu/math/matmul_utils.h new file mode 100644 index 0000000000000..bcd9c1b24a9bf --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/matmul_utils.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/inlined_containers.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/shader_helper.h" + +namespace onnxruntime { +namespace webgpu { + +// Helper that creates a new TensorShape for the intermediate result of MatMul +// The new shape is created by appending the two dimensions dim1 and dim2 / components to the original shape +inline TensorShape CreateMatMulIntermediateShape(const TensorShape& shape, const int64_t dim1, const int64_t dim2, const int components) { + TensorShapeVector shape_vec = shape.AsShapeVector(); + shape_vec.push_back(dim1); + shape_vec.push_back(dim2 / components); + return TensorShape(shape_vec); +} + +// Helper that convert output batch indices to input batch indices using only the rank and +// the shape information in uniform +inline std::string ConvertOutputBatchIndicesToInputBatchIndices(const std::string& name, const ShaderVariableHelper& input, int input_batch_rank, int output_batch_rank, const std::string& batch_indices) { + std::ostringstream oss; + const std::string input_shape = "uniforms." 
+ name + "_shape"; + const std::string input_indices = name + "_indices"; + int extending_input_rank = output_batch_rank - input_batch_rank; + for (int i = 0; i < input_batch_rank; ++i) { + oss << "if (" << GetElementAt(input_shape, i, input.Rank()) << " != 1) {\n" + << input.IndicesSet(input_indices, i, GetElementAt(batch_indices, i + extending_input_rank, output_batch_rank)) << "\n" + << "} else {\n" + << input.IndicesSet(input_indices, i, 0) << "\n" + << "}\n"; + } + return oss.str(); +} + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc index d06fc5a57eb8c..6a6cfe154b91c 100644 --- a/onnxruntime/core/providers/webgpu/math/softmax.cc +++ b/onnxruntime/core/providers/webgpu/math/softmax.cc @@ -11,6 +11,7 @@ #include "core/providers/webgpu/shader_variable.h" #include "core/providers/webgpu/shader_helper.h" #include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/webgpu_utils.h" namespace onnxruntime { namespace webgpu { @@ -56,28 +57,6 @@ static std::string MaxVector(const std::string& name, int components) { } } -static std::string SumVector(const std::string& x, int components) { - switch (components) { - case 1: - return x; - case 2: - return "(" + x + ".x + " + x + ".y" + ")"; - case 4: - return "(" + x + ".x + " + x + ".y + " + x + ".w + " + x + ".z" + ")"; - default: - ORT_THROW("Unsupported number of components: ", components); - } -} - -static int GetMaxComponents(int64_t size) { - if (size % 4 == 0) { - return 4; - } else if (size % 2 == 0) { - return 2; - } - return 1; -} - Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const { // Add input and output variables const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); diff --git a/onnxruntime/core/providers/webgpu/nn/layer_norm.cc b/onnxruntime/core/providers/webgpu/nn/layer_norm.cc index 28ad686909a47..cf2939555057a 100644 --- a/onnxruntime/core/providers/webgpu/nn/layer_norm.cc +++ b/onnxruntime/core/providers/webgpu/nn/layer_norm.cc @@ -4,20 +4,12 @@ #include "core/providers/webgpu/shader_helper.h" #include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/webgpu_utils.h" #include "core/providers/webgpu/nn/layer_norm.h" namespace onnxruntime { namespace webgpu { -static int GetMaxComponents(int64_t size) { - if (size % 4 == 0) { - return 4; - } else if (size % 2 == 0) { - return 2; - } - return 1; -} - static size_t NormalizeAxis(int64_t axis, size_t tensor_rank) { int64_t rank = static_cast(tensor_rank); if (axis < -rank && axis >= rank) { @@ -26,19 +18,6 @@ static size_t NormalizeAxis(int64_t axis, size_t tensor_rank) { return onnxruntime::narrow(axis < 0 ? 
axis + rank : axis); } -static std::string SumVector(std::string x, int components) { - switch (components) { - case 1: - return x; - case 2: - return "(" + x + ".x + " + x + ".y" + ")"; - case 4: - return "(" + x + ".x + " + x + ".y + " + x + ".w + " + x + ".z" + ")"; - default: - ORT_THROW("Unsupported number of components: ", components); - } -} - Status LayerNormProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& x = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); shader.AddInput("scale", ShaderUsage::UseUniform); diff --git a/onnxruntime/core/providers/webgpu/nn/pool.cc b/onnxruntime/core/providers/webgpu/nn/pool.cc new file mode 100644 index 0000000000000..79072a1dbaba8 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/nn/pool.cc @@ -0,0 +1,254 @@ + +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/string_macros.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/nn/pool.h" + +#include + +namespace onnxruntime { +namespace webgpu { + +namespace { + +std::vector NarrowToU32(const TensorShapeVector& shape) { + std::vector result; + result.reserve(shape.size()); + for (auto dim : shape) { + result.push_back(static_cast(dim)); + } + return result; +} + +} // namespace + +#define POOLING_KERNEL(op_name, domain, is_nhwc, pool_type, since_version) \ + ONNX_OPERATOR_KERNEL_EX(op_name, domain, since_version, kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), \ + Pool); + +#define POOLING_KERNEL_VERSIONED(op_name, domain, is_nhwc, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX(op_name, domain, since_version, end_version, kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), \ + Pool); + +#define POOLING_KERNEL_WITH_INDICES(op_name, domain, is_nhwc, pool_type, since_version) \ + ONNX_OPERATOR_KERNEL_EX(op_name, domain, since_version, kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", WebGpuSupportedFloatTypes()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); + +#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, domain, is_nhwc, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX(op_name, domain, since_version, end_version, kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", WebGpuSupportedFloatTypes()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); + +POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 7, 9) +POOLING_KERNEL_VERSIONED(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 7, 9) +POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 10, 10) +POOLING_KERNEL_VERSIONED(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 10, 10) +POOLING_KERNEL(AveragePool, kOnnxDomain, false, AveragePool, 11) +POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 11) +POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, AveragePool, 1) +POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, AveragePool, 1) + +POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, MaxPool<1>, 1, 7) +POOLING_KERNEL_VERSIONED(MaxPool, kMSInternalNHWCDomain, true, MaxPool<1>, 1, 7) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, 
MaxPool<8>, 8, 9) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 8, 9) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 11, 11) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 11, 11) +POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 12) +POOLING_KERNEL_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 12) +POOLING_KERNEL(GlobalMaxPool, kOnnxDomain, false, MaxPool<1>, 1) +POOLING_KERNEL(GlobalMaxPool, kMSInternalNHWCDomain, true, MaxPool<1>, 1) + +Status PoolProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& input = shader.AddInput("input", ShaderUsage::UseUniform); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform); + + // Declare and initialize the variables needed. + std::string var_decl_code; + // Process each element in the pooling window. + std::string sampling_code; + // Calculate the output value for each pooling window. + std::string downsampling_code; + + constexpr const size_t kStringInitialSize = 128; + if (is_max_pool_) { + std::string f16_min = "f16(-65504)"; + + SS(f32_min_ss, kStringInitialSize); + f32_min_ss << "f32(" << std::numeric_limits::lowest() << ")"; + std::string f32_min = SS_GET(f32_min_ss); + + SS(var_decl_ss, kStringInitialSize); + var_decl_ss << " var value = " << (is_float16_ ? f16_min : f32_min) << ";\n"; + var_decl_code = SS_GET(var_decl_ss); + + sampling_code = " value = max(value, x_val);\n"; + } else { + SS(var_decl_ss, kStringInitialSize); + var_decl_ss << " var value = " << (is_float16_ ? "f16(0)" : "f32(0)") << ";\n"; + if (!count_include_pad_) { + var_decl_ss << " var count = u32(0);\n"; + } else { + var_decl_ss << " var count = uniforms.kernel_size;\n"; + } + var_decl_code = SS_GET(var_decl_ss); + + SS(sampling_ss, kStringInitialSize); + sampling_ss << " value += x_val;\n"; + if (!count_include_pad_) { + sampling_ss << " count++;\n"; + } + sampling_code = SS_GET(sampling_ss); + + SS(downsampling_ss, kStringInitialSize); + downsampling_ss << " value /= " << (is_float16_ ? "f16" : "f32") << "(count);\n"; + downsampling_code = SS_GET(downsampling_ss); + } + + const auto kernel_rank = kernel_shape_.size(); + const auto pads_rank = kernel_shape_.size() * 2; + // The dimension index for H or D1 + const auto data_dim_begin = is_nhwc_ ? 1 : 2; + // The dimension index after W or Dn + auto data_dim_end = input.Rank(); + data_dim_end = is_nhwc_ ? data_dim_end - 1 : data_dim_end; + + auto& body = shader.MainFunctionBody(); + body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << " let y_indices = " << output.OffsetToIndices("global_idx") << ";\n" + << " var x_indices = y_indices;\n" + << " var k_indices: array;\n" + << var_decl_code + << " for (var i: u32 = 0; i < uniforms.kernel_size; i++) {\n" + << " var offset = i;\n" + // ---- Compute offset to indices in pooling window. + << " for (var j = 0; j < " << kernel_rank << "; j++) {\n" + << " k_indices[j] = offset / " << GetElementAt("uniforms.kernel_strides", "j", kernel_rank) << ";\n" + << " offset = offset % " << GetElementAt("uniforms.kernel_strides", "j", kernel_rank) << ";\n" + << " }\n" + // ---- Apply dilations in pooling window. 
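 // (At this point k_indices holds the window-relative coordinates decoded from the flattened offset
 // via uniforms.kernel_strides; scaling each coordinate by uniforms.dilations turns it into an
 // element offset in the input's spatial dimensions.)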
+ << " for (var j = 0; j < " << kernel_rank << "; j++) {\n" + << " k_indices[j] *= " << GetElementAt("uniforms.dilations", "j", kernel_rank) << ";\n" + << " }\n" + << " var is_pad = false;\n" + // ---- Compute x_indices in each data dimension + << " for (var j = " << data_dim_begin << "; j < " << data_dim_end << "; j++) {\n" + << " let d_idx = j - " << data_dim_begin << ";\n" + << " x_indices[j] = y_indices[j] * " << GetElementAt("uniforms.strides", "d_idx", kernel_rank) << ";\n" + << " x_indices[j] += k_indices[d_idx];\n" + << " x_indices[j] -= " << GetElementAt("uniforms.pads", "d_idx", pads_rank) << ";\n" + << " let j_dim_len = " << input.IndicesGet("uniforms.input_shape", "j") << ";\n" + // ------ Check if x_indices[j] is out of bounds to handle padding. + << " if (x_indices[j] < 0 || x_indices[j] >= j_dim_len) {\n" + << " is_pad = true;\n" + << " break;\n" + << " }\n" + << " }\n" + << " if (!is_pad) {\n" + << " let x_val = " << input.GetByIndices("x_indices") << ";\n" + << sampling_code + << " }\n" + << " }\n" + << downsampling_code + << " " << output.SetByOffset("global_idx", "value") << ";\n"; + + return Status::OK(); +} + +template +Status Pool::ComputeInternal(ComputeContext& context) const { + // TODO: support 'ceil' mode. + ORT_RETURN_IF_NOT(pool_attrs_.ceil_mode == 0, "Using ceil is not supported yet."); + // TODO: support 'column major' storage_order. + ORT_RETURN_IF_NOT(pool_attrs_.storage_order == 0, "Using column major storage_order is not supported yet."); + + // TODO: support 'Indices' output. + ORT_RETURN_IF_NOT(context.OutputCount() == 1, "The Indices output is not supported yet."); + + const auto* X = context.Input(0); + const TensorShape& x_shape = X->Shape(); + const auto input_shape = x_shape.AsShapeVector(); + ORT_RETURN_IF_NOT(input_shape.size() >= 3, "Input dimension cannot be less than 3."); + + auto kernel_shape = pool_attrs_.kernel_shape; + auto strides = pool_attrs_.strides; + auto pads = pool_attrs_.pads; + auto dilations = pool_attrs_.dilations; + // Global pooling is equivalent to having the kernel size equal to the spatial dimension of input tensor. + if (pool_attrs_.global_pooling) { + if (!is_nhwc) { + kernel_shape.assign(input_shape.begin() + 2, input_shape.end()); + } else { + kernel_shape.assign(input_shape.begin() + 1, input_shape.end() - 1); + } + // No padding. + pads.assign(2 * kernel_shape.size(), 0); + // Stride of 1. + strides.assign(kernel_shape.size(), 1); + // Dilation of 1. + dilations.assign(kernel_shape.size(), 1); + } + + // Calculate the output shape + const auto out_channel = x_shape[is_nhwc ? input_shape.size() - 1 : 1]; + const auto output_shape = pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, is_nhwc); + Tensor* Y = context.Output(0, output_shape); + + std::vector kernel_strides(kernel_shape.size()); + ORT_ENFORCE(kernel_shape.size() > 0, "kernel_shape must have at least one element."); + // Calculate the kernel element strides for each dimension in reverse order. 
For example: + // kernel_shape = [3, 2], kernel_strides = [2, 1] + // kernel_shape = [2, 3, 2], kernel_strides = [6, 2, 1] + for (size_t i = kernel_shape.size(); i > 0; --i) { + if (i == kernel_shape.size()) { + kernel_strides[i - 1] = 1; + } else { + kernel_strides[i - 1] = kernel_strides[i] * gsl::narrow_cast(kernel_shape[i]); + } + } + + bool is_max_pool = false; + if constexpr (PoolType::type == onnxruntime::PoolType::kMaxPool) { + is_max_pool = true; + } else if constexpr (PoolType::type != onnxruntime::PoolType::kAveragePool) { + ORT_NOT_IMPLEMENTED("Unsupported PoolType."); + } + bool is_float16 = X->GetElementType() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16; + bool count_include_pad = pool_attrs_.count_include_pad; + PoolProgram program{is_max_pool, is_nhwc, kernel_shape, is_float16, count_include_pad}; + + // Number of elements + uint32_t output_size = gsl::narrow_cast(Y->Shape().Size()); + uint32_t kernel_size = gsl::narrow_cast(TensorShape{kernel_shape}.Size()); + + const auto pads_u32 = NarrowToU32(pads); + const auto strides_u32 = NarrowToU32(strides); + const auto dilations_u32 = NarrowToU32(dilations); + + program.CacheHint(kernel_shape.size(), is_max_pool, is_nhwc, is_float16, count_include_pad) + .AddInputs({{X, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutputs({{Y}}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({output_size, kernel_size, + gsl::span(kernel_strides.data(), kernel_strides.size()), + gsl::span(pads_u32.data(), pads_u32.size()), + gsl::span(strides_u32.data(), strides_u32.size()), + gsl::span(dilations_u32.data(), dilations_u32.size())}); + + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/nn/pool.h b/onnxruntime/core/providers/webgpu/nn/pool.h new file mode 100644 index 0000000000000..c1716542e5549 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/nn/pool.h @@ -0,0 +1,53 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/common.h" +#include "core/providers/cpu/nn/pool_base.h" + +namespace onnxruntime { +namespace webgpu { + +class PoolProgram final : public Program { + public: + PoolProgram(bool is_max_pool, bool is_nhwc, const TensorShapeVector& kernel_shape, bool is_float16, + bool count_include_pad) + : Program{"Pool"}, + is_max_pool_{is_max_pool}, + is_nhwc_{is_nhwc}, + kernel_shape_{kernel_shape}, + is_float16_{is_float16}, + count_include_pad_{count_include_pad} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}, + {"kernel_size", ProgramUniformVariableDataType::Uint32}, + {"kernel_strides", ProgramUniformVariableDataType::Uint32}, + {"pads", ProgramUniformVariableDataType::Uint32}, + {"strides", ProgramUniformVariableDataType::Uint32}, + {"dilations", ProgramUniformVariableDataType::Uint32}); + + private: + // Whether it is max pool or average pool. 
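(The flag described by the comment above, `is_max_pool_`, and the remaining members follow immediately below.) As an aside on the arithmetic this program encodes: for each output element the generated shader walks the flattened kernel window and maps every tap back into the input as `y * stride + k * dilation - pad`, treating out-of-range taps as padding; average pooling then divides by either the full window size (when `count_include_pad` is set) or only the number of valid taps. A minimal host-side 1-D reference of that per-element computation, standalone and purely illustrative (none of these names come from the PR):

#include <cstdint>
#include <cstdio>
#include <vector>

// 1-D reference of the value computed per output element by the generated
// shader: walk the kernel window, map each tap to an input index
// (y * stride + k * dilation - pad), treat out-of-range taps as padding,
// and average over either the full window or only the valid taps.
float AveragePool1D(const std::vector<float>& x, int64_t y, int64_t kernel,
                    int64_t stride, int64_t dilation, int64_t pad,
                    bool count_include_pad) {
  float value = 0.0f;
  int64_t count = count_include_pad ? kernel : 0;
  for (int64_t k = 0; k < kernel; ++k) {
    const int64_t idx = y * stride + k * dilation - pad;
    if (idx < 0 || idx >= static_cast<int64_t>(x.size())) {
      continue;  // padded tap: contributes nothing
    }
    value += x[static_cast<size_t>(idx)];
    if (!count_include_pad) {
      ++count;
    }
  }
  return count > 0 ? value / static_cast<float>(count) : 0.0f;
}

int main() {
  std::vector<float> x{1.f, 2.f, 3.f, 4.f};
  // kernel=3, stride=1, pad=1: output element 0 covers {pad, x[0], x[1]}.
  std::printf("%f %f\n",
              AveragePool1D(x, 0, 3, 1, 1, 1, false),  // (1+2)/2 = 1.5
              AveragePool1D(x, 0, 3, 1, 1, 1, true));  // (1+2)/3 = 1.0
  return 0;
}

The `kernel_strides`, `pads`, `strides` and `dilations` uniforms declared above feed exactly this index arithmetic in the real shader.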
+ const bool is_max_pool_; + + const bool is_nhwc_; + const TensorShapeVector kernel_shape_; + const bool is_float16_; + const bool count_include_pad_; +}; + +template +class Pool : public WebGpuKernel, public PoolBase { + public: + explicit Pool(const OpKernelInfo& info) : WebGpuKernel(info), PoolBase(info) {} + + Status ComputeInternal(ComputeContext& context) const override; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc index 11fa30c798809..8f2619b6cb2b6 100644 --- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc @@ -247,7 +247,13 @@ Status ReduceKernel::ComputeInternal(ComputeContext& context) program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}); } - program.CacheHint(is_input_empty) + // TODO: the ReduceKernel class is designed to use `keepdims_`, `noop_with_empty_axes_` and input axes as uniform variables, + // but the current implementation does not work without them in cache key. + // This is a temporary workaround to make it work. We should fix this in the future. + program.CacheHint(keepdims_, + noop_with_empty_axes_, + select_last_index_, + absl::StrJoin(input_axes, ",")) .AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank}) .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({{static_cast(output_size)}, diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 82ba00f8d09e7..9234aab58dd99 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -768,7 +768,7 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co auto it = contexts_.find(context_id); if (it == contexts_.end()) { GSL_SUPPRESS(r.11) - auto context = std::unique_ptr(new WebGpuContext(instance, device, config.validation_mode)); + auto context = std::unique_ptr(new WebGpuContext(instance, device, config.validation_mode, config.preserve_device)); it = contexts_.emplace(context_id, WebGpuContextFactory::WebGpuContextInfo{std::move(context), 0}).first; } else if (context_id != 0) { ORT_ENFORCE(it->second.context->instance_.Get() == instance && @@ -794,7 +794,7 @@ void WebGpuContextFactory::ReleaseContext(int context_id) { auto it = contexts_.find(context_id); ORT_ENFORCE(it != contexts_.end(), "WebGPU EP context ID ", context_id, " is not found."); - if (--it->second.ref_count == 0) { + if (--it->second.ref_count == 0 && !it->second.context->preserve_device_) { contexts_.erase(it); } } diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h index a221bbcec95c3..96f5a0cebc5c1 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.h +++ b/onnxruntime/core/providers/webgpu/webgpu_context.h @@ -32,6 +32,7 @@ struct WebGpuContextConfig { WGPUDevice device; const void* dawn_proc_table; ValidationMode validation_mode; + bool preserve_device; }; struct WebGpuBufferCacheConfig { @@ -152,8 +153,8 @@ class WebGpuContext final { AtPasses }; - WebGpuContext(WGPUInstance instance, WGPUDevice device, webgpu::ValidationMode validation_mode) - : instance_{instance}, device_{device}, validation_mode_{validation_mode}, query_type_{TimestampQueryType::None} {} + 
WebGpuContext(WGPUInstance instance, WGPUDevice device, webgpu::ValidationMode validation_mode, bool preserve_device) + : instance_{instance}, device_{device}, validation_mode_{validation_mode}, query_type_{TimestampQueryType::None}, preserve_device_{preserve_device} {} ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WebGpuContext); std::vector GetEnabledAdapterToggles() const; @@ -229,6 +230,7 @@ class WebGpuContext final { uint64_t gpu_timestamp_offset_ = 0; bool is_profiling_ = false; + bool preserve_device_; #if defined(ENABLE_PIX_FOR_WEBGPU_EP) std::unique_ptr pix_frame_generator_ = nullptr; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index aacbcc5fb4f0a..15166df54e40c 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -593,36 +593,36 @@ std::unique_ptr RegisterKernels() { // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 1d779152f91f3..d6812b2d0704d 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -143,12 +143,25 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( } } + std::string preserve_device_str; + bool preserve_device = false; + if (config_options.TryGetConfigEntry(kPreserveDevice, preserve_device_str)) { + if (preserve_device_str == kPreserveDevice_ON) { + preserve_device = true; + } else if (preserve_device_str == kPreserveDevice_OFF) { + preserve_device = false; + } else { + ORT_THROW("Invalid preserve device: ", preserve_device_str); + } + } + webgpu::WebGpuContextConfig context_config{ context_id, reinterpret_cast(webgpu_instance), reinterpret_cast(webgpu_device), reinterpret_cast(dawn_proc_table), validation_mode, + preserve_device, }; LOGS_DEFAULT(VERBOSE) << "WebGPU EP Device ID: " << context_id; @@ -156,6 +169,7 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( LOGS_DEFAULT(VERBOSE) << "WebGPU EP WGPUDevice: " << webgpu_device; 
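The `preserve_device` flag read here changes `WebGpuContextFactory::ReleaseContext` (above) so the shared `WebGpuContext`, and with it the `WGPUDevice`, outlives the last session that referenced it instead of being destroyed and re-created on the next session. Assuming the option is surfaced to applications as a session config entry under the `WebGPU:preserveDevice` key defined in `webgpu_provider_options.h` (an assumption about the public wiring, not something this diff shows), usage could look roughly like:

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  // Assumption: WebGPU EP options are read from session config entries using
  // the keys in webgpu_provider_options.h ("WebGPU:preserveDevice" = "1"/"0").
  so.AddConfigEntry("WebGPU:preserveDevice", "1");
  so.AppendExecutionProvider("WebGPU");
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // hypothetical model path
  return 0;
}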
LOGS_DEFAULT(VERBOSE) << "WebGPU EP DawnProcTable: " << dawn_proc_table; LOGS_DEFAULT(VERBOSE) << "WebGPU EP ValidationMode: " << validation_mode; + LOGS_DEFAULT(VERBOSE) << "WebGPU EP PreserveDevice: " << preserve_device; // // STEP.3 - prepare parameters for WebGPU context initialization. diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h index 1cf316f4304e5..fcfd6774f8ab8 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h @@ -30,6 +30,8 @@ constexpr const char* kValidationMode = "WebGPU:validationMode"; constexpr const char* kForceCpuNodeNames = "WebGPU:forceCpuNodeNames"; constexpr const char* kEnablePIXCapture = "WebGPU:enablePIXCapture"; +constexpr const char* kPreserveDevice = "WebGPU:preserveDevice"; + // The following are the possible values for the provider options. constexpr const char* kDawnBackendType_D3D12 = "D3D12"; @@ -44,6 +46,9 @@ constexpr const char* kEnableGraphCapture_OFF = "0"; constexpr const char* kEnablePIXCapture_ON = "1"; constexpr const char* kEnablePIXCapture_OFF = "0"; +constexpr const char* kPreserveDevice_ON = "1"; +constexpr const char* kPreserveDevice_OFF = "0"; + constexpr const char* kBufferCacheMode_Disabled = "disabled"; constexpr const char* kBufferCacheMode_LazyRelease = "lazyRelease"; constexpr const char* kBufferCacheMode_Simple = "simple"; diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.h b/onnxruntime/core/providers/webgpu/webgpu_utils.h index 4f9018646905d..5f6f18f34b7f5 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.h +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.h @@ -7,7 +7,7 @@ namespace onnxruntime { namespace webgpu { -inline int64_t GetMaxComponents(int64_t size) { +inline int GetMaxComponents(int64_t size) { if (size % 4 == 0) { return 4; } else if (size % 2 == 0) { @@ -16,5 +16,33 @@ inline int64_t GetMaxComponents(int64_t size) { return 1; } +inline std::string SumVector(std::string x, int components) { + switch (components) { + case 1: + return x; + case 2: + return "(" + x + ".x + " + x + ".y" + ")"; + case 4: + return "(" + x + ".x + " + x + ".y + " + x + ".z + " + x + ".w" + ")"; + default: + ORT_THROW("Unsupported number of components: ", components); + } +} + +inline std::string MakeScalarOrVectorType(int components, std::string_view data_type) { + switch (components) { + case 1: + return std::string{data_type}; + case 2: + return MakeStringWithClassicLocale("vec2<", data_type, ">"); + case 3: + return MakeStringWithClassicLocale("vec3<", data_type, ">"); + case 4: + return MakeStringWithClassicLocale("vec4<", data_type, ">"); + default: + ORT_THROW("Unsupported number of components: ", components); + } +} + } // namespace webgpu -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/helper.cc b/onnxruntime/core/providers/webnn/builders/helper.cc index 47f65cd0b8e85..e0d82b26c2174 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.cc +++ b/onnxruntime/core/providers/webnn/builders/helper.cc @@ -58,12 +58,12 @@ bool GetShape(const NodeArg& node_arg, std::vector& shape, const loggin return true; } -bool IsNodeSupported(const Node& node, const GraphViewer& graph_viewer, const WebnnDeviceType device_type, +bool IsNodeSupported(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType device_type, const emscripten::val& 
wnn_limits, const logging::Logger& logger) { const auto& op_builders = GetOpBuilders(); if (Contains(op_builders, node.OpType())) { const auto* op_builder = op_builders.at(node.OpType()); - return op_builder->IsOpSupported(graph_viewer.GetAllInitializedTensors(), node, device_type, wnn_limits, logger); + return op_builder->IsOpSupported(graph_viewer, node, device_type, wnn_limits, logger); } else { return false; } @@ -107,7 +107,7 @@ std::unordered_set GetSupportedNodes(const GraphViewer& graph_viewe std::unordered_set supported_nodes; for (const auto& node : graph_viewer.Nodes()) { - const bool supported = IsNodeSupported(node, graph_viewer, device_type, wnn_limits, logger); + const bool supported = IsNodeSupported(graph_viewer, node, device_type, wnn_limits, logger); LOGS(logger, VERBOSE) << "Operator type: [" << node.OpType() << "] index: [" << node.Index() << "] name: [" << node.Name() diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 9a1566ccdc99a..95c4b79053a1f 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -171,12 +171,13 @@ inline bool ReadScalarTensorData(const onnx::TensorProto& tensor, emscripten::va return true; } -inline bool IsEmptyTensor(const InitializedTensorSet& initializers, const std::string& name) { - if (name.empty() || !Contains(initializers, name)) { +inline bool IsEmptyTensor(const GraphViewer& graph_viewer, const std::string& name) { + const auto* tensor_init = graph_viewer.GetConstantInitializer(name); + if (name.empty() || !tensor_init) { return true; } - const auto& tensor = *initializers.at(name); + const auto& tensor = *tensor_init; const auto dims = tensor.dims(); // An empty tensor contains a 0 in the dimensions list. return std::any_of(dims.begin(), dims.end(), [](auto d) { return d == 0; }); diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 08580ab2861d7..fc630af8cf1e3 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -20,7 +20,7 @@ class ArgMaxMinOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -66,7 +66,7 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. 
-bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const GraphViewer& /* initializers */, const Node& node, WebnnDeviceType device_type, const logging::Logger& logger) const { diff --git a/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.cc index 4504a54a379f3..d5683454c89b7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.cc @@ -17,7 +17,7 @@ namespace webnn { Status BaseOpBuilder::AddToModelBuilder(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { ORT_RETURN_IF_NOT( - IsOpSupported(model_builder.GetInitializerTensors(), node, model_builder.GetWebnnDeviceType(), + IsOpSupported(model_builder.GetGraphViewer(), node, model_builder.GetWebnnDeviceType(), model_builder.GetOpSupportLimits(), logger), "Unsupported operator ", node.OpType()); ORT_RETURN_IF_ERROR(AddToModelBuilderImpl(model_builder, node, logger)); @@ -26,10 +26,10 @@ Status BaseOpBuilder::AddToModelBuilder(ModelBuilder& model_builder, const Node& // Operator support related. -bool BaseOpBuilder::IsOpSupported(const InitializedTensorSet& initializers, const Node& node, +bool BaseOpBuilder::IsOpSupported(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType device_type, const emscripten::val& wnn_limits, const logging::Logger& logger) const { - if (!HasSupportedInputs(initializers, node, wnn_limits, logger)) + if (!HasSupportedInputs(graph_viewer, node, wnn_limits, logger)) return false; if (!HasSupportedOutputs(node, wnn_limits, logger)) @@ -38,11 +38,11 @@ bool BaseOpBuilder::IsOpSupported(const InitializedTensorSet& initializers, cons if (!HasSupportedOpSet(node, logger)) return false; - return IsOpSupportedImpl(initializers, node, device_type, logger); + return IsOpSupportedImpl(graph_viewer, node, device_type, logger); } -bool BaseOpBuilder::HasSupportedInputs(const InitializedTensorSet& initializers, const Node& node, const emscripten::val& wnn_limits, - const logging::Logger& logger) const { +bool BaseOpBuilder::HasSupportedInputs(const GraphViewer& graph_viewer, const Node& node, + const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto node_name = MakeString("Node [", node.Name(), "] type [", node.OpType(), "]"); for (const auto* input : node.InputDefs()) { if (!IsTensorShapeSupported(*input, node_name, logger, allow_empty_tensor_as_input_)) { @@ -50,10 +50,10 @@ bool BaseOpBuilder::HasSupportedInputs(const InitializedTensorSet& initializers, } } - return HasSupportedInputsImpl(initializers, node, wnn_limits, logger); + return HasSupportedInputsImpl(graph_viewer, node, wnn_limits, logger); } -bool BaseOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& initializers, const Node& node, +bool BaseOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { // We only check the type of input 0 by default, specific op builder can override this. 
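The point of threading `GraphViewer` through these interfaces instead of an `InitializedTensorSet` is semantic, not cosmetic: `GraphViewer::GetConstantInitializer()` only returns a tensor whose value is genuinely fixed, whereas a bare membership check against the initializer set also accepts initializers that a graph input of the same name may override at run time. A toy model of that distinction (illustrative only; the real types are `GraphViewer` and `ONNX_NAMESPACE::TensorProto`):

#include <cstdio>
#include <set>
#include <string>

// Toy model of the distinction the refactor relies on: an initializer listed
// in the graph may still be overridden by a graph input of the same name, so
// "is an initializer" is weaker than "is a constant".
struct ToyGraph {
  std::set<std::string> initializers;        // everything with initializer data
  std::set<std::string> overridable_inputs;  // graph inputs that shadow initializers

  // Mirrors the intent of GraphViewer::GetConstantInitializer(): only true
  // constants qualify.
  bool IsConstantInitializer(const std::string& name) const {
    return initializers.count(name) != 0 && overridable_inputs.count(name) == 0;
  }
};

int main() {
  ToyGraph g;
  g.initializers = {"axes", "shape"};
  g.overridable_inputs = {"shape"};  // has a default value but can be fed at run time

  // The old membership check would accept both names; the new check rejects
  // "shape" because its value is not actually fixed at compile time.
  std::printf("axes: %d, shape: %d\n",
              g.IsConstantInitializer("axes"),
              g.IsConstantInitializer("shape"));
  return 0;
}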
diff --git a/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.h b/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.h index 0a4367a71add4..b794ff6a63a6c 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.h +++ b/onnxruntime/core/providers/webnn/builders/impl/base_op_builder.h @@ -30,17 +30,17 @@ class BaseOpBuilder : public IOpBuilder { // Operator support related. public: - bool IsOpSupported(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupported(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType device_type, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; protected: - virtual bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& /* node */, + virtual bool IsOpSupportedImpl(const GraphViewer& /* graph_viewer */, const Node& /* node */, const WebnnDeviceType /* device_type */, const logging::Logger& /* logger */) const { return true; } - virtual bool HasSupportedInputsImpl(const InitializedTensorSet& initializers, const Node& node, const emscripten::val& wnn_limits, + virtual bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const; virtual bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const; @@ -56,7 +56,7 @@ class BaseOpBuilder : public IOpBuilder { private: bool HasSupportedOpSet(const Node& node, const logging::Logger& logger) const; - bool HasSupportedInputs(const InitializedTensorSet& initializers, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const; + bool HasSupportedInputs(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const; bool HasSupportedOutputs(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const; const bool allow_empty_tensor_as_input_; // Some operators can handle ignoring an empty tensor as input. diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index c17ad0c89bd9d..29d02690d17c8 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -20,7 +20,7 @@ class BinaryOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. 
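Before the per-operator overrides that follow, it may help to see the shape of the dispatch in `BaseOpBuilder` as a toy: the public `IsOpSupported()` runs the generic input/output/opset checks and then defers to virtual `*Impl` hooks, which the concrete builders below override with the new `GraphViewer`-based signatures. A self-contained sketch of that layering (stand-in types, not ORT code):

#include <cstdio>
#include <string>

// Stand-in for the GraphViewer-based queries the real hooks receive.
struct ToyGraphView {
  bool has_constant(const std::string&) const { return true; }
};

class ToyOpBuilder {
 public:
  bool IsOpSupported(const ToyGraphView& graph, const std::string& op_type) const {
    if (!HasSupportedInputsImpl(graph, op_type)) return false;  // generic, overridable input checks
    return IsOpSupportedImpl(graph, op_type);                   // op-specific constraints
  }

 protected:
  virtual bool HasSupportedInputsImpl(const ToyGraphView&, const std::string&) const { return true; }
  virtual bool IsOpSupportedImpl(const ToyGraphView&, const std::string&) const { return true; }
  virtual ~ToyOpBuilder() = default;
};

class ToyClipBuilder : public ToyOpBuilder {
 protected:
  bool IsOpSupportedImpl(const ToyGraphView& graph, const std::string&) const override {
    // e.g. Clip requires min/max to resolve to constant initializers.
    return graph.has_constant("min") && graph.has_constant("max");
  }
};

int main() {
  ToyClipBuilder clip;
  std::printf("Clip supported: %d\n", clip.IsOpSupported(ToyGraphView{}, "Clip"));
  return 0;
}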
- bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -57,7 +57,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const return Status::OK(); } -bool BinaryOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool BinaryOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc index a244efdd9b2eb..8e618285e29ba 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc @@ -23,7 +23,7 @@ class ClipOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -61,15 +61,12 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. -bool ClipOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool ClipOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const { - // TODO: Update IsOpSupportedImpl to pass GraphViewer instead of InitializedTensorSet so the implementations - // can ensure initializers are constant. See #19401 for details of how this update was made to the NNAPI EP. - // GetClipMinMax(graph_viewer, node, minValue, maxValue, logger) float min, max; - return GetClipMinMax(initializers, node, min, max, logger); + return GetClipMinMax(graph_viewer, node, min, max, logger); } void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc index ee2512ddd8b5a..f5b78bf4bc16b 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc @@ -20,7 +20,7 @@ class ConcatOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. 
- bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -54,7 +54,7 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, return Status::OK(); } -bool ConcatOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool ConcatOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 4c393e8a9bdba..436324e087321 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -26,9 +26,9 @@ class ConvOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -344,7 +344,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N // Operator support related. -bool ConvOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool ConvOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const { @@ -381,7 +381,7 @@ bool ConvOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, return true; } -bool ConvOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool ConvOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc index 99be8f75771ad..14324415b3659 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc @@ -27,7 +27,7 @@ class CumSumOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -70,7 +70,7 @@ Status CumSumOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } // Operator support related. 
-bool CumSumOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool CumSumOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -82,7 +82,8 @@ bool CumSumOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const std::string axis_name = GetTensorName(input_defs, 1); // Inputs contain optional 'axis' input. - if (!Contains(initializers, axis_name)) { + const auto* init = graph_viewer.GetConstantInitializer(axis_name); + if (init == nullptr) { LOGS(logger, VERBOSE) << "The axis must be a constant initializer."; return false; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/dropout_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dropout_op_builder.cc index 974e48bc57de9..c22dd9e97bb1a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/dropout_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/dropout_op_builder.cc @@ -24,7 +24,7 @@ class DropoutOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -73,7 +73,7 @@ Status DropoutOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. -bool DropoutOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool DropoutOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { diff --git a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc index 6cee04bac3e2b..e5b4fcddc4221 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc @@ -24,9 +24,9 @@ class EinsumOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -694,7 +694,7 @@ Status EinsumOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. 
-bool EinsumOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool EinsumOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const { @@ -734,7 +734,7 @@ bool EinsumOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializ return true; } -bool EinsumOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool EinsumOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc index 3f813f08279e7..2c28786b788f9 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc @@ -27,7 +27,7 @@ class ExpandOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -63,23 +63,27 @@ Status ExpandOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. -bool ExpandOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool ExpandOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& shape_name = input_defs[1]->Name(); - if (!Contains(initializers, shape_name)) { + + // We need a constant which can not be overriden by input + const auto* shape_init = graph_viewer.GetConstantInitializer(shape_name); + if (!shape_init) { LOGS(logger, VERBOSE) << "The shape must be a constant initializer."; return false; } - std::vector new_shape; - const auto& shape_tensor = *initializers.at(shape_name); + const auto& shape_tensor = *shape_init; if (shape_tensor.data_type() != ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { LOGS(logger, VERBOSE) << "The type of tensor's element data must be INT64."; return false; } + + std::vector new_shape; if (!ReadIntArrayFrom1DTensor(shape_tensor, new_shape, logger)) { LOGS(logger, VERBOSE) << "Cannot get shape."; return false; diff --git a/onnxruntime/core/providers/webnn/builders/impl/gatherElements_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gatherElements_op_builder.cc index 1db0ec12b0dfb..06beb56415609 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gatherElements_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gatherElements_op_builder.cc @@ -20,7 +20,7 @@ class GatherElementsOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -49,7 +49,7 @@ Status GatherElementsOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builde // Operator support related. 
-bool GatherElementsOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool GatherElementsOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& data = *node.InputDefs()[0]; diff --git a/onnxruntime/core/providers/webnn/builders/impl/gatherND_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gatherND_op_builder.cc index 279305c8ed3b4..9200c596c0e53 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gatherND_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gatherND_op_builder.cc @@ -20,9 +20,9 @@ class GatherNDOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -43,7 +43,7 @@ Status GatherNDOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, con // Operator support related. -bool GatherNDOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool GatherNDOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { NodeAttrHelper helper(node); @@ -55,7 +55,7 @@ bool GatherNDOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initial return true; } -bool GatherNDOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool GatherNDOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& data = *node.InputDefs()[0]; const auto& indices = *node.InputDefs()[1]; diff --git a/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc index 4d983eeb799fd..d84c70032e1d1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc @@ -20,9 +20,9 @@ class GatherOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -51,7 +51,7 @@ Status GatherOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. 
-bool GatherOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool GatherOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -69,7 +69,7 @@ bool GatherOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializ return true; } -bool GatherOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool GatherOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input = *node.InputDefs()[0]; const auto& indices = *node.InputDefs()[1]; diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 1f24124745a19..fbf3ac1df2bc2 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -22,9 +22,9 @@ class GemmOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -148,7 +148,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N // Operator support related. -bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool GemmOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -212,7 +212,7 @@ bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializer return true; } -bool GemmOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool GemmOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gru_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gru_op_builder.cc index 1180721105fb0..403bc8af8ac1f 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gru_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gru_op_builder.cc @@ -24,9 +24,9 @@ class GruOpBuilder : public BaseOpBuilder { // Operator support related. 
private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /*device_type*/, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer& graph_viewer, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; @@ -119,7 +119,7 @@ Status GruOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const No return Status::OK(); } -bool GruOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, +bool GruOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /*device_type*/, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); if (input_defs.size() < 3) { @@ -135,12 +135,13 @@ bool GruOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, c int32_t steps = static_cast(input_shape[0]); if (TensorExists(input_defs, 4)) { - if (!Contains(initializers, input_defs[4]->Name())) { + const auto* seq_initializer = graph_viewer.GetConstantInitializer(input_defs[4]->Name()); + if (!seq_initializer) { LOGS(logger, ERROR) << "GRU: sequence_lens must be constant"; return false; } - const auto& sequence_lens_tensor = *initializers.at(input_defs[4]->Name()); + const auto& sequence_lens_tensor = *seq_initializer; std::vector sequence_lens; if (!ReadIntArrayFrom1DTensor(sequence_lens_tensor, sequence_lens, logger)) { LOGS(logger, ERROR) << "Cannot read sequence lens tensor"; @@ -187,7 +188,7 @@ bool GruOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, c return true; } -bool GruOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool GruOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc index 86fc1bc01e18a..7c6de428d0934 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc @@ -19,9 +19,9 @@ class LogicalOpBuilder : public BaseOpBuilder { Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. 
- bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -54,7 +54,7 @@ Status LogicalOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons return Status::OK(); } -bool LogicalOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool LogicalOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -72,7 +72,7 @@ bool LogicalOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initiali return true; } -bool LogicalOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool LogicalOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/lrn_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/lrn_op_builder.cc index 19c17f58bdb23..2e5d3d6b5228a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/lrn_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/lrn_op_builder.cc @@ -21,9 +21,9 @@ class LRNOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; @@ -128,7 +128,7 @@ Status LRNOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. 
-bool LRNOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool LRNOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -146,7 +146,7 @@ bool LRNOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, return true; } -bool LRNOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool LRNOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/lstm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/lstm_op_builder.cc index b25037d439bf4..c49f360c11737 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/lstm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/lstm_op_builder.cc @@ -23,9 +23,9 @@ class LstmOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /*device_type*/, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; @@ -125,7 +125,7 @@ Status LstmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N return Status::OK(); } -bool LstmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, +bool LstmOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /*device_type*/, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); if (input_defs.size() < 3) { @@ -141,12 +141,13 @@ bool LstmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, int32_t steps = static_cast(input_shape[0]); if (TensorExists(input_defs, 4)) { - if (!Contains(initializers, input_defs[4]->Name())) { + const auto* sequence_lens_init = graph_viewer.GetConstantInitializer(input_defs[4]->Name()); + if (!sequence_lens_init) { LOGS(logger, ERROR) << "LSTM: sequence_lens must be constant"; return false; } - const auto& sequence_lens_tensor = *initializers.at(input_defs[4]->Name()); + const auto& sequence_lens_tensor = *sequence_lens_init; std::vector sequence_lens; if (!ReadIntArrayFrom1DTensor(sequence_lens_tensor, sequence_lens, logger)) { LOGS(logger, ERROR) << "Cannot read sequence lens tensor"; @@ -198,7 +199,7 @@ bool LstmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, return true; } -bool LstmOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool LstmOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc 
b/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc index a7bcbfeb4b13e..7ec4ff640132c 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc @@ -20,9 +20,9 @@ class MaxMinOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -68,7 +68,7 @@ Status MaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. -bool MaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool MaxMinOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -87,7 +87,7 @@ bool MaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializ return true; } -bool MaxMinOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool MaxMinOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index 5b57df7f184e7..704c6a65624d8 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -22,9 +22,9 @@ class NormalizationOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; @@ -225,7 +225,7 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder // Operator support related. 
-bool NormalizationOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool NormalizationOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -270,7 +270,7 @@ bool NormalizationOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initi return true; } -bool NormalizationOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool NormalizationOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc index e8f26af928ab3..f17d87d41f9ae 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc @@ -25,7 +25,7 @@ class PadOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -155,7 +155,7 @@ Status PadOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. -bool PadOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool PadOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -183,7 +183,7 @@ bool PadOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, for (size_t i = 1; i < input_defs.size(); i++) { // Optional tensors (constant_value, axes) can be indicated by an empty name, just ignore it. const std::string input_name = GetTensorName(input_defs, i); - if (!input_name.empty() && !Contains(initializers, input_name)) { + if (!input_name.empty() && !graph_viewer.GetConstantInitializer(input_name)) { LOGS(logger, VERBOSE) << "Input [" << input_name << "] must be known as initializer"; return false; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc index 79ad3574e07e9..2d263c1ec1f9f 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc @@ -22,7 +22,7 @@ class PoolOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -111,7 +111,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. 
-bool PoolOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool PoolOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { diff --git a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc index ed62b2bd69618..dd25fb9bf9315 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc @@ -21,9 +21,9 @@ class QDQOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -121,7 +121,7 @@ Status QDQOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. -bool QDQOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool QDQOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -152,7 +152,7 @@ bool QDQOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers return true; } -bool QDQOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool QDQOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index b23fbeba1ddc8..a3a0397eda4a3 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -28,7 +28,7 @@ class ReductionOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -123,7 +123,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. -bool ReductionOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool ReductionOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -136,7 +136,7 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializ const auto& op_type = node.OpType(); const std::string axes_name = GetTensorName(input_defs, 1); // If the optional input 'axes' is provided, it must be an initializer. 
- if (!axes_name.empty() && !Contains(initializers, axes_name)) { + if (!axes_name.empty() && !graph_viewer.GetConstantInitializer(axes_name)) { LOGS(logger, VERBOSE) << "Input axes of " << op_type << " must be a constant"; return false; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc index 2fc47430a1c66..da5e034c38c8e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc @@ -27,7 +27,7 @@ class ReshapeOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -74,7 +74,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. -bool ReshapeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool ReshapeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -85,12 +85,13 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializer return false; const auto& perm_name = input_defs[1]->Name(); - if (!Contains(initializers, perm_name)) { + const auto* perm_init = graph_viewer.GetConstantInitializer(perm_name); + if (!perm_init) { LOGS(logger, VERBOSE) << "New shape of reshape must be a constant initializer"; return false; } - const auto& perm_tensor = *initializers.at(perm_name); + const auto& perm_tensor = *perm_init; std::vector unpacked_tensor; auto status = onnxruntime::utils::UnpackInitializerData(perm_tensor, unpacked_tensor); if (!status.IsOK()) { diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index eec6911a686cf..f71ec2f98d112 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -30,7 +30,7 @@ class ResizeOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing. 
@@ -39,7 +39,7 @@ class ResizeOpBuilder : public BaseOpBuilder { }; // Helper functions -bool GetResizeScalesAndAxes(const InitializedTensorSet& initializers, +bool GetResizeScalesAndAxes(const GraphViewer& graph_viewer, const Node& node, std::vector& scales, std::vector& axes, const bool is_nhwc, const logging::Logger& logger) { @@ -48,13 +48,14 @@ bool GetResizeScalesAndAxes(const InitializedTensorSet& initializers, return false; const bool has_axes = !axes.empty(); - const auto& scales_tensor = *initializers.at(input_defs[2]->Name()); - if (scales_tensor.dims_size() != 1) { - LOGS(logger, ERROR) << "'scales' should be a 1D tensor."; + const auto* scales_init = graph_viewer.GetConstantInitializer(input_defs[2]->Name()); + if (!scales_init || scales_init->dims_size() != 1) { + LOGS(logger, ERROR) << "Expecting 'scales' as a 1D constant initialized tensor."; return false; } // Number of elements of 'scales' tensor. + const auto& scales_tensor = *scales_init; const auto num_of_scales = scales_tensor.dims()[0]; if (has_axes && num_of_scales != 2) { @@ -106,7 +107,7 @@ bool GetResizeScalesAndAxes(const InitializedTensorSet& initializers, return true; } -bool GetResizeSizesAndAxes(const InitializedTensorSet& initializers, +bool GetResizeSizesAndAxes(const GraphViewer& graph_viewer, const Node& node, std::vector& sizes, std::vector& axes, const bool is_nhwc, const gsl::span& input_shape, @@ -116,12 +117,13 @@ bool GetResizeSizesAndAxes(const InitializedTensorSet& initializers, return false; const bool has_axes = !axes.empty(); - const auto& sizes_tensor = *initializers.at(input_defs[3]->Name()); - if (sizes_tensor.dims_size() != 1) { - LOGS(logger, ERROR) << "'sizes' should be a 1D tensor."; + const auto* sizes_init = graph_viewer.GetConstantInitializer(input_defs[3]->Name()); + if (!sizes_init || sizes_init->dims_size() != 1) { + LOGS(logger, ERROR) << "'sizes' should be a 1D constant initializer tensor."; return false; } + const auto& sizes_tensor = *sizes_init; // Number of elements of sizes tensor. const auto num_of_sizes = sizes_tensor.dims()[0]; if (has_axes && num_of_sizes != 2) { @@ -222,12 +224,13 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // This handles Resize-11 where 'scales' was a required input but 'sizes' were used if provided. bool using_sizes = !sizes_name.empty() && Contains(initializers, sizes_name); if (using_sizes) { - ORT_RETURN_IF_NOT(GetResizeSizesAndAxes(initializers, node, sizes, axes, is_nhwc, input_shape, logger), + ORT_RETURN_IF_NOT(GetResizeSizesAndAxes(model_builder.GetGraphViewer(), node, sizes, axes, is_nhwc, + input_shape, logger), "Error getting Resize sizes"); webnn_sizes = GetNarrowedIntfromInt64(sizes); options.set("sizes", emscripten::val::array(webnn_sizes)); } else { - ORT_RETURN_IF_NOT(GetResizeScalesAndAxes(initializers, node, scales, axes, is_nhwc, logger), + ORT_RETURN_IF_NOT(GetResizeScalesAndAxes(model_builder.GetGraphViewer(), node, scales, axes, is_nhwc, logger), "Error getting Resize scales"); options.set("scales", emscripten::val::array(scales)); } @@ -243,7 +246,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. 
-bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool ResizeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -304,8 +307,8 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers // Check for 'sizes' first. // This handles Resize-11 where 'scales' was a required input but 'sizes' were used if provided. // 'scales' or 'sizes' may be empty tensor. - bool using_sizes = !IsEmptyTensor(initializers, sizes_name); - bool using_scales = !using_sizes && !IsEmptyTensor(initializers, scales_name); + bool using_sizes = !IsEmptyTensor(graph_viewer, sizes_name); + bool using_scales = !using_sizes && !IsEmptyTensor(graph_viewer, scales_name); if (!using_scales && !using_sizes) { LOGS(logger, VERBOSE) << "Resize: only one of 'scales' and 'sizes' can be specified"; @@ -325,12 +328,12 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const bool is_nhwc = node.Domain() == kMSInternalNHWCDomain; if (using_sizes) { // We are using 'sizes'. std::vector sizes; - if (!GetResizeSizesAndAxes(initializers, node, sizes, axes, is_nhwc, input_shape, logger)) { + if (!GetResizeSizesAndAxes(graph_viewer, node, sizes, axes, is_nhwc, input_shape, logger)) { return false; } } else { // We are using 'scales'. std::vector scales; - if (!GetResizeScalesAndAxes(initializers, node, scales, axes, is_nhwc, logger)) { + if (!GetResizeScalesAndAxes(graph_viewer, node, scales, axes, is_nhwc, logger)) { return false; } } diff --git a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc index 0a84835ee9fc0..bdd0c97b7b81c 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc @@ -57,9 +57,9 @@ class RotaryEmbeddingOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer& graph_viewer, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; @@ -286,7 +286,7 @@ Status RotaryEmbeddingOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_build } // Operator support related. 
-bool RotaryEmbeddingOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, +bool RotaryEmbeddingOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -325,7 +325,7 @@ bool RotaryEmbeddingOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& ini return true; } -bool RotaryEmbeddingOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, +bool RotaryEmbeddingOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { diff --git a/onnxruntime/core/providers/webnn/builders/impl/scatterElements_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/scatterElements_op_builder.cc index 1a68f7862f1ab..f894e8bfbd517 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/scatterElements_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/scatterElements_op_builder.cc @@ -20,9 +20,9 @@ class ScatterElementsOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer& graph_viewer, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -53,7 +53,7 @@ Status ScatterElementsOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_build // Operator support related. -bool ScatterElementsOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool ScatterElementsOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { NodeAttrHelper helper(node); @@ -65,7 +65,7 @@ bool ScatterElementsOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* return true; } -bool ScatterElementsOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool ScatterElementsOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& data = *node.InputDefs()[0]; diff --git a/onnxruntime/core/providers/webnn/builders/impl/scatterND_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/scatterND_op_builder.cc index 2b123dc2e9323..e61ac3dcc9617 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/scatterND_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/scatterND_op_builder.cc @@ -20,9 +20,9 @@ class ScatterNDOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. 
- bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer& graph_viewer, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -45,7 +45,7 @@ Status ScatterNDOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co // Operator support related. -bool ScatterNDOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool ScatterNDOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { NodeAttrHelper helper(node); @@ -57,7 +57,7 @@ bool ScatterNDOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initia return true; } -bool ScatterNDOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool ScatterNDOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& data = *node.InputDefs()[0]; diff --git a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc index 468c0e24a3e88..6206ac23e4bd4 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc @@ -24,9 +24,9 @@ class SliceOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override ORT_MUST_USE_RESULT; - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; - bool HasSupportedInputsImpl(const InitializedTensorSet& initializers, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer& graph_viewer, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; // TODO: Support Slice opset < 10, which uses attributes for starts and ends. int GetMinSupportedOpSet(const Node& /* node */) const override { return 10; } @@ -132,7 +132,7 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const return Status::OK(); } -bool SliceOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, +bool SliceOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& name = node.Name(); const auto& op_type = node.OpType(); @@ -152,7 +152,8 @@ bool SliceOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, for (size_t i = 1; i < input_defs.size(); i++) { // Optional tensors (axes, steps) can be indicated by an empty name, just ignore it. 
const std::string input_name = GetTensorName(input_defs, i); - if (!input_name.empty() && !Contains(initializers, input_name)) { + const auto* init = graph_viewer.GetConstantInitializer(input_name); + if (!input_name.empty() && !init) { LOGS(logger, VERBOSE) << "Input [" << input_name << "] of " << op_type << " [" << name << "] must be known as initializer"; return false; @@ -162,7 +163,7 @@ bool SliceOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, return true; } -bool SliceOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& initializers, const Node& node, +bool SliceOpBuilder::HasSupportedInputsImpl(const GraphViewer& graph_viewer, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& input = *input_defs[0]; @@ -174,7 +175,8 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& initiali // If there is step < 0, check data type support of reverse. if (TensorExists(input_defs, 4)) { std::vector steps; - if (!ReadIntArrayFrom1DTensor(*initializers.at(input_defs[4]->Name()), steps, logger)) + const auto* init = graph_viewer.GetConstantInitializer(input_defs[4]->Name()); + if (!init || !ReadIntArrayFrom1DTensor(*init, steps, logger)) return false; if (std::any_of(steps.begin(), steps.end(), [](int64_t step) { return step < 0; })) { if (!IsDataTypeSupportedByWebNNOp(op_type, "reverse", input_type, wnn_limits, "input", "data", logger)) { diff --git a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc index 0e754b53e78d1..23e73bb8f1e74 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc @@ -21,7 +21,7 @@ class SoftmaxOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -48,7 +48,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. -bool SoftmaxOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, +bool SoftmaxOpBuilder::IsOpSupportedImpl(const GraphViewer&, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index 21b44b1066694..8094d3024a321 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -25,7 +25,7 @@ class SplitOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; @@ -94,7 +94,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. 
-bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool SplitOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -114,12 +114,13 @@ bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const std::string split_name = GetTensorName(input_defs, 1); // Inputs contain optional 'split' input. if (!split_name.empty()) { - if (!Contains(initializers, split_name)) { + const auto* split_init = graph_viewer.GetConstantInitializer(split_name); + if (!split_init) { LOGS(logger, VERBOSE) << "The split must be a constant initializer."; return false; } // Values should be >= 0. Sum of the values must be equal to the dim value at 'axis' specified. - const auto& split_tensor = *initializers.at(input_defs[1]->Name()); + const auto& split_tensor = *split_init; if (split_tensor.data_type() != ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { LOGS(logger, VERBOSE) << "The type of tensor's element data must be INT64."; return false; diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc index 5687b1133c628..1ba6df9febf14 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc @@ -27,7 +27,7 @@ class SqueezeUnsqueezeOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -121,7 +121,7 @@ Status SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil // Operator support related. 
-bool SqueezeUnsqueezeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool SqueezeUnsqueezeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -140,7 +140,8 @@ bool SqueezeUnsqueezeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& in if (node.SinceVersion() >= 13) { const std::string axes_name = GetTensorName(input_defs, 1); if (!axes_name.empty()) { - if (!Contains(initializers, axes_name)) { + const auto* init = graph_viewer.GetConstantInitializer(axes_name); + if (!init) { LOGS(logger, ERROR) << "Input axes of " << op_type << " is not present and constant"; return false; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc index a233ba82ebbc6..f6c1744ca7a3e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc @@ -18,7 +18,7 @@ class TernaryOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override ORT_MUST_USE_RESULT; - bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, + bool HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -46,7 +46,7 @@ Status TernaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons return Status::OK(); } -bool TernaryOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, +bool TernaryOpBuilder::HasSupportedInputsImpl(const GraphViewer&, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const std::string_view op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc index 259bb0552b7c7..29b232026d7df 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc @@ -26,7 +26,7 @@ class TileOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -65,13 +65,14 @@ Status TileOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. 
-bool TileOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool TileOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& repetitions_name = input_defs[1]->Name(); - if (!Contains(initializers, repetitions_name)) { + const auto* init = graph_viewer.GetConstantInitializer(repetitions_name); + if (!init) { LOGS(logger, VERBOSE) << "Repetitions of tile must be a constant initializer"; return false; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc index f2092d6163713..ca98d8264fdcd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc @@ -24,7 +24,7 @@ class TriangularOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; @@ -69,7 +69,7 @@ Status TriangularOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } // Operator support related. -bool TriangularOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool TriangularOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { @@ -87,7 +87,8 @@ bool TriangularOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initiali const std::string diagonal_name = GetTensorName(input_defs, 1); // Inputs contain optional 'diagonal' input. if (!diagonal_name.empty()) { - if (!Contains(initializers, diagonal_name)) { + const auto* init = graph_viewer.GetConstantInitializer(diagonal_name); + if (!init) { LOGS(logger, VERBOSE) << "The diagonal must be a constant initializer."; return false; } diff --git a/onnxruntime/core/providers/webnn/builders/op_builder.h b/onnxruntime/core/providers/webnn/builders/op_builder.h index bb69a6a545597..636b1dc9f478a 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder.h +++ b/onnxruntime/core/providers/webnn/builders/op_builder.h @@ -28,7 +28,7 @@ class IOpBuilder { // Operator support related. public: // Check if an operator is supported. 
- virtual bool IsOpSupported(const InitializedTensorSet& initializers, const Node& node, + virtual bool IsOpSupported(const GraphViewer& graph_viewer, const Node& node, const WebnnDeviceType device_type, const emscripten::val& wnn_limits, const logging::Logger& logger) const = 0; }; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index e46236f4ca11c..5fd197d7a798b 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1636,7 +1636,7 @@ struct ProviderHostImpl : ProviderHost { } #endif - void MurmurHash3__x86_128(const void* key, int len, uint32_t seed, void* out) override { + void MurmurHash3__x86_128(const void* key, size_t len, uint32_t seed, void* out) override { MurmurHash3::x86_128(key, len, seed, out); } diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index bd88eb1b6b353..33f5b5e5853a5 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -219,6 +219,8 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "PackedMultiHeadAttention": self._infer_PackedMultiHeadAttention, "PagedAttention": self._infer_PagedAttention, "PythonOp": self._infer_PythonOp, + "QLinearAdd": self._infer_QLinearBinary, + "QLinearMul": self._infer_QLinearBinary, "QuantizeLinear": self._infer_QuantizeLinear, "QuickGelu": self._infer_FastGelu, "RelativePositionBias": self._infer_RelativePositionBias, @@ -490,6 +492,8 @@ def _onnx_infer_single_node(self, node): "SkipSimplifiedLayerNormalization", "SparseAttention", "SkipGroupNorm", + "QLinearAdd", + "QLinearMul", ] if not skip_infer: @@ -1040,6 +1044,20 @@ def _infer_QuantizeLinear(self, node): # noqa: N802 vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape)) + def _infer_QLinearBinary(self, node): # noqa: N802 + # Get the output data type from the first input to QLinearAdd / QLinearMul. + output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type + + # The inputs are first and fourth operands respectively. 
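+        # Note: inputs 0 and 3 are the A and B data tensors; the remaining inputs hold
+        # the quantization scales and zero points of A, B and the output, which do not
+        # affect the output shape, so only the shapes of inputs 0 and 3 are broadcast below.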
+ input_1_shape = self._get_shape(node, 0) + input_2_shape = self._get_shape(node, 3) + + # Compute the broadcasted shape + new_shape = self._broadcast_shapes(input_1_shape, input_2_shape) + + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_shape)) + def _infer_Einsum(self, node): # noqa: N802 # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275 equation = get_attribute(node, "equation") diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 7af34447f1f66..3e913094628c3 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -16,8 +16,8 @@ import sys TRT_DOCKER_FILES = { - "10.8_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", - "10.8_cuda12.6_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", + "10.9_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.9_cuda12.8_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index 2a210729112d7..3dd2c2ef945ec 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -88,61 +88,62 @@ def create_onnxruntime_session( enable_mlas_gemm_fastmath_arm64_bfloat16=False, provider_options={}, # map execution provider name to its option # noqa: B006 ): - session = None - try: - sess_options = onnxruntime.SessionOptions() + sess_options = onnxruntime.SessionOptions() - if enable_all_optimization: - sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - else: - sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC + if enable_all_optimization: + sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + else: + sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC - if enable_profiling: - sess_options.enable_profiling = True + if enable_profiling: + sess_options.enable_profiling = True - if num_threads > 0: - sess_options.intra_op_num_threads = num_threads - logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}") + if num_threads > 0: + sess_options.intra_op_num_threads = num_threads + logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}") - if verbose: - sess_options.log_severity_level = 0 - else: - sess_options.log_severity_level = 4 - - logger.debug(f"Create session for onnx model: {onnx_model_path}") - if use_gpu: - if provider == "dml": - providers = ["DmlExecutionProvider", "CPUExecutionProvider"] - elif provider == "rocm": - providers = ["ROCMExecutionProvider", "CPUExecutionProvider"] - elif provider == "migraphx": - providers = [ - "MIGraphXExecutionProvider", - "ROCMExecutionProvider", - "CPUExecutionProvider", - ] - elif provider == "cuda": - providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] - elif provider == "tensorrt": - providers = [ - "TensorrtExecutionProvider", - "CUDAExecutionProvider", - 
"CPUExecutionProvider", - ] - else: - providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + if verbose: + sess_options.log_severity_level = 0 + else: + sess_options.log_severity_level = 4 + + if provider in onnxruntime.get_available_providers(): + providers = [provider] + elif use_gpu: + if provider == "dml": + providers = ["DmlExecutionProvider", "CPUExecutionProvider"] + elif provider == "rocm": + providers = ["ROCMExecutionProvider", "CPUExecutionProvider"] + elif provider == "migraphx": + providers = [ + "MIGraphXExecutionProvider", + "ROCMExecutionProvider", + "CPUExecutionProvider", + ] + elif provider == "cuda" or provider is None: + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + elif provider == "tensorrt": + providers = [ + "TensorrtExecutionProvider", + "CUDAExecutionProvider", + "CPUExecutionProvider", + ] else: - providers = ["CPUExecutionProvider"] + raise RuntimeError(f"The execution provider is not supported: {provider}") + else: + providers = ["CPUExecutionProvider"] - if provider_options: - providers = [(name, provider_options[name]) if name in provider_options else name for name in providers] + if provider_options: + providers = [(name, provider_options[name]) if name in provider_options else name for name in providers] - if enable_mlas_gemm_fastmath_arm64_bfloat16: - sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1") + if enable_mlas_gemm_fastmath_arm64_bfloat16: + sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1") + session = None + try: session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers) except Exception: - logger.error("Exception", exc_info=True) # noqa: G201 + logger.exception(f"Failed to create session for {onnx_model_path} with providers={providers}") return session diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index 045910ea20828..8eb2afb3db896 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -16,19 +16,17 @@ python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu -p fp16 --use_sln_strict_mode Example 4: convert T5 model with beam search in two steps: - cd ./models/t5 - python convert_to_onnx.py -m t5-small - cd ../.. - python convert_generation.py -m t5-small --model_type t5 \ - --decoder_onnx ./models/t5/onnx_models/t5-small_decoder.onnx \ - --encoder_decoder_init_onnx ./models/t5/onnx_models/t5-small_encoder_decoder_init.onnx \ - --output ./models/t5/onnx_models/t5_small_beam_search.onnx + python -m models.t5.convert_to_onnx -m t5-small + python convert_generation.py -m t5-small --model_type t5 \ + --decoder_onnx ./onnx_models/t5-small_decoder.onnx \ + --encoder_decoder_init_onnx ./onnx_models/t5-small_encoder.onnx \ + --output ./onnx_models/t5_small_beam_search.onnx Example 5: convert T5 model with beam search. All in one step: - python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx + python convert_generation.py -m t5-small --model_type t5 --output t5_small_beam_search.onnx Example 6: convert T5 model with beam search containing specific cuda optimizations. 
All in one step: - python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx \ + python convert_generation.py -m t5-small --model_type t5 --output t5_small_beam_search.onnx \ --use_gpu --past_present_share_buffer --use_decoder_masked_attention Example 7: convert MT5 model with external data file like mt5-base-beamsearch.onnx.data in below example. @@ -68,11 +66,23 @@ T5Tokenizer, ) -from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_available_providers -from onnxruntime.transformers.models.gpt2.convert_to_onnx import main as convert_gpt2_to_onnx +from onnxruntime import ( + GraphOptimizationLevel, + InferenceSession, + SessionOptions, + get_available_providers, +) +from onnxruntime.transformers.models.gpt2.convert_to_onnx import ( + main as convert_gpt2_to_onnx, +) from onnxruntime.transformers.models.gpt2.gpt2_helper import PRETRAINED_GPT2_MODELS -from onnxruntime.transformers.models.t5.convert_to_onnx import export_onnx_models as export_t5_onnx_models -from onnxruntime.transformers.models.t5.t5_helper import PRETRAINED_MT5_MODELS, PRETRAINED_T5_MODELS +from onnxruntime.transformers.models.t5.convert_to_onnx import ( + export_onnx_models as export_t5_onnx_models, +) +from onnxruntime.transformers.models.t5.t5_helper import ( + PRETRAINED_MT5_MODELS, + PRETRAINED_T5_MODELS, +) logger = logging.getLogger("") @@ -162,9 +172,9 @@ def parse_arguments(argv: list[str] | None = None) -> argparse.Namespace: "-p", "--precision", required=False, - type=Precision, - default=Precision.FLOAT32, - choices=[Precision.FLOAT32, Precision.FLOAT16], + type=str, + default=Precision.FLOAT32.value, + choices=[Precision.FLOAT32.value, Precision.FLOAT16.value], help="Precision of model to run. fp32 for full precision, fp16 for half or mixed precision", ) @@ -189,7 +199,11 @@ def parse_arguments(argv: list[str] | None = None) -> argparse.Namespace: output_group.set_defaults(use_external_data_format=False) output_group.add_argument( - "-s", "--run_shape_inference", required=False, action="store_true", help="run shape inference" + "-s", + "--run_shape_inference", + required=False, + action="store_true", + help="run shape inference", ) output_group.set_defaults(run_shape_inference=False) @@ -223,6 +237,14 @@ def parse_arguments(argv: list[str] | None = None) -> argparse.Namespace: ) output_group.set_defaults(disable_shared_initializers=False) + output_group.add_argument( + "--encoder_decoder_init", + required=False, + action="store_true", + help="Add decoder initialization to encoder for T5 model. This is legacy format that will be deprecated.", + ) + output_group.set_defaults(encoder_decoder_init=False) + model_group = parser.add_argument_group("Beam search parameters that stored in the output model") model_group.add_argument( @@ -426,7 +448,10 @@ def parse_arguments(argv: list[str] | None = None) -> argparse.Namespace: test_group.set_defaults(use_sln_strict_mode=False) test_group.add_argument( - "--use_gpu", required=False, action="store_true", help="use GPU for inference. Required for fp16." + "--use_gpu", + required=False, + action="store_true", + help="use GPU for inference. 
Required for fp16.", ) test_group.set_defaults(use_gpu=False) @@ -490,7 +515,7 @@ def gpt2_to_onnx(args: argparse.Namespace): args.decoder_onnx, "--optimize_onnx", "--precision", - "fp32" if args.precision == Precision.FLOAT32 else "fp16", + args.precision, "--test_runs", "1", "--test_cases", @@ -508,7 +533,7 @@ def gpt2_to_onnx(args: argparse.Namespace): arguments.extend(["--op_block_list"]) arguments.extend(args.op_block_list) - if args.precision == Precision.FLOAT16: + if args.precision == Precision.FLOAT16.value: assert args.use_gpu, "fp16 or mixed precision model cannot run in CPU. Please add --use_gpu" # TODO(tianleiwu): Use auto mixed precision for fp16 conversion: arguments.append('--auto_mixed_precision') # Need change cuda kernel to support a combination of fp32 logits and fp16 past state. @@ -527,20 +552,21 @@ def t5_to_onnx(args: argparse.Namespace): args (argparse.Namespace): arguments parsed from command line """ paths = export_t5_onnx_models( - args.model_name_or_path, - args.cache_dir, - Path(args.output).parent, + model_name_or_path=args.model_name_or_path, + cache_dir=args.cache_dir, + output_dir=Path(args.output).parent, use_gpu=args.use_gpu, use_external_data_format=args.use_external_data_format, - optimize_onnx=(args.precision != Precision.FLOAT16), + optimize_onnx=(args.precision != Precision.FLOAT16.value), precision=args.precision, verbose=False, use_decoder_start_token=False, - merge_encoder_and_decoder_init=True, overwrite=True, disable_auto_mixed_precision=False, use_int32_inputs=True, model_type=args.model_type, + encoder_decoder_init=args.encoder_decoder_init, + force_fp16_io=(args.precision == Precision.FLOAT16.value), # required by BeamSearch op implementation. ) logger.debug(f"onnx model for encoder: {paths[0]}") @@ -693,7 +719,7 @@ def verify_gpt2_subgraph(graph: onnx.GraphProto, precision: Precision): ValueError: Output name is not expected. ValueError: Output data type is not expected. """ - is_float16 = precision == Precision.FLOAT16 + is_float16 = precision == Precision.FLOAT16.value input_count = len(graph.input) layer_count = input_count - 3 @@ -749,7 +775,7 @@ def verify_t5_decoder_subgraph(graph: onnx.GraphProto, precision: Precision): ValueError: Output name is not expected. ValueError: Output data type is not expected. """ - is_float16 = precision == Precision.FLOAT16 + is_float16 = precision == Precision.FLOAT16.value float_type = TensorProto.FLOAT16 if is_float16 else TensorProto.FLOAT input_count = len(graph.input) @@ -825,15 +851,20 @@ def verify_t5_encoder_decoder_init_subgraph(graph: onnx.GraphProto, precision: P ValueError: Output name is not expected. ValueError: Output data type is not expected. """ - is_float16 = precision == Precision.FLOAT16 - layer_count = (len(graph.output) - 2) // 4 - assert layer_count >= 1 + is_float16 = precision == Precision.FLOAT16.value + new_format = "cross" in graph.output[0].name # Expect 3 inputs: # encoder_input_ids: int32 (B, encode_sequence_length) # encoder_attention_mask: int32 (B, encode_sequence_length) # decoder_input_ids: int32 (B, 1) - expected_inputs = ["encoder_input_ids", "encoder_attention_mask", "decoder_input_ids"] + expected_inputs = [ + "encoder_input_ids", + "encoder_attention_mask", + "decoder_input_ids", + ] + if new_format: + expected_inputs = expected_inputs[:2] if len(graph.input) != len(expected_inputs): raise ValueError(f"Number of inputs expected to be {len(expected_inputs)}. 
Got {len(graph.input)}") @@ -846,22 +877,41 @@ def verify_t5_encoder_decoder_init_subgraph(graph: onnx.GraphProto, precision: P if input_type != expected_type: raise ValueError(f"Input {i} is expected to have onnx data type {expected_type}. Got {input_type}") - # Expected outputs: - # logits: (B, 1, vocab_size) - # encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) - # present_key_self_0: (B, num_heads, 1, head_size) - # present_value_self_0: (B, num_heads, 1, head_size) - # ... (for each self attention layer) - # present_key_cross_0: (B, num_heads, encode_sequence_length, head_size) - # present_value_cross_0: (B, num_heads, encode_sequence_length, head_size) - # ... (for each cross attention layer) - expected_outputs = ["logits", "encoder_hidden_states"] - for i in range(layer_count): - expected_outputs.append(f"present_key_self_{i}") - expected_outputs.append(f"present_value_self_{i}") - for i in range(layer_count): - expected_outputs.append(f"present_key_cross_{i}") - expected_outputs.append(f"present_value_cross_{i}") + if new_format: + assert len(graph.output) % 2 == 0 + layer_count = len(graph.output) // 2 + assert layer_count >= 1 + + # Expected outputs: + # present_key_cross_0: (B, num_heads, encode_sequence_length, head_size) + # present_value_cross_0: (B, num_heads, encode_sequence_length, head_size) + # ... (for each cross attention layer) + expected_outputs = [] + for i in range(layer_count): + expected_outputs.append(f"present_key_cross_{i}") + expected_outputs.append(f"present_value_cross_{i}") + else: + logger.warning("This format is deprecated. Please export T5 encoder in new format with only cross outputs.") + assert (len(graph.output) - 2) % 4 == 0 + layer_count = (len(graph.output) - 2) // 4 + assert layer_count >= 1 + + # Expected outputs: + # logits: (B, 1, vocab_size) + # encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) + # present_key_self_0: (B, num_heads, 1, head_size) + # present_value_self_0: (B, num_heads, 1, head_size) + # ... (for each self attention layer) + # present_key_cross_0: (B, num_heads, encode_sequence_length, head_size) + # present_value_cross_0: (B, num_heads, encode_sequence_length, head_size) + # ... (for each cross attention layer) + expected_outputs = ["logits", "encoder_hidden_states"] + for i in range(layer_count): + expected_outputs.append(f"present_key_self_{i}") + expected_outputs.append(f"present_value_self_{i}") + for i in range(layer_count): + expected_outputs.append(f"present_key_cross_{i}") + expected_outputs.append(f"present_value_cross_{i}") if len(graph.output) != len(expected_outputs): raise ValueError(f"Number of outputs expected to be {len(expected_outputs)}. 
Got {len(graph.output)}") @@ -1116,6 +1166,7 @@ def update_decoder_subgraph_past_present_share_buffer(subg: GraphProto): new_nodes = [] for node in subg.node: + new_node = node if node.op_type == "Attention": kwargs = kwargs_of(node) kwargs.update({"past_present_share_buffer": 1}) @@ -1125,8 +1176,8 @@ def update_decoder_subgraph_past_present_share_buffer(subg: GraphProto): nis.extend([""]) if len(nis) < 7: nis.extend(["past_sequence_length"]) - node = onnx.helper.make_node("Attention", nis, node.output, name=node.name, **kwargs) # noqa: PLW2901 - new_nodes.extend([node]) + new_node = onnx.helper.make_node("Attention", nis, node.output, name=node.name, **kwargs) + new_nodes.extend([new_node]) subg.ClearField("node") subg.node.extend(new_nodes) return subg @@ -1152,7 +1203,9 @@ def update_decoder_subgraph_use_decoder_masked_attention( new_inputs.extend( [ onnx.helper.make_tensor_value_info( - "cache_indirection", onnx.TensorProto.INT32, shape=["batch_size", "beam_width", "max_seq_len"] + "cache_indirection", + onnx.TensorProto.INT32, + shape=["batch_size", "beam_width", "max_seq_len"], ) ] ) @@ -1203,7 +1256,11 @@ def update_decoder_subgraph_use_decoder_masked_attention( nis.extend(["cache_indirection"]) node = onnx.helper.make_node( # noqa: PLW2901 - "DecoderMaskedSelfAttention", nis, node.output, name=node.name, **kwargs + "DecoderMaskedSelfAttention", + nis, + node.output, + name=node.name, + **kwargs, ) new_nodes.extend([node]) subg.ClearField("node") @@ -1573,7 +1630,11 @@ def replace_mha_with_dmmha(model: OnnxModel, past_seq_len_name: str): def replace_mha_with_gqa( - model: OnnxModel, attn_mask: str, kv_num_heads: int = 0, world_size: int = 1, window_size: int = -1 + model: OnnxModel, + attn_mask: str, + kv_num_heads: int = 0, + world_size: int = 1, + window_size: int = -1, ): # Insert attention_mask subgraph to calculate shared inputs for all GroupQueryAttention nodes # @@ -1635,7 +1696,14 @@ def replace_mha_with_gqa( to=TensorProto.INT32, ) model.model.graph.node.extend( - [reduce_sum_node, sub_node, seqlen_k_cast_node, shape_node, gather_node, total_seqlen_cast_node] + [ + reduce_sum_node, + sub_node, + seqlen_k_cast_node, + shape_node, + gather_node, + total_seqlen_cast_node, + ] ) # Replace MultiHeadAttention with GroupQueryAttention @@ -1776,14 +1844,14 @@ def replace_mha_with_gqa( node.input[7], # past_value seqlen_k_cast_node.output[0], # seqlens_k (for attention mask) total_seqlen_cast_node.output[0], # total_seq_len (for attention mask) - q_rotary.input[2] if q_rotary is not None else "", # cos_cache (for rotary embeddings) - q_rotary.input[3] if q_rotary is not None else "", # sin_cache (for rotary embeddings) + (q_rotary.input[2] if q_rotary is not None else ""), # cos_cache (for rotary embeddings) + (q_rotary.input[3] if q_rotary is not None else ""), # sin_cache (for rotary embeddings) ], outputs=node.output, name=node.name.replace("MultiHeadAttention", "GroupQueryAttention"), domain="com.microsoft", num_heads=num_heads // world_size, - kv_num_heads=num_heads // world_size if kv_num_heads == 0 else kv_num_heads // world_size, + kv_num_heads=(num_heads // world_size if kv_num_heads == 0 else kv_num_heads // world_size), local_window_size=window_size, do_rotary=int(q_rotary is not None and k_rotary is not None), rotary_interleaved=interleaved, @@ -1831,7 +1899,9 @@ def update_decoder_subgraph_output_cross_attention(subg: GraphProto): node.attribute.extend([onnx.helper.make_attribute("output_qk", 1)]) cross_attention = onnx.helper.make_tensor_value_info( - 
cross_attention_out_name, TensorProto.FLOAT, [batch_size_dim, num_heads_dim, 1, cross_seq_len_dim] + cross_attention_out_name, + TensorProto.FLOAT, + [batch_size_dim, num_heads_dim, 1, cross_seq_len_dim], ) subg.output.extend([cross_attention]) if num_layer_output_qk != num_layers: @@ -1935,7 +2005,11 @@ def update_decoder_subgraph_share_buffer_and_use_decoder_masked_mha(subg: ModelP kwargs["past_present_share_buffer"] = 1 node = onnx.helper.make_node( # noqa: PLW2901 - "DecoderMaskedMultiHeadAttention", nis, node.output, name=node.name, **kwargs + "DecoderMaskedMultiHeadAttention", + nis, + node.output, + name=node.name, + **kwargs, ) if node not in nodes_to_remove: @@ -1968,7 +2042,9 @@ def update_decoder_subgraph_share_buffer_and_use_decoder_masked_mha(subg: ModelP new_inputs.extend( [ onnx.helper.make_tensor_value_info( - "cache_indirection", onnx.TensorProto.INT32, shape=["batch_size", "beam_width", "max_seq_len"] + "cache_indirection", + onnx.TensorProto.INT32, + shape=["batch_size", "beam_width", "max_seq_len"], ) ] ) @@ -2020,7 +2096,7 @@ def pack_qkv_for_decoder_masked_mha(model_proto: ModelProto): matmul_node_name = onnx_model.create_node_name("MatMul", name_prefix="MatMul_QKV") weight = onnx.helper.make_tensor( name=matmul_node_name + "_weight", - data_type=TensorProto.FLOAT if q_weight.data_type == 1 else TensorProto.FLOAT16, + data_type=(TensorProto.FLOAT if q_weight.data_type == 1 else TensorProto.FLOAT16), dims=[qkv_weight.shape[0], qkv_weight.shape[1]], vals=qkv_weight.flatten().tolist(), ) @@ -2074,12 +2150,18 @@ def update_input_shapes_for_gpt2_decoder_model(decoder_onnx_path: str, use_exter # Update dim_value to be 1 shape_dim_proto.dim_value = 1 - OnnxModel.save(decoder_model_proto, decoder_onnx_path, save_as_external_data=use_external_data_format) + OnnxModel.save( + decoder_model_proto, + decoder_onnx_path, + save_as_external_data=use_external_data_format, + ) return True def generate_gpt2_init_decoder( - decoder_onnx_path: str, init_decoder_onnx_path: str, use_external_data_format: bool = True + decoder_onnx_path: str, + init_decoder_onnx_path: str, + use_external_data_format: bool = True, ) -> bool: """Generates the initial decoder GPT2 subgraph and saves it for downstream use. The initial decoder model will be saved to init_decoder_onnx_path. 
@@ -2152,7 +2234,16 @@ def generate_gpt2_init_decoder( # Normalization Node is : LayerNormalization logits_matmul_to_residual_add_path = gpt2_init_decoder_model.match_parent_path( logits_matmul_node, - ["LayerNormalization", "Add", "Add", "MatMul", "FastGelu", "MatMul", "LayerNormalization", "Add"], + [ + "LayerNormalization", + "Add", + "Add", + "MatMul", + "FastGelu", + "MatMul", + "LayerNormalization", + "Add", + ], [0, 0, 1, 0, 0, 0, 0, 0], ) @@ -2183,7 +2274,9 @@ def generate_gpt2_init_decoder( if not is_skiplayernorm_path: residual_add_to_attention_parent_index = 0 residual_add_to_attention_path = gpt2_init_decoder_model.match_parent_path( - residual_add_node, ["Add", "Cast", "MatMul", "Attention"], [residual_add_to_attention_parent_index, 0, 0, 0] + residual_add_node, + ["Add", "Cast", "MatMul", "Attention"], + [residual_add_to_attention_parent_index, 0, 0, 0], ) # Try other parent index of the residual Add node @@ -2199,42 +2292,54 @@ def generate_gpt2_init_decoder( if residual_add_to_attention_path is None: residual_add_to_attention_parent_index = 0 residual_add_to_attention_path = gpt2_init_decoder_model.match_parent_path( - residual_add_node, ["Add", "MatMul", "Attention"], [residual_add_to_attention_parent_index, 0, 0] + residual_add_node, + ["Add", "MatMul", "Attention"], + [residual_add_to_attention_parent_index, 0, 0], ) # Try without the Casts before and after the MatMuls and other parent index of the residual Add node if residual_add_to_attention_path is None: residual_add_to_attention_parent_index = 1 residual_add_to_attention_path = gpt2_init_decoder_model.match_parent_path( - residual_add_node, ["Add", "MatMul", "Attention"], [residual_add_to_attention_parent_index, 0, 0] + residual_add_node, + ["Add", "MatMul", "Attention"], + [residual_add_to_attention_parent_index, 0, 0], ) # SkipLayerNormalization path else: residual_add_to_attention_parent_index = 0 residual_add_to_attention_path = gpt2_init_decoder_model.match_parent_path( - residual_add_node, ["Cast", "MatMul", "Attention"], [residual_add_to_attention_parent_index, 0, 0] + residual_add_node, + ["Cast", "MatMul", "Attention"], + [residual_add_to_attention_parent_index, 0, 0], ) # Try other parent index of the residual Add node if residual_add_to_attention_path is None: residual_add_to_attention_parent_index = 1 residual_add_to_attention_path = gpt2_init_decoder_model.match_parent_path( - residual_add_node, ["Cast", "MatMul", "Attention"], [residual_add_to_attention_parent_index, 0, 0] + residual_add_node, + ["Cast", "MatMul", "Attention"], + [residual_add_to_attention_parent_index, 0, 0], ) # Try without the Casts before and after the MatMuls if residual_add_to_attention_path is None: residual_add_to_attention_parent_index = 0 residual_add_to_attention_path = gpt2_init_decoder_model.match_parent_path( - residual_add_node, ["MatMul", "Attention"], [residual_add_to_attention_parent_index, 0] + residual_add_node, + ["MatMul", "Attention"], + [residual_add_to_attention_parent_index, 0], ) # Try without the Casts before and after the MatMuls and other parent index of the residual Add node if residual_add_to_attention_path is None: residual_add_to_attention_parent_index = 1 residual_add_to_attention_path = gpt2_init_decoder_model.match_parent_path( - residual_add_node, ["MatMul", "Attention"], [residual_add_to_attention_parent_index, 0] + residual_add_node, + ["MatMul", "Attention"], + [residual_add_to_attention_parent_index, 0], ) # TODO(hasesh): Are there more permutations to try before returning ? 
@@ -2252,7 +2357,9 @@ def generate_gpt2_init_decoder( # SkipLayerNormalization path else: add_before_residual_add = gpt2_init_decoder_model.match_parent( - residual_add_node, "SkipLayerNormalization", residual_add_to_add_parent_index + residual_add_node, + "SkipLayerNormalization", + residual_add_to_add_parent_index, ) if add_before_residual_add is None: @@ -2342,7 +2449,11 @@ def generate_gpt2_init_decoder( gpt2_init_decoder_model.topological_sort() # Save the init decoder model - OnnxModel.save(init_decoder_model_proto, init_decoder_onnx_path, save_as_external_data=use_external_data_format) + OnnxModel.save( + init_decoder_model_proto, + init_decoder_onnx_path, + save_as_external_data=use_external_data_format, + ) return True @@ -2383,7 +2494,10 @@ def make_dim_proto_numeric_t5(model, config): dim_proto.dim_value = dim_value -def convert_generation_model(args: argparse.Namespace, generation_type: GenerationType = GenerationType.BEAMSEARCH): +def convert_generation_model( + args: argparse.Namespace, + generation_type: GenerationType = GenerationType.BEAMSEARCH, +): """Convert model according to command line arguments. Args: @@ -2397,8 +2511,13 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati logger.info(f"**** past_present_share_buffer={past_present_share_buffer}") if len(args.op_block_list) == 1 and args.op_block_list[0] == "auto": - if is_gpt2 and args.precision == Precision.FLOAT16: - args.op_block_list = ["Add", "LayerNormalization", "SkipLayerNormalization", "FastGelu"] + if is_gpt2 and args.precision == Precision.FLOAT16.value: + args.op_block_list = [ + "Add", + "LayerNormalization", + "SkipLayerNormalization", + "FastGelu", + ] logger.info(f"**** Setting op_block_list to {args.op_block_list}") logger.info("**** use --op_block_list if you want to override the block operator list.") else: @@ -2434,9 +2553,7 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati logger.info(f"skip convert_to_onnx since path existed: {args.decoder_onnx}") else: if not args.decoder_onnx: - onnx_filename = "{}_past_{}.onnx".format( - args.model_name_or_path, "fp16" if args.precision == Precision.FLOAT16 else "fp32" - ) + onnx_filename = f"{args.model_name_or_path}_past_{args.precision}.onnx" args.decoder_onnx = Path(Path(args.output).parent, onnx_filename).as_posix() logger.info(f"Convert GPT model {args.model_name_or_path} to onnx {args.decoder_onnx} ...") @@ -2458,7 +2575,7 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati logits_matmul_weight_padded = False if ( not args.disable_pad_vocab_size - and args.precision == Precision.FLOAT16 + and args.precision == Precision.FLOAT16.value and is_gpt2 and (is_beamsearch or is_greedysearch or is_sampling) ): @@ -2481,14 +2598,14 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati ): logger.info(f"Creating an initial run GPT2 decoder from {args.decoder_onnx}. 
") - gpt2_init_decoder_onnx_filename = "gpt2_init_past_{}.onnx".format( - "fp16" if args.precision == Precision.FLOAT16 else "fp32" - ) + gpt2_init_decoder_onnx_filename = f"gpt2_init_past_{args.precision}.onnx" gpt2_init_decoder_onnx_path = Path(Path(args.output).parent, gpt2_init_decoder_onnx_filename).as_posix() gpt2_init_decoder_generated = generate_gpt2_init_decoder( - args.decoder_onnx, gpt2_init_decoder_onnx_path, args.use_external_data_format + args.decoder_onnx, + gpt2_init_decoder_onnx_path, + args.use_external_data_format, ) if not gpt2_init_decoder_generated: @@ -2672,7 +2789,8 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati logger.info(f"Symbolic shape inference on {args.encoder_decoder_init_onnx}. The file will be overwritten.") shape_inference(args.encoder_decoder_init_onnx, args.use_external_data_format) encoder_model = onnx.load_model(args.encoder_decoder_init_onnx, load_external_data=True) - encoder_model.graph.name = f"{args.model_type} encoder and decoder init" + suffix = "encoder" if len(encoder_model.graph.input) == 2 else "encoder and decoder init" + encoder_model.graph.name = f"{args.model_type} {suffix}" verify_t5_encoder_decoder_init_subgraph(encoder_model.graph, args.precision) make_dim_proto_numeric_t5(encoder_model, config) @@ -2711,14 +2829,13 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati # ) # initializers.extend(moved_initializers) + assert config.decoder_start_token_id >= 0, "decoder_start_token_id should be >= 0" + node.attribute.extend( [ onnx.helper.make_attribute("encoder", encoder_model.graph), onnx.helper.make_attribute("decoder", decoder_model.graph), - onnx.helper.make_attribute( - "decoder_start_token_id", - config.decoder_start_token_id if len(encoder_model.graph.input) == 3 else -1, - ), + onnx.helper.make_attribute("decoder_start_token_id", config.decoder_start_token_id), ] ) else: @@ -2838,7 +2955,9 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati if args.output_sequences_scores: sequences_scores = onnx.helper.make_tensor_value_info( - "sequences_scores", TensorProto.FLOAT, ["batch_size", "num_return_sequences"] + "sequences_scores", + TensorProto.FLOAT, + ["batch_size", "num_return_sequences"], ) graph_outputs.append(sequences_scores) @@ -2852,7 +2971,7 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati new_graph = onnx.helper.make_graph( [node], - f"{args.model_type} beam search" if not is_greedysearch else f"{args.model_type} greedy search", + (f"{args.model_type} beam search" if not is_greedysearch else f"{args.model_type} greedy search"), graph_inputs, graph_outputs, initializers, @@ -2912,7 +3031,7 @@ def test_torch_performance( if args.use_gpu and not torch.cuda.is_available(): raise RuntimeError("Please install PyTorch with Cuda for testing gpu performance.") - if args.precision == Precision.FLOAT16: + if args.precision == Precision.FLOAT16.value: model.half() device = torch.device("cuda:0" if args.use_gpu else "cpu") @@ -2961,7 +3080,11 @@ def create_attention_mask(input_ids, pad_token_id): return attention_mask -def test_gpt_model(args: argparse.Namespace, sentences: list[str] | None = None, is_greedy: bool = False): +def test_gpt_model( + args: argparse.Namespace, + sentences: list[str] | None = None, + is_greedy: bool = False, +): """Test GPT-2 model Args: @@ -3152,7 +3275,7 @@ def test_gpt_model(args: argparse.Namespace, sentences: list[str] | None = None, print("-" * 50) # Compare the 
generated text instead of word IDs since ORT pads to max sequence length but Torch not. is_same = torch_decoded_sequences == ort_decoded_sequences - print("Torch and ORT result is ", "same" if is_same else "different") + print("Torch and ORT result is", "same" if is_same else "different") output["parity"] = is_same if args.torch_performance: diff --git a/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py index a0eff081675fe..5ce089712ccb1 100644 --- a/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py @@ -51,6 +51,7 @@ def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): mul_node, div_node, _sqrt_node, add_node, reduce_mean_node = sim_ln_nodes if not self.model.has_constant_input(div_node, 1.0): return + node_parent = mul_node else: # Div(1, RMS) can also be represented as Reciprocal(RMS) like # @@ -66,6 +67,7 @@ def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): # Mul --> ReduceMean --> Add ---> Sqrt --> Reciprocal --> Mul --> Mul (node) # (B=2) (A/B=eps) (A/B=scale) # + return_indice = [] sim_ln_nodes = self.model.match_parent_path( node, ["Mul", "Reciprocal", "Sqrt", "Add", "ReduceMean"], @@ -73,24 +75,50 @@ def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): output_name_to_node=output_name_to_node, return_indice=return_indice, ) - if sim_ln_nodes is None: - return - mul_node, _reciprocal_node, _sqrt_node, add_node, reduce_mean_node = sim_ln_nodes - - pow_or_mul_node = self.model.get_parent(reduce_mean_node, 0, output_name_to_node) - if pow_or_mul_node is None or pow_or_mul_node.op_type not in ["Pow", "Mul"]: + if sim_ln_nodes is not None: + mul_node, _reciprocal_node, _sqrt_node, add_node, reduce_mean_node = sim_ln_nodes + node_parent = mul_node + else: + # (root_input) --------------------------------+ + # | | + # v v + # Pow --> ReduceMean --> Add ---> Sqrt --> Div --> Mul (node) + # (B=2) (A/B=eps) (A/B=scale) + # + # (root_input) --------------------------------+ + # | | | + # v v v + # Mul --> ReduceMean --> Add ---> Sqrt --> Div --> Mul (node) + # (B=2) (A/B=eps) (A/B=scale) + # + return_indice = [] + sim_ln_nodes = self.model.match_parent_path( + node, + ["Div", "Sqrt", "Add", "ReduceMean"], + [None, 1, 0, None], + output_name_to_node=output_name_to_node, + return_indice=return_indice, + ) + if sim_ln_nodes is not None: + div_node, _sqrt_node, add_node, reduce_mean_node = sim_ln_nodes + node_parent = div_node + else: + return + + reduce_mean_parent = self.model.get_parent(reduce_mean_node, 0, output_name_to_node) + if reduce_mean_parent is None or reduce_mean_parent.op_type not in ["Pow", "Mul"]: return - if pow_or_mul_node.op_type == "Pow": - if self.model.find_constant_input(pow_or_mul_node, 2.0) != 1: + if reduce_mean_parent.op_type == "Pow": + if self.model.find_constant_input(reduce_mean_parent, 2.0) != 1: return else: - assert pow_or_mul_node.op_type == "Mul" - if pow_or_mul_node[0] != pow_or_mul_node[1]: + assert reduce_mean_parent.op_type == "Mul" + if reduce_mean_parent[0] != reduce_mean_parent[1]: return - root_input = pow_or_mul_node.input[0] - if root_input != mul_node.input[0]: + root_input = reduce_mean_parent.input[0] + if root_input not in node_parent.input: return _i, epsilon = self.model.get_constant_input(add_node) @@ -113,7 +141,7 @@ def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): 
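The patterns sketched above are different exported forms of the same simplified (RMS) layer normalization. A small numeric sketch of the computation those subgraphs implement, for reference only:

import numpy as np

def rms_norm(x, weight, eps=1e-6):
    # Pow(2)/Mul -> ReduceMean -> Add(eps) -> Sqrt -> Div (or Reciprocal + Mul) -> Mul(scale)
    variance = np.mean(x * x, axis=-1, keepdims=True)
    return x / np.sqrt(variance + eps) * weight

x = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
print(rms_norm(x, np.ones(3, dtype=np.float32)))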
return self.nodes_to_remove.extend(sim_ln_nodes) - self.nodes_to_remove.append(pow_or_mul_node) + self.nodes_to_remove.append(reduce_mean_parent) self.nodes_to_remove.append(node) normalize_node = helper.make_node( diff --git a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py index 75887cc744081..f8b7dd80710ae 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py @@ -371,9 +371,6 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename: model_size_in_MB = int(get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) # noqa: N806 provider = args.provider - if args.provider == "migraphx": - provider = "MIGraphXExecutionProvider" - session = create_onnxruntime_session( output_path, args.use_gpu, provider, enable_all_optimization=True, verbose=args.verbose ) diff --git a/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py index adf5206be8353..dd519e36cfa88 100755 --- a/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py @@ -10,8 +10,15 @@ import os import torch -from benchmark_helper import Precision, create_onnxruntime_session, prepare_environment, setup_logger +from benchmark_helper import ( + Precision, + create_onnxruntime_session, + prepare_environment, + setup_logger, +) +from onnx.shape_inference import infer_shapes_path from t5_helper import PRETRAINED_MT5_MODELS, PRETRAINED_T5_MODELS, T5Helper +from transformers import MT5Config, T5Config logger = logging.getLogger("") @@ -70,9 +77,9 @@ def parse_arguments(): "-p", "--precision", required=False, - type=Precision, - default=Precision.FLOAT32, - choices=[Precision.FLOAT32, Precision.FLOAT16], + type=str, + default=Precision.FLOAT32.value, + choices=[Precision.FLOAT32.value, Precision.FLOAT16.value], help="Precision of model to run. fp32 for full precision, fp16 for half precision", ) @@ -104,17 +111,17 @@ def parse_arguments(): "--disable_auto_mixed_precision", required=False, action="store_true", - help="use pure fp16 instead of mixed precision", + help="do not use auto mixed precision conversion", ) parser.set_defaults(disable_auto_mixed_precision=False) parser.add_argument( - "--separate_encoder_and_decoder_init", + "--force_fp16_io", required=False, action="store_true", - help="Do not merge encode and decoder init. Output 3 instead of 2 onnx models.", + help="Force to convert all float inputs and outputs to fp16 when precision is fp16.", ) - parser.set_defaults(separate_encoder_and_decoder_init=False) + parser.set_defaults(force_fp16_io=False) parser.add_argument( "--use_int64_inputs", @@ -131,34 +138,52 @@ def parse_arguments(): help="filepath to load pre-trained model with custom state dictionary (e.g. pytorch_model.bin)", ) + parser.add_argument( + "--encoder_decoder_init", + required=False, + action="store_true", + help="Combine encoder and decoder kv cache initialization into one model. 
It is a legacy format that will be deprecated.", + ) + parser.set_defaults(encoder_decoder_init=False) + args = parser.parse_args() return args def export_onnx_models( - model_name_or_path, - cache_dir, - output_dir, - use_gpu, - use_external_data_format, - optimize_onnx, - precision, - verbose, + model_name_or_path: str, + cache_dir: str, + output_dir: str, + use_gpu: bool = False, + use_external_data_format: bool = False, + optimize_onnx: bool = False, + precision: str = Precision.FLOAT32.value, + verbose: bool = False, use_decoder_start_token: bool = False, - merge_encoder_and_decoder_init: bool = True, overwrite: bool = False, disable_auto_mixed_precision: bool = False, use_int32_inputs: bool = True, model_type: str = "t5", state_dict_path: str = "", + encoder_decoder_init: bool = False, + force_fp16_io: bool = False, + shape_infer_before_optimization: bool = False, ): + assert precision in [Precision.FLOAT32.value, Precision.FLOAT16.value], ( + f"Invalid precision: {precision}. Use 'fp32' or 'fp16'." + ) device = torch.device("cuda:0" if use_gpu else "cpu") models = T5Helper.load_model( - model_name_or_path, cache_dir, device, merge_encoder_and_decoder_init, model_type, state_dict_path + model_name_or_path, + cache_dir, + device, + model_type, + state_dict_path, + encoder_decoder_init=encoder_decoder_init, ) - config = models["decoder"].config + config: T5Config | MT5Config = models["decoder"].config if (not use_external_data_format) and (config.num_layers > 24): logger.info("Try use_external_data_format when model size > 2GB") @@ -191,8 +216,20 @@ def export_onnx_models( else: logger.info(f"Skip exporting: existed ONNX model {onnx_path}") - # Optimize ONNX graph. Note that we have not implemented graph optimization for T5 yet. - if optimize_onnx or precision != Precision.FLOAT32: + # Optimize ONNX graph. + # The precision is compared as a string value because the Precision enum loaded from a local file + # (for example, by the transformers tests in the CI pipeline) is not the same as the Precision enum from the installed package. 
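The string comparison guards against a subtle pitfall: two copies of the same Enum class imported from different locations produce members that never compare equal, even though their values match. A self-contained illustration (the class names are stand-ins for the two import paths of Precision):

from enum import Enum

class PrecisionFromPackage(Enum):    # stands in for the installed benchmark_helper
    FLOAT16 = "fp16"

class PrecisionFromLocalFile(Enum):  # stands in for a locally imported copy
    FLOAT16 = "fp16"

print(PrecisionFromPackage.FLOAT16 == PrecisionFromLocalFile.FLOAT16)              # False: different Enum classes
print(PrecisionFromPackage.FLOAT16.value == PrecisionFromLocalFile.FLOAT16.value)  # True: plain string comparison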
+ if optimize_onnx or precision != Precision.FLOAT32.value: + onnx_shape_path = None + if shape_infer_before_optimization: + onnx_shape_path = T5Helper.get_onnx_path( + output_dir, + model_name_or_path, + suffix=filename_suffix + "_shape", + new_folder=False, + ) + infer_shapes_path(onnx_path, onnx_shape_path) + output_path = T5Helper.get_onnx_path( output_dir, model_name_or_path, @@ -203,30 +240,35 @@ def export_onnx_models( if overwrite or not os.path.exists(output_path): logger.info(f"Optimizing model to {output_path}") T5Helper.optimize_onnx( - onnx_path, + onnx_shape_path or onnx_path, output_path, - precision == Precision.FLOAT16, + precision == Precision.FLOAT16.value, config.num_heads, config.hidden_size, use_external_data_format, auto_mixed_precision=not disable_auto_mixed_precision, use_gpu=use_gpu, + force_fp16_io=force_fp16_io, ) else: - logger.info(f"Skip optimizing: existed ONNX model {onnx_path}") + logger.info(f"Skip optimizing: existed ONNX model {output_path}") else: output_path = onnx_path ort_session = create_onnxruntime_session( output_path, use_gpu=use_gpu, - provider=["CUDAExecutionProvider", "CPUExecutionProvider"] if use_gpu else ["CPUExecutionProvider"], + verbose=verbose, ) + if ort_session is None: + break with torch.no_grad(): max_diff = T5Helper.verify_onnx(model, ort_session, device, use_int32_inputs) logger.info(f"PyTorch and OnnxRuntime results max difference = {max_diff}") - if max_diff > 1e-4: + + # The threshold cannot apply to fp16 model, which need a larger threshold. + if precision == Precision.FLOAT32.value and max_diff > 1e-4: logger.warning("PyTorch and OnnxRuntime results are NOT close") output_paths.append(output_path) @@ -245,15 +287,12 @@ def main(): output_dir = args.output if not args.output.endswith(".onnx") else os.path.dirname(args.output) prepare_environment(cache_dir, output_dir, args.use_gpu) - if args.precision != Precision.FLOAT32: + if args.precision != Precision.FLOAT32.value: assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx" - if args.precision == Precision.FLOAT16: + if args.precision == Precision.FLOAT16.value: assert args.use_gpu, "fp16 requires --use_gpu" - if args.optimize_onnx: - logger.warning("Graph optimization for T5 is not implemented yet.") - output_paths = export_onnx_models( args.model_name_or_path, cache_dir, @@ -264,11 +303,12 @@ def main(): args.precision, args.verbose, args.use_decoder_start_token, - not args.separate_encoder_and_decoder_init, args.overwrite, args.disable_auto_mixed_precision, not args.use_int64_inputs, args.model_type, + encoder_decoder_init=args.encoder_decoder_init, + force_fp16_io=args.force_fp16_io, ) logger.info(f"Done! Outputs: {output_paths}") diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py index c6b0f7ee3adc2..df3a416f2947c 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py @@ -1,24 +1,14 @@ # ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project root for -# license information. -# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# ------------------------------------------------------------------------- import logging -import os import random -import tempfile -from pathlib import Path -import numpy -import onnx import torch -from onnx_model import OnnxModel -from torch_onnx_export_helper import torch_onnx_export from transformers import MT5Config, T5Config -from onnxruntime import InferenceSession - logger = logging.getLogger(__name__) @@ -41,7 +31,11 @@ def __init__(self, input_ids, attention_mask): @staticmethod def create_dummy( - batch_size: int, sequence_length: int, vocab_size: int, device: torch.device, use_int32_inputs: bool = False + batch_size: int, + sequence_length: int, + vocab_size: int, + device: torch.device, + use_int32_inputs: bool = False, ): # -> T5EncoderInputs """Create dummy inputs for T5 encoder. @@ -74,97 +68,3 @@ def create_dummy( def to_list(self) -> list: input_list = [v for v in [self.input_ids, self.attention_mask] if v is not None] return input_list - - -class T5EncoderHelper: - @staticmethod - def export_onnx( - encoder: T5Encoder, - device: torch.device, - onnx_model_path: str, - verbose: bool = True, - use_external_data_format: bool = False, - use_int32_inputs: bool = False, - ): - """Export encoder to ONNX - - Args: - encoder (T5Encoder): encoder object - device (torch.device): device of encoder object - onnx_model_path (str): onnx path - verbose (bool, optional): print verbose information. Defaults to True. - use_external_data_format (bool, optional): use external data format or not. Defaults to False. - """ - config = encoder.config - encoder_inputs = T5EncoderInputs.create_dummy( - batch_size=2, - sequence_length=4, - vocab_size=config.vocab_size, - device=device, - use_int32_inputs=use_int32_inputs, - ) - - Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - - with tempfile.TemporaryDirectory() as tmp_dir_name: - temp_onnx_model_path = os.path.join(tmp_dir_name, "encoder.onnx") - Path(temp_onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - torch_onnx_export( - encoder, - args=tuple(encoder_inputs.to_list()), - f=temp_onnx_model_path if use_external_data_format else onnx_model_path, - export_params=True, - input_names=["input_ids", "attention_mask"], - output_names=["hidden_states"], - dynamic_axes={ - "input_ids": {0: "batch_size", 1: "sequence_length"}, - "attention_mask": {0: "batch_size", 1: "sequence_length"}, - "hidden_states": {0: "batch_size", 1: "sequence_length"}, - }, - opset_version=12, - do_constant_folding=True, - use_external_data_format=use_external_data_format, - verbose=verbose, - ) - - if use_external_data_format: - model = onnx.load_model(temp_onnx_model_path, load_external_data=True) - OnnxModel.save( - model, - onnx_model_path, - save_as_external_data=True, - all_tensors_to_one_file=True, - ) - - @staticmethod - def onnxruntime_inference(ort_session, inputs: T5EncoderInputs): - """Run inference of ONNX model.""" - ort_inputs = { - "input_ids": numpy.ascontiguousarray(inputs.input_ids.cpu().numpy()), - "attention_mask": numpy.ascontiguousarray(inputs.attention_mask.cpu().numpy()), - } - - return ort_session.run(None, ort_inputs) - - @staticmethod - def verify_onnx( - model: T5Encoder, ort_session: InferenceSession, device: torch.device, use_int32_inputs: bool = False - ): - """Compare the result from PyTorch and OnnxRuntime to verify the ONNX model is good.""" - inputs = T5EncoderInputs.create_dummy( - batch_size=4, - sequence_length=11, - vocab_size=model.config.vocab_size, - device=device, - use_int32_inputs=use_int32_inputs, - ) 
- input_list = inputs.to_list() - torch_outputs = model(*input_list) - - ort_outputs = T5EncoderHelper.onnxruntime_inference(ort_session, inputs) - - max_diff = numpy.amax(numpy.abs(torch_outputs.cpu().numpy() - ort_outputs[0])) - - logger.info(f"max_diff={max_diff}") - - return max_diff diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py index c76d7aabdf11a..98df18eab6064 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py @@ -1,8 +1,7 @@ # ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project root for -# license information. -# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------- import logging import os @@ -31,33 +30,40 @@ def __init__( self, encoder: torch.nn.Module, decoder: torch.nn.Module, - lm_head: torch.nn.Module, + lm_head: torch.nn.Linear, config: T5Config | MT5Config, decoder_start_token_id: int | None = None, + output_cross_only: bool = False, ): super().__init__() - self.config = config + self.config: T5Config | MT5Config = config self.t5_encoder = T5Encoder(encoder, config) self.t5_decoder_init = T5DecoderInit(decoder, lm_head, config, decoder_start_token_id) + self.output_cross_only = output_cross_only def forward( self, encoder_input_ids: torch.Tensor, encoder_attention_mask: torch.Tensor, - decoder_input_ids: torch.Tensor = None, + decoder_input_ids: torch.Tensor | None = None, ): encoder_hidden_states: torch.FloatTensor = self.t5_encoder(encoder_input_ids, encoder_attention_mask) + lm_logits, past_self, past_cross = self.t5_decoder_init( decoder_input_ids, encoder_attention_mask, encoder_hidden_states ) - return lm_logits, encoder_hidden_states, past_self, past_cross + + if self.output_cross_only: + return past_cross + else: + return lm_logits, encoder_hidden_states, past_self, past_cross class T5EncoderDecoderInitInputs: def __init__(self, encoder_input_ids, encoder_attention_mask, decoder_input_ids=None): self.encoder_input_ids: torch.LongTensor = encoder_input_ids self.encoder_attention_mask: torch.LongTensor = encoder_attention_mask - self.decoder_input_ids: torch.LongTensor = decoder_input_ids + self.decoder_input_ids: torch.LongTensor | None = decoder_input_ids @staticmethod def create_dummy( @@ -108,9 +114,14 @@ def export_onnx( onnx_model_path (str): onnx path verbose (bool, optional): print verbose information. Defaults to True. use_external_data_format (bool, optional): use external data format or not. Defaults to False. + use_int32_inputs (bool, optional): use int32 instead of int64 for integer inputs. Defaults to False. """ assert isinstance(model, T5EncoderDecoderInit) + # Do not exclude decoder in torch onnx export so that cross can show up. + output_cross_only = model.output_cross_only + model.output_cross_only = False + inputs = T5EncoderDecoderInitInputs.create_dummy( model.config, batch_size=2, @@ -139,7 +150,7 @@ def export_onnx( input_names = ["encoder_input_ids", "encoder_attention_mask"] - # ONNX exporter might mark dimension like 'Transposepresent_value_self_1_dim_2' in shape inference. 
+ # ONNX exporter might mark dimension like 'present_value_self_1_dim_2' in shape inference. # We use a workaround here: first use dim_param "1" for sequence_length, and later change to dim_value. sequence_length = "1" num_heads = str(model.config.num_heads) @@ -201,9 +212,12 @@ def export_onnx( verbose=verbose, ) + # Restore output_cross_only setting. + model.output_cross_only = output_cross_only + # Workaround as mentioned earlier: change numeric dim_param to dim_value - model = onnx.load(temp_onnx_model_path) - for tensor in model.graph.output: + exported_model: onnx.ModelProto = onnx.load(temp_onnx_model_path) + for tensor in exported_model.graph.output: for dim_proto in tensor.type.tensor_type.shape.dim: if dim_proto.HasField("dim_param") and dim_proto.dim_param in [ sequence_length, @@ -215,8 +229,50 @@ def export_onnx( dim_proto.Clear() dim_proto.dim_value = dim_value + if output_cross_only: + # Rewrite onnx graph to only keep present_[key|value]_cross_* outputs. + onnx_model = OnnxModel(exported_model) + output_name_to_node = onnx_model.output_name_to_node() + + for output in exported_model.graph.output: + if "cross" in output.name: + assert output.name in output_name_to_node + + transpose_node = output_name_to_node[output.name] + assert transpose_node and transpose_node.op_type == "Transpose" + + permutation = OnnxModel.get_node_attribute(transpose_node, "perm") + assert isinstance(permutation, list) + assert permutation == [0, 2, 1, 3] + + matched_nodes = onnx_model.match_parent_path( + transpose_node, + ["Reshape", "MatMul"], + [0, 0], + output_name_to_node, + ) + assert matched_nodes is not None + + reshape_node, matmul_node = matched_nodes + assert "encoder_hidden_states" in matmul_node.input + + if not onnx_model.get_initializer("cross_reshape_shape"): + shape_tensor = onnx.helper.make_tensor( + name="cross_reshape_shape", + data_type=onnx.TensorProto.INT64, + dims=[4], + vals=[0, 0, int(num_heads), int(head_size)], + raw=False, + ) + onnx_model.add_initializer(shape_tensor) + + reshape_node.input[1] = "cross_reshape_shape" + + cross_outputs = [output.name for output in exported_model.graph.output if "cross" in output.name] + onnx_model.prune_graph(cross_outputs, allow_remove_graph_inputs=True) + OnnxModel.save( - model, + exported_model, onnx_model_path, save_as_external_data=use_external_data_format, all_tensors_to_one_file=True, @@ -269,27 +325,34 @@ def verify_onnx( num_decoder_layers = model.config.num_decoder_layers - assert torch_outputs[0].cpu().numpy().shape == ort_outputs[0].shape - max_diff = numpy.amax(numpy.abs(torch_outputs[0].cpu().numpy() - ort_outputs[0])) - logger.debug(f"logits max_diff={max_diff}") - max_diff_all = max_diff - - assert torch_outputs[1].cpu().numpy().shape == ort_outputs[1].shape - max_diff = numpy.amax(numpy.abs(torch_outputs[1].cpu().numpy() - ort_outputs[1])) - logger.debug(f"encoder_hidden_states max_diff={max_diff}") - max_diff_all = max(max_diff_all, max_diff) - - for i in range(2 * num_decoder_layers): - max_diff = numpy.amax(numpy.abs(torch_outputs[2][i].cpu().numpy() - ort_outputs[2 + i])) - logger.debug(f"self attention past state {i} max_diff={max_diff}") - - for i in range(2 * num_decoder_layers): - max_diff = numpy.amax( - numpy.abs(torch_outputs[3][i].cpu().numpy() - ort_outputs[2 + 2 * num_decoder_layers + i]) - ) - logger.debug(f"cross attention past state {i} max_diff={max_diff}") + if not model.output_cross_only: + assert torch_outputs[0].cpu().numpy().shape == ort_outputs[0].shape + max_diff = 
numpy.amax(numpy.abs(torch_outputs[0].cpu().numpy() - ort_outputs[0])) + logger.debug(f"logits max_diff={max_diff}") + max_diff_all = max_diff + + assert torch_outputs[1].cpu().numpy().shape == ort_outputs[1].shape + max_diff = numpy.amax(numpy.abs(torch_outputs[1].cpu().numpy() - ort_outputs[1])) + logger.debug(f"encoder_hidden_states max_diff={max_diff}") max_diff_all = max(max_diff_all, max_diff) + for i in range(2 * num_decoder_layers): + max_diff = numpy.amax(numpy.abs(torch_outputs[2][i].cpu().numpy() - ort_outputs[2 + i])) + logger.debug(f"self attention past state {i} max_diff={max_diff}") + + for i in range(2 * num_decoder_layers): + max_diff = numpy.amax( + numpy.abs(torch_outputs[3][i].cpu().numpy() - ort_outputs[2 + 2 * num_decoder_layers + i]) + ) + logger.debug(f"cross attention past state {i} max_diff={max_diff}") + max_diff_all = max(max_diff_all, max_diff) + else: + max_diff_all = -float("inf") + for i in range(2 * num_decoder_layers): + max_diff = numpy.amax(numpy.abs(torch_outputs[i].cpu().numpy() - ort_outputs[i])) + logger.debug(f"cross attention past state {i} max_diff={max_diff}") + max_diff_all = max(max_diff_all, max_diff) + test_cases_max_diff.append(max_diff_all) logger.info( f"batch_size={batch_size} encode_sequence_length={encode_sequence_length}, max_diff={max_diff_all}" diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py index d3f25e979887d..7552008f920e0 100755 --- a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py @@ -1,8 +1,7 @@ # ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project root for -# license information. -# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------- import logging import os @@ -12,8 +11,7 @@ from float16 import float_to_float16_max_diff from onnx_model import OnnxModel from optimizer import optimize_model -from t5_decoder import T5Decoder, T5DecoderHelper, T5DecoderInit -from t5_encoder import T5Encoder, T5EncoderHelper +from t5_decoder import T5Decoder, T5DecoderHelper from t5_encoder_decoder_init import T5EncoderDecoderInit, T5EncoderDecoderInitHelper from transformers import MT5ForConditionalGeneration, T5ForConditionalGeneration @@ -22,7 +20,13 @@ logger = logging.getLogger(__name__) PRETRAINED_T5_MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"] -PRETRAINED_MT5_MODELS = ["google/mt5-small", "google/mt5-base", "google/mt5-large", "google/mt5-xl", "google/mt5-xxl"] +PRETRAINED_MT5_MODELS = [ + "google/mt5-small", + "google/mt5-base", + "google/mt5-large", + "google/mt5-xl", + "google/mt5-xxl", +] class T5Helper: @@ -60,18 +64,19 @@ def load_model( model_name_or_path: str, cache_dir: str, device: torch.device, - merge_encoder_and_decoder_init: bool = True, model_type: str = "t5", state_dict_path: str = "", - ) -> dict[str, torch.nn.Module]: + encoder_decoder_init: bool = False, + ) -> dict[str, T5EncoderDecoderInit | T5Decoder]: """Load model given a pretrained name or path, then build models for ONNX conversion. 
Args: model_name_or_path (str): pretrained model name or path cache_dir (str): cache directory device (torch.device): device to run the model - merge_encoder_and_decoder_init (bool, optional): Whether merge encoder and decoder initialization into one ONNX model. Defaults to True. - is_mt5 (bool, optional): whether the model is MT5 instead of T5 + model_type (str, optional): model type "t5" or "mt5" + state_dict_path(str, optional): state dictionary path + encoder_decoder_init (bool, optional): combine encoder and decoder kv cache initialization into one model. Returns: Dict[str, torch.nn.Module]: mapping from name to modules for ONNX conversion. """ @@ -88,29 +93,21 @@ def load_model( decoder = T5Decoder(model.decoder, model.lm_head, model.config) decoder.eval().to(device) - if merge_encoder_and_decoder_init: - encoder_decoder_init = T5EncoderDecoderInit( - model.encoder, - model.decoder, - model.lm_head, - model.config, - decoder_start_token_id=None, - ) - return {"encoder_decoder_init": encoder_decoder_init, "decoder": decoder} - else: - encoder = T5Encoder(model.encoder, model.config) - encoder.eval().to(device) - decoder_init = T5DecoderInit(model.decoder, model.lm_head, model.config) - decoder_init.eval().to(device) - return { - "encoder": encoder, - "decoder": decoder, - "decoder_init": decoder_init, - } + encoder = T5EncoderDecoderInit( + model.encoder, + model.decoder, + model.lm_head, + model.config, + decoder_start_token_id=None, + output_cross_only=not encoder_decoder_init, + ) + + encoder_name = "encoder_decoder_init" if encoder_decoder_init else "encoder" + return {encoder_name: encoder, "decoder": decoder} @staticmethod def export_onnx( - model: T5Encoder | T5Decoder | T5DecoderInit | T5EncoderDecoderInit, + model: T5Decoder | T5EncoderDecoderInit, device: torch.device, onnx_model_path: str, verbose: bool = True, @@ -118,16 +115,7 @@ def export_onnx( use_decoder_input_ids: bool = True, use_int32_inputs: bool = False, ): - if isinstance(model, T5Encoder): - T5EncoderHelper.export_onnx( - model, - device, - onnx_model_path, - verbose, - use_external_data_format, - use_int32_inputs, - ) - elif isinstance(model, T5EncoderDecoderInit): + if isinstance(model, T5EncoderDecoderInit): T5EncoderDecoderInitHelper.export_onnx( model, device, @@ -150,21 +138,28 @@ def export_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: list[str] = [ # noqa: B006 - "SimplifiedLayerNormalization", - "SkipSimplifiedLayerNormalization", - "Relu", - "Add", - ], + op_block_list: list[str] | None = None, + force_fp16_logits: bool = False, + use_symbolic_shape_infer: bool = True, ): """Convert model to mixed precision. It detects whether original model has fp16 precision weights, and set parameters for float16 conversion automatically. Args: onnx_model (OnnxModel): optimized ONNX model - op_block_list (List[str], optional): . Defaults to ["SimplifiedLayerNormalization", "SkipSimplifiedLayerNormalization", "Relu", "Add"] + op_block_list (List[str], optional): operators need to run in fp32. + force_fp16_logits (bool, optional): force logits and last MatMul node to be in float16. Defaults to False. + use_symbolic_shape_infer (bool, optional): use symbolic shape inference to convert float to float16. Defaults to True. 
Returns: parameters(dict): a dictionary of parameters used in float16 conversion """ + if op_block_list is None: + op_block_list = [ + "SimplifiedLayerNormalization", + "SkipSimplifiedLayerNormalization", + "Relu", + "Add", + ] + op_full_set = {node.op_type for node in onnx_model.nodes()} fp32_op_set = set(op_block_list) fp16_op_set = op_full_set.difference(fp32_op_set) @@ -198,11 +193,38 @@ def auto_mixed_precision( keep_io_types = [] node_block_list = [] - if (not is_weight_fp16_precision) and (last_matmul_node is not None): + if (not is_weight_fp16_precision) and (last_matmul_node is not None) and not force_fp16_logits: # When original weight is float32 precision, keep logits and last MatMul in float32 could get better precision. keep_io_types = [logits_output_name] node_block_list = [last_matmul_node.name] + if "Add" not in op_block_list: + input_name_to_nodes = onnx_model.input_name_to_nodes() + fp32_add = 0 + changed = True + add_nodes = onnx_model.get_nodes_by_op_type("Add") + while changed: + changed = False + for node in add_nodes: + if node.name not in node_block_list: + parents = onnx_model.get_parents(node, output_name_to_node) + children = onnx_model.get_children(node, input_name_to_nodes) + blocked_children = [ + child for child in children if child.op_type in op_block_list or child in node_block_list + ] + blocked_parents = [ + parent for parent in parents if parent.op_type in op_block_list or parent in node_block_list + ] + # If any child or parent is in fp32, we place the Add node to fp32. + if (len(blocked_children) + len(blocked_parents)) > 0: + node_block_list.append(node.name) + fp32_add += 1 + changed = True + fp16_add = len(add_nodes) - fp32_add + logger.info(f"node counter of Add operator: fp32={fp32_add} fp16={fp16_add}") + + logger.info(f"node_block_list: {node_block_list}") + parameters = { "keep_io_types": keep_io_types, "op_block_list": op_block_list, @@ -211,7 +233,18 @@ def auto_mixed_precision( } logger.info(f"auto_mixed_precision parameters: {parameters}") - onnx_model.convert_float_to_float16(use_symbolic_shape_infer=True, **parameters) + if use_symbolic_shape_infer: + onnx_model.convert_float_to_float16(use_symbolic_shape_infer=True, **parameters) + else: + # Workaround when symbolic shape inference fails. + # Need enable shape_infer_before_optimization in convert_to_onnx.py as well. + from float16 import convert_float_to_float16 + + convert_float_to_float16( + onnx_model.model, + disable_shape_infer=True, + **parameters, + ) return parameters @@ -225,6 +258,7 @@ def optimize_onnx( use_external_data_format: bool = False, auto_mixed_precision: bool = True, use_gpu: bool = False, + force_fp16_io: bool = False, ): """Optimize ONNX model with an option to convert it to use mixed precision.""" @@ -233,38 +267,35 @@ def optimize_onnx( optimization_options = None if is_float16: optimization_options = FusionOptions("t5") - optimization_options.enable_skip_layer_norm = False + # SkipLayerNormalization is faster but might bring accuracy drop since it uses fp16 accumulation. 
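The Add-handling loop added to auto_mixed_precision is a small fixed-point propagation: an Add node is forced to fp32 whenever any parent or child is already blocked, and the pass repeats until nothing changes. A toy version on a hypothetical adjacency list shows the idea:

# Toy fixed-point propagation: an Add joins the fp32 set if any neighbor is already fp32.
# Hypothetical graph: node -> neighbors (parents and children combined).
neighbors = {
    "LayerNorm_1": ["Add_1"],
    "Add_1": ["LayerNorm_1", "Add_2"],
    "Add_2": ["Add_1", "MatMul_1"],
    "MatMul_1": ["Add_2"],
}
blocked = {"LayerNorm_1"}  # seeded by op types forced to run in fp32
add_nodes = ["Add_1", "Add_2"]

changed = True
while changed:
    changed = False
    for name in add_nodes:
        if name not in blocked and any(n in blocked for n in neighbors[name]):
            blocked.add(name)
            changed = True

print(sorted(blocked))  # Add_1 and then Add_2 get pulled into fp32 transitively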
+ optimization_options.enable_skip_layer_norm = not auto_mixed_precision m = optimize_model( onnx_model_path, model_type="t5", num_heads=num_attention_heads, hidden_size=hidden_size, - opt_level=2 if not use_external_data_format else 0, + opt_level=0, optimization_options=optimization_options, - use_gpu=False, - only_onnxruntime=not use_gpu, + use_gpu=use_gpu, ) if is_float16: if auto_mixed_precision: - T5Helper.auto_mixed_precision(m) + T5Helper.auto_mixed_precision(m, force_fp16_logits=force_fp16_io) else: - m.convert_model_float32_to_float16(cast_input_output=False) + m.convert_model_float32_to_float16(cast_input_output=force_fp16_io) m.save_model_to_file(optimized_model_path, use_external_data_format, all_tensors_to_one_file=True) @staticmethod def verify_onnx( - model: T5Encoder | T5Decoder | T5DecoderInit | T5EncoderDecoderInit, + model: T5Decoder | T5EncoderDecoderInit, ort_session: InferenceSession, device: torch.device, use_int32_inputs: bool, ): """Compare the result from PyTorch and OnnxRuntime to verify the ONNX model is good.""" - if isinstance(model, T5Encoder): - return T5EncoderHelper.verify_onnx(model, ort_session, device, use_int32_inputs) - if isinstance(model, T5EncoderDecoderInit): return T5EncoderDecoderInitHelper.verify_onnx(model, ort_session, device, use_int32_inputs) diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index c0310b3e8c663..8add38b5a7d07 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -1183,11 +1183,21 @@ def graph_topological_sort(graph, is_deterministic=False): graph.ClearField("node") graph.node.extend(sorted_nodes) - def topological_sort(self, is_deterministic=False): + def topological_sort(self, is_deterministic=False, dump_model_on_failure=False): # TODO: support graph_topological_sort() in subgraphs # for graph in self.graphs(): # self.graph_topological_sort(graph) - OnnxModel.graph_topological_sort(self.model.graph, is_deterministic) + try: + OnnxModel.graph_topological_sort(self.model.graph, is_deterministic) + except RuntimeError as e: + if dump_model_on_failure: + logger.info( + "Failed to sort graph in topological order. Dumping model to _topo_sort_failed.onnx for debugging." + ) + OnnxModel.save( + self.model, "_topo_sort_failed.onnx", save_as_external_data=True, all_tensors_to_one_file=True + ) + raise e @staticmethod def save( diff --git a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py index 33dcc7795a465..de299a970ffd3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_t5.py +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -34,13 +34,13 @@ def __init__( num_heads, attention_mask, use_multi_head_attention=False, - search_op_types=["SkipSimplifiedLayerNormalization", "Add"], + search_op_types=["Softmax"], ) self.static_kv = 1 - def create_attention_node( + def make_attention_node( self, - mask_index: str, + mask_index: str | None, q_matmul: NodeProto, k_matmul: NodeProto, v_matmul: NodeProto, @@ -48,8 +48,8 @@ def create_attention_node( hidden_size: int, input: str, output: str, - add_qk_str: str, - scale: float | None = None, + attn_bias: str | None, + scale: float, ) -> NodeProto | None: """Create an Attention node. 
Args: @@ -122,14 +122,17 @@ def create_attention_node( attention_node_name + "_qkv_weight", "", ] - if mask_index is not None: + if mask_index: attention_inputs.append(mask_index) else: attention_inputs.append("") - if add_qk_str is not None: + if attn_bias: attention_inputs.append("") # no past - attention_inputs.append(add_qk_str) + attention_inputs.append(attn_bias) + + while attention_inputs and attention_inputs[-1] == "": + attention_inputs.pop() attention_node = helper.make_node( "Attention", @@ -153,50 +156,55 @@ def create_mha_node( query: str, key: str, value: str, - mask_index: str, - res_pos_bias: str, - past_key: str, - past_value: str, + mask_index: str | None, + attn_bias: str | None, + past_key: str | None, + past_value: str | None, output: str, - present_key: str, - present_value: str, + present_key: str | None, + present_value: str | None, num_heads: int, hidden_size: int, ) -> NodeProto | None: - assert num_heads > 0 + assert num_heads > 0 and hidden_size > 0 and query and key and value - if hidden_size > 0 and (hidden_size % num_heads) != 0: + if (hidden_size % num_heads) != 0: logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") return None attention_node_name = self.model.create_node_name("MultiHeadAttention") attention_inputs = [ query, - "" if key is None else key, # key - "" if value is None else value, # value + key, + value, "", # bias ] - if mask_index is not None: + + if mask_index: attention_inputs.append(mask_index) else: attention_inputs.append("") - if res_pos_bias is not None: - attention_inputs.append(res_pos_bias) + if attn_bias: + attention_inputs.append(attn_bias) else: attention_inputs.append("") - if past_key is not None: - assert past_value is not None + if past_key: + assert past_value attention_inputs.append(past_key) attention_inputs.append(past_value) + while attention_inputs and attention_inputs[-1] == "": + attention_inputs.pop() + attention_outputs = [output] - if present_key is not None: - assert present_value is not None + if present_key: + assert present_value attention_outputs.append(present_key) attention_outputs.append(present_value) + print(f"{attention_inputs=}, {attention_outputs=}, {attention_node_name=}") attention_node = helper.make_node( "MultiHeadAttention", inputs=attention_inputs, @@ -213,21 +221,23 @@ def create_mha_node( self.increase_counter("MultiHeadAttention") return attention_node - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - self.fuse_t5_encoder(normalize_node, input_name_to_nodes, output_name_to_node) - self.fuse_t5_decoder(normalize_node, input_name_to_nodes, output_name_to_node) - - def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_node): - if normalize_node.op_type != "SkipSimplifiedLayerNormalization" and normalize_node.op_type != "Add": + def fuse(self, node, input_name_to_nodes, output_name_to_node): + if self.fuse_t5_encoder(node, input_name_to_nodes, output_name_to_node): return - qkv_nodes = self.model.match_parent_path( - normalize_node, ["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0], output_name_to_node + self.fuse_t5_decoder(node, input_name_to_nodes, output_name_to_node) + + def fuse_t5_encoder(self, softmax_node, input_name_to_nodes, output_name_to_node): + assert softmax_node.op_type == "Softmax" + qkv_nodes = self.model.match_child_path( + softmax_node, + ["MatMul", "Transpose", "Reshape"], + edges=[(0, 0), (0, 0), (0, 0)], + input_name_to_nodes=input_name_to_nodes, ) if qkv_nodes is None: - 
return - - _, reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes + return False + matmul_qkv, _, reshape_qkv = qkv_nodes qkv_shape_nodes = self.model.match_parent_path( reshape_qkv, @@ -236,7 +246,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no output_name_to_node, ) if qkv_shape_nodes is None: - return + return False input_shape_node = qkv_shape_nodes[-1] v_nodes = self.model.match_parent_path( @@ -246,7 +256,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no output_name_to_node, ) if v_nodes is None: - return + return False _, reshape_v, matmul_v = v_nodes # todo: check reshape_v parent nodes @@ -257,7 +267,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no output_name_to_node, ) if qk_nodes is None: - return + return False _, add_qk, matmul_qk = qk_nodes mask_nodes = self.model.match_parent_path( @@ -268,7 +278,9 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no ) is_pattern_for_one_graph_input = mask_nodes is None - if mask_nodes is None: + if mask_nodes is not None: + mul_node = mask_nodes[1] + else: # Pattern for SD3 and Flux. mask_nodes = self.model.match_parent_path( add_qk, @@ -276,15 +288,22 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no [1, 1, 0, 0, 1, 0], output_name_to_node, ) + + # If the model is not optimized by ORT, there might be an additional Cast node. if mask_nodes is None: - return + mask_nodes = self.model.match_parent_path( + add_qk, + ["Add", "Slice", "Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], + [1, 1, 0, 0, 1, 0, 0], + output_name_to_node, + ) + if mask_nodes is None: + return False mul_node = mask_nodes[2] - else: - mul_node = mask_nodes[1] _, mul_val = self.model.get_constant_input(mul_node) if mul_val is None: - return + return False if mul_val != -10000: self.mask_filter_value = float(mul_val) @@ -327,7 +346,7 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no [1, 0, 0], ) if rpb_nodes is None: - return + return False res_pos_bias = rpb_nodes[-1].output[0] @@ -337,8 +356,8 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no [1, 0, 0], ) if k_nodes is None: - return - _, reshape_k, matmul_k = k_nodes + return False + _, _, matmul_k = k_nodes # todo: check reshape_k parent nodes q_nodes = self.model.match_parent_path( @@ -347,50 +366,50 @@ def fuse_t5_encoder(self, normalize_node, input_name_to_nodes, output_name_to_no [0, 0, 0], ) if q_nodes is None: - return + return False - transpose_q, reshape_q, matmul_q = q_nodes + _, reshape_q, matmul_q = q_nodes # todo: check reshape_q parent nodes if matmul_q.input[0] != input_shape_node.input[0]: - return + return False q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) - new_node = self.create_attention_node( + new_node = self.make_attention_node( mask_index, matmul_q, matmul_k, matmul_v, - q_num_heads, - q_hidden_size, - input_shape_node.input[0], - reshape_qkv.output[0], - res_pos_bias, - 1.0, + num_heads=q_num_heads, + hidden_size=q_hidden_size, + input=input_shape_node.input[0], + output=reshape_qkv.output[0], + attn_bias=res_pos_bias, + scale=1.0, ) if new_node is None: - return + return False self.nodes_to_add.append(new_node) self.node_name_to_graph_name[new_node.name] = self.this_graph_name self.nodes_to_remove.append(reshape_qkv) self.prune_graph = True + return True - def fuse_t5_decoder(self, normalize_node, input_name_to_nodes, 
output_name_to_node): - if normalize_node.op_type != "SkipSimplifiedLayerNormalization" and normalize_node.op_type != "Add": - return + def fuse_t5_decoder(self, softmax_node, input_name_to_nodes, output_name_to_node): + assert softmax_node.op_type == "Softmax" - qkv_nodes = self.model.match_parent_path( - normalize_node, - ["MatMul", "Reshape", "Transpose", "MatMul"], - [1, 0, 0, 0], + qkv_nodes = self.model.match_child_path( + softmax_node, + ["MatMul", "Transpose", "Reshape"], + edges=[(0, 0), (0, 0), (0, 0)], + input_name_to_nodes=input_name_to_nodes, ) if qkv_nodes is None: return - - _, reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes + matmul_qkv, _transpose_qkv, reshape_qkv = qkv_nodes qkv_shape_nodes = self.model.match_parent_path( reshape_qkv, @@ -462,11 +481,17 @@ def fuse_t5_decoder(self, normalize_node, input_name_to_nodes, output_name_to_no ["Add", "Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [1, 1, 0, 1, 0, 0], ) - if mask_nodes is None: - return - mul_node = mask_nodes[1] - if mask_nodes[1].op_type != "Mul": - return + if mask_nodes is not None: + mul_node = mask_nodes[1] + else: + mask_nodes = self.model.match_parent_path( + add_qk, + ["Add", "Slice", "Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], + [1, 1, 0, 0, 1, 0, 0], + ) + if mask_nodes is None: + return + mul_node = mask_nodes[2] _, mul_val = self.model.get_constant_input(mul_node) if mul_val != -10000: @@ -474,22 +499,19 @@ def fuse_t5_decoder(self, normalize_node, input_name_to_nodes, output_name_to_no mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) else: - rpb_nodes = self.model.match_parent_path( + matched_path_index, _, _ = self.model.match_parent_paths( add_qk, - ["Add", "Slice"], - [1, 0], + [ + (["Add", "Slice"], [1, 0]), + (["Add", "RelativePositionBias"], [1, 0]), + ], + output_name_to_node, ) - if rpb_nodes is not None: - res_pos_bias = add_qk.input[1] - else: - rpb_nodes = self.model.match_parent_path( - add_qk, - ["Add", "RelativePositionBias"], - [1, 0], - ) - if rpb_nodes is None: - return - res_pos_bias = add_qk.input[1] + if matched_path_index < 0: + logger.debug("Skip MultiHeadAttention fusion since attention bias pattern not matched") + return + + res_pos_bias = add_qk.input[1] key = None past_key = None @@ -608,56 +630,73 @@ def fuse_t5_decoder(self, normalize_node, input_name_to_nodes, output_name_to_no past_key = None past_value = None + if not (key and value and q_num_heads > 0 and q_hidden_size > 0): + return + new_node = self.create_mha_node( - matmul_q.output[0], - key, - value, - mask_index, - res_pos_bias, - past_key, - past_value, - reshape_qkv.output[0], - present_key, - present_value, - q_num_heads, - q_hidden_size, + query=matmul_q.output[0], + key=key, + value=value, + mask_index=mask_index, + attn_bias=res_pos_bias, + past_key=past_key, + past_value=past_value, + output=reshape_qkv.output[0], + present_key=present_key, + present_value=present_value, + num_heads=q_num_heads, + hidden_size=q_hidden_size, ) - if new_node is None: - return - self.nodes_to_add.append(new_node) - self.node_name_to_graph_name[new_node.name] = self.this_graph_name + if new_node: + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name - self.nodes_to_remove.append(reshape_qkv) + # Since present_* is graph output, we need update the graph to avoid circular. 
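Because present_key/present_value are already graph outputs, the block that follows gives the old producer a renamed output and re-points its consumers, so the fused MultiHeadAttention node can take over the graph-output names without creating two producers. A minimal sketch of the same rename with hypothetical names (replace_input_of_all_nodes performs the consumer re-pointing in the real code):

old_name = "present_key_0"                 # hypothetical graph output name
producer = output_name_to_node[old_name]   # node currently writing that output
producer.output[0] = old_name + "_copy"
for consumer in input_name_to_nodes.get(old_name, []):
    for i, value in enumerate(consumer.input):
        if value == old_name:
            consumer.input[i] = old_name + "_copy"
# The new MultiHeadAttention node then lists "present_key_0" directly in its outputs.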
+ if present_key or present_value: + for graph_output in [present_key, present_value]: + if not (graph_output and self.model.find_graph_output(graph_output)): + logger.warning(f"{graph_output=} does not exist in graph output") + return + assert graph_output in output_name_to_node + output_name_to_node[graph_output].output[0] = graph_output + "_copy" + self.model.replace_input_of_all_nodes(graph_output, graph_output + "_copy") - self.prune_graph = True + self.nodes_to_remove.append(reshape_qkv) + self.prune_graph = False class FusionRelativePositionBiasBlock(Fusion): - def __init__(self, model: OnnxModel, max_distance: int): - super().__init__(model, "RelativePositionBias", ["Add", "Slice"]) - self.max_distance = max_distance - self.is_bidirectional = False + def __init__(self, model: OnnxModel): + super().__init__(model, "RelativePositionBias", ["Softmax"]) def fuse(self, node, input_name_to_nodes, output_name_to_node): - # TODO: Optimization opportunity: only last dimension of relative_position_bias is used in decoder. - # Cuda kernel can be optimized to only compute last dimension. - if node.op_type != "Add" and node.op_type != "Slice": - return - compute_bias_nodes = self.model.match_parent_path( - node, ["Unsqueeze", "Transpose", "Gather", "Where"], [0, 0, 0, 1], output_name_to_node + node, + ["Add", "Add", "Slice", "Unsqueeze", "Transpose", "Gather", "Where"], + [0, 1, 0, 0, 0, 0, 1], + output_name_to_node, ) + if compute_bias_nodes is None: compute_bias_nodes = self.model.match_parent_path( - node, ["Unsqueeze", "Transpose", "Gather", "Add", "Where"], [0, 0, 0, 1, 1], output_name_to_node + node, + ["Add", "Add", "Slice", "Unsqueeze", "Transpose", "Gather", "Add", "Where"], + [0, 1, 0, 0, 0, 0, 1, 1], + output_name_to_node, ) if compute_bias_nodes is None: return - gather = compute_bias_nodes[2] + gather = compute_bias_nodes[5] where = compute_bias_nodes[-1] - unsqueeze = compute_bias_nodes[0] + slice = compute_bias_nodes[2] + unsqueeze = compute_bias_nodes[3] + + # The current fusion does not remove the node until the whole graph is processed. + # This avoids fusing it again when it is shared by multiple layers. + if unsqueeze in self.nodes_to_remove: + return compute_buckets_nodes = self.model.match_parent_path( where, @@ -668,12 +707,8 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): if compute_buckets_nodes is None: return - # It is possible to deduce max_distance from a Div node: - # The value of self.model.get_constant_value(compute_buckets_nodes[-3].input[1]) is close to - # math.log(max_distance / (relative_attention_num_buckets // (4 if is_bidirectional else 2))) - # See https://github.com/huggingface/transformers/blob/608e163b527eaee41e650ffb9eb4c422d2679902/src/transformers/models/t5/modeling_t5.py#L397. - # Most t5 models use max_distance=128, so we hardcode it unitl we see a model with different value. - # TODO: maybe add a sanity check here. + # This value is used to compute max_distance later. 
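The constant read from the Div node is log(max_distance / (num_buckets // (4 if bidirectional else 2))), per the Hugging Face T5 implementation referenced above. Inverting it with the default T5 settings (max_distance=128, relative_attention_num_buckets=32, assumed here) recovers 128, which is what the sanity check below compares against:

import math

max_distance = 128
num_buckets = 32
is_bidirectional = True  # encoder self-attention

log_max = math.log(max_distance / (num_buckets // (4 if is_bidirectional else 2)))
print(round(log_max, 4))  # ~2.7726 for the bidirectional (encoder) case

recovered = round(math.exp(log_max) * (num_buckets // (4 if is_bidirectional else 2)))
print(recovered)          # 128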
+ log_max = self.model.get_constant_value(compute_buckets_nodes[-3].input[1]) div = compute_buckets_nodes[-1] @@ -683,21 +718,33 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): [0, 0, 0, 1, 0, 0, 0, 0], output_name_to_node, ) + + is_bidirectional = False if range_nodes is None: range_nodes = self.model.match_parent_path( div, ["Cast", "Abs", "Sub", "Unsqueeze", "Range"], [0, 0, 0, 0, 0], output_name_to_node ) - self.is_bidirectional = True + is_bidirectional = True if range_nodes is None: return - range_node = range_nodes[-1] - self.nodes_to_remove.append(unsqueeze) - self.prune_graph = True + # Double-check the constants related to max_distance and relative_attention_num_buckets. + # Most T5 models use max_distance=128, so we hardcode it until we see a model with a different value. + + # log_max is the value of the following formula: + # math.log(max_distance / (relative_attention_num_buckets // (4 if is_bidirectional else 2))) + # See https://github.com/huggingface/transformers/blob/608e163b527eaee41e650ffb9eb4c422d2679902/src/transformers/models/t5/modeling_t5.py#L397. + # Here is the value based on max_distance=128 and relative_attention_num_buckets=32: + max_distance = int(np.round(np.exp(log_max) * (32 // (4 if is_bidirectional else 2)))) + if max_distance != 128: + logger.warning( + f"max_distance is {max_distance}, which is different from the default value 128. " + "Please double check the model configuration." + ) node_name = self.model.create_node_name( - "RelativePositionBias", name_prefix="RelPosBias_" + ("encoder" if self.is_bidirectional else "decoder") + "RelativePositionBias", name_prefix="RelPosBias_" + ("encoder" if is_bidirectional else "decoder") ) table_weight_i = self.model.get_initializer(gather.input[0]) @@ -712,22 +759,64 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): vals=table_weight_t.tobytes(), raw=True, ) - self.model.add_initializer(bias_table, self.this_graph_name) + + # The relative position is computed as follows in the encoder: + # seq_len + # | + # Range(0, *) + # / \ + # Unsqueeze(axes=0) Unsqueeze(axes=1) + # \ / + # Sub + # | + # Abs + # + # The relative position is computed as follows in the decoder: + # past_seq_len seq_len + # \ / + # Add + # / \ + # Range(0, *) Range(0, *) + # \ / + # Sub + # Note that the graph will slice the attention bias to get the last seq_len rows. + # + # In newer versions of transformers, the decoder pattern changes to the following: + # + # total_seq_len Range(start=past_seq_len, end=total_seq_len) + # | | + # Range(0, *) Unsqueeze(axes=1) + # | | + # Unsqueeze(axes=0) Cast(to=int64) + # \ / + # Sub + # Currently, there is still a Slice to get the last seq_len rows, so the end result is the same. + # Note, however, that the shape of the bias tensor changes before the Slice. + # + # The RelativePositionBias operator requires query_length == key_length, so we pass in total_seq_len. + # Here we use the end value of the Range node as the length to pass to the RelativePositionBias node. + + # TODO: Optimization opportunity: change the RelativePositionBias op to support query_length != key_length. + # Only compute seq_len rows; then we can remove the Slice after the RelativePositionBias node. inputs = [bias_table.name, range_node.input[1], range_node.input[1]] - outputs = [unsqueeze.output[0]] + + # Use a new tensor name since the shape might be different as mentioned above. 
+ bias_output = node_name + "_rel_pos_bias" + slice.input[0] = bias_output + rpb_node = helper.make_node( "RelativePositionBias", inputs=inputs, - outputs=outputs, + outputs=[bias_output], name=node_name, ) rpb_node.domain = "com.microsoft" - rpb_node.attribute.extend([helper.make_attribute("max_distance", self.max_distance)]) - rpb_node.attribute.extend([helper.make_attribute("is_bidirectional", self.is_bidirectional)]) - - self.nodes_to_add.append(rpb_node) + rpb_node.attribute.extend([helper.make_attribute("max_distance", max_distance)]) + rpb_node.attribute.extend([helper.make_attribute("is_bidirectional", is_bidirectional)]) self.node_name_to_graph_name[rpb_node.name] = self.this_graph_name + self.nodes_to_add.append(rpb_node) + self.prune_graph = True class T5OnnxModel(BertOnnxModel): @@ -744,7 +833,7 @@ def __init__(self, model, num_heads: int = 0, hidden_size: int = 0): self.attention_fusion = FusionT5Attention(self, self.hidden_size, self.num_heads, self.attention_mask) self.layer_norm_fusion = FusionSimplifiedLayerNormalization(self) self.skip_layer_norm_fusion = FusionSkipSimplifiedLayerNormalization(self) - self.rpb_fusion = FusionRelativePositionBiasBlock(self, 128) + self.rpb_fusion = FusionRelativePositionBiasBlock(self) def fuse_attention(self): self.attention_fusion.apply() diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index df670012f54c0..367553a28f166 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -403,9 +403,6 @@ TEST(BeamSearchTest, GptBeamSearchFp16_VocabPadded) { } TEST(BeamSearchTest, DummyT5) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif // dummy_t5.onnx model generated using following command: // python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5.onnx ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5.onnx")); @@ -419,9 +416,6 @@ TEST(BeamSearchTest, DummyT5) { } TEST(BeamSearchTest, DummyT5WithOuterScopeInitializers) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif // dummy_t5_with_outer_scope_initializers.onnx model generated using following command: // python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5_with_outer_scope_initializers.onnx --move-initializers ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5_with_outer_scope_initializers.onnx")); @@ -448,9 +442,6 @@ TEST(BeamSearchTest, DummyT5WithSequenceInputIds) { } TEST(BeamSearchTest, DummyT5PointerGenerator) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif // dummy_t5_pointer_generator.onnx model generated using following command: // python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5_pointer_generator.onnx --decoder-needs-input-ids ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5_pointer_generator.onnx")); diff --git a/onnxruntime/test/mlas/bench/bench_qnbitgemm.cpp b/onnxruntime/test/mlas/bench/bench_qnbitgemm.cpp index 64d229889214b..88c036ac4854e 100644 --- a/onnxruntime/test/mlas/bench/bench_qnbitgemm.cpp +++ b/onnxruntime/test/mlas/bench/bench_qnbitgemm.cpp @@ -63,13 +63,13 @@ void RunQNBitGemmBenchmark(size_t BlkLen, tp.get()); std::unique_ptr Workspace; - if (const auto WorkspaceSize = MlasQNBitGemmBatchWorkspaceSize(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType); + if (const auto WorkspaceSize = MlasQNBitGemmBatchWorkspaceSize(M, N, K, 1, 
BlkBitWidth, BlkLen, !Symmetric, ComputeType); WorkspaceSize > 0) { Workspace = std::make_unique(WorkspaceSize); } std::unique_ptr PackedQuantBData; - if (const auto PackedQuantBDataSize = MlasQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, ComputeType); + if (const auto PackedQuantBDataSize = MlasQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, !Symmetric, ComputeType); PackedQuantBDataSize > 0) { PackedQuantBData = std::make_unique(PackedQuantBDataSize); MlasQNBitGemmPackQuantBData(N, K, BlkBitWidth, BlkLen, ComputeType, QuantBData.data(), PackedQuantBData.get(), diff --git a/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp b/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp index e22018ae2877f..16af51cfaa12d 100644 --- a/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp @@ -265,13 +265,13 @@ class MlasSQNBitGemmTest : public MlasTestBase { } void* Workspace = nullptr; - if (const auto WorkspaceSize = MlasQNBitGemmBatchWorkspaceSize(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType); + if (const auto WorkspaceSize = MlasQNBitGemmBatchWorkspaceSize(M, N, K, 1, BlkBitWidth, BlkLen, !Symmetric, ComputeType); WorkspaceSize > 0) { Workspace = BufferWorkspace.GetBuffer(WorkspaceSize); } void* PackedQuantBDataWorkspace = nullptr; - if (const auto PackedQuantBDataSize = MlasQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, ComputeType); + if (const auto PackedQuantBDataSize = MlasQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, !Symmetric, ComputeType); PackedQuantBDataSize > 0) { PackedQuantBDataWorkspace = BufferPackedQuantBData.GetBuffer(PackedQuantBDataSize); bool has_zp_input = QuantBZeroPoint != nullptr; diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index dd8cbed15e5ef..504e645738344 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -158,6 +158,112 @@ std::vector> GenerateTestCases() { // clang-format on })}); +#ifdef USE_WEBGPU + test_cases.push_back( + {"test 3D tensors with M = 1", + {6, 1, 8}, + {1, 8, 3}, + {6, 1, 3}, + real_expected_vals({ + // clang-format off + 420, 448, 476, + 1092, 1184, 1276, + 1764, 1920, 2076, + 2436, 2656, 2876, + 3108, 3392, 3676, + 3780, 4128, 4476, + // clang-format on + })}); + + test_cases.push_back( + {"test 4D tensors with M = 1", + {2, 3, 1, 8}, + {1, 1, 8, 3}, + {2, 3, 1, 3}, + real_expected_vals({420, 448, 476, 1092, 1184, 1276, 1764, 1920, 2076, 2436, 2656, 2876, 3108, 3392, 3676, 3780, 4128, 4476})}); + + test_cases.push_back( + {"test 4D tensors", + {2, 3, 4, 3}, + {2, 3, 3, 5}, + {2, 3, 4, 5}, + real_expected_vals({ + // clang-format off + 25, 28, 31, 34, 37, 70, 82, 94, 106, 118, 115, 136, 157, 178, 199, 160, 190, 220, + 250, 280, 790, 829, 868, 907, 946, 970, 1018, 1066, 1114, 1162, 1150, 1207, 1264, + 1321, 1378, 1330, 1396, 1462, 1528, 1594, 2635, 2710, 2785, 2860, 2935, 2950, 3034, + 3118, 3202, 3286, 3265, 3358, 3451, 3544, 3637, 3580, 3682, 3784, 3886, 3988, 5560, + 5671, 5782, 5893, 6004, 6010, 6130, 6250, 6370, 6490, 6460, 6589, 6718, 6847, 6976, + 6910, 7048, 7186, 7324, 7462, 9565, 9712, 9859, 10006, 10153, 10150, 10306, 10462, + 10618, 10774, 10735, 10900, 11065, 11230, 11395, 11320, 11494, 11668, 11842, 12016, + 14650, 14833, 15016, 15199, 15382, 15370, 15562, 15754, 15946, 16138, 16090, 16291, + 16492, 16693, 16894, 16810, 17020, 17230, 17440, 17650 + // clang-format on + })}); + + // Test case: multiplies 2D 
broadcasted to 4D tensors + test_cases.push_back( + {"test 2D broadcasted to 4D tensors", + {2, 4}, + {4, 3, 2, 4, 2}, + {4, 3, 2, 2, 2}, + real_expected_vals({ + // clang-format off + 28, 34, 76, 98, 76, 82, 252, 274, 124, 130, 428, 450, 172, 178, 604, 626, + 220, 226, 780, 802, 268, 274, 956, 978, 316, 322, 1132, 1154, 364, 370, + 1308, 1330, 412, 418, 1484, 1506, 460, 466, 1660, 1682, 508, 514, 1836, + 1858, 556, 562, 2012, 2034, 604, 610, 2188, 2210, 652, 658, 2364, 2386, + 700, 706, 2540, 2562, 748, 754, 2716, 2738, 796, 802, 2892, 2914, 844, + 850, 3068, 3090, 892, 898, 3244, 3266, 940, 946, 3420, 3442, 988, 994, + 3596, 3618, 1036, 1042, 3772, 3794, 1084, 1090, 3948, 3970, 1132, 1138, + 4124, 4146 + // clang-format on + })}); + + // Test case: multiplies 4D broadcasted to 5D tensors + test_cases.push_back( + {"test 4D broadcasted to 5D tensors", + {3, 1, 2, 4}, + {4, 3, 2, 4, 2}, + {4, 3, 2, 2, 2}, + real_expected_vals({ + // clang-format off + 28, 34, 76, 98, 76, 82, 252, 274, 732, 770, 1036, 1090, 1036, 1074, 1468, + 1522, 2460, 2530, 3020, 3106, 3020, 3090, 3708, 3794, 316, 322, 1132, + 1154, 364, 370, 1308, 1330, 2556, 2594, 3628, 3682, 2860, 2898, 4060, + 4114, 5820, 5890, 7148, 7234, 6380, 6450, 7836, 7922, 604, 610, 2188, + 2210, 652, 658, 2364, 2386, 4380, 4418, 6220, 6274, 4684, 4722, 6652, + 6706, 9180, 9250, 11276, 11362, 9740, 9810, 11964, 12050, 892, 898, 3244, + 3266, 940, 946, 3420, 3442, 6204, 6242, 8812, 8866, 6508, 6546, 9244, + 9298, 12540, 12610, 15404, 15490, 13100, 13170, 16092, 16178 + + // clang-format on + })}); + + // Test case: same ranks different broadcast small 1 + test_cases.push_back( + {"test same ranks different broadcast small 1", + {2, 1, 2, 2}, + {1, 2, 2, 1}, + {2, 2, 2, 1}, + real_expected_vals({1, 3, 3, 13, 5, 7, 23, 33})}); + + // Test case: same ranks different broadcast larger 0 + test_cases.push_back( + {"test same ranks different broadcast larger 0", + {1, 2, 2, 8}, + {2, 1, 8, 1}, + {2, 2, 2, 1}, + real_expected_vals({140, 364, 588, 812, 364, 1100, 1836, 2572})}); + + // Test case: same ranks different broadcast larger 1 + test_cases.push_back( + {"test same ranks different broadcast larger 1", + {2, 1, 2, 8}, + {1, 2, 8, 1}, + {2, 2, 2, 1}, + real_expected_vals({140, 364, 364, 1100, 588, 812, 1836, 2572})}); +#endif return test_cases; } @@ -189,6 +295,17 @@ void RunMatMulTest(int32_t opset_version, bool is_a_constant, bool is_b_constant excluded_providers.insert(kNnapiExecutionProvider); } + // TODO:: Change MatMulNaive Shader to support these test cases webgpu + std::unordered_set webgpu_excluded_test_cases{ + "test left 1D", + "test right 1D", + "test 2D empty input"}; + + // if test in webgpu_excluded_test_cases, add webgpu to excluded_providers + if (webgpu_excluded_test_cases.find(t.name) != webgpu_excluded_test_cases.end()) { + excluded_providers.insert(kWebGpuExecutionProvider); + } + test.ConfigExcludeEps(excluded_providers) .Config(run_with_tunable_op) .RunWithConfig(); @@ -234,10 +351,18 @@ TEST(MathOpTest, MatMulDoubleType) { } TEST(MathOpTest, MatMulInt32Type) { + // Webgpu does not support int32 matmul + if (DefaultWebGpuExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: Webgpu does not support int32 matmul"; + } RunMatMulTest(9); } TEST(MathOpTest, MatMulUint32Type) { + // Webgpu does not support uint32 matmul + if (DefaultWebGpuExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: Webgpu does not support uint32 matmul"; + } 
RunMatMulTest(9); } @@ -263,16 +388,22 @@ void RunMatMulZeroKTest() { // No special case is implemented. test.ConfigExcludeEps({kCoreMLExecutionProvider, kNnapiExecutionProvider, kDmlExecutionProvider, kDnnlExecutionProvider, kQnnExecutionProvider, - kOpenVINOExecutionProvider}) + kOpenVINOExecutionProvider, kWebGpuExecutionProvider}) .Config(run_with_tunable_op) .RunWithConfig(); } TEST(MathOpTest, MatMulZeroKFloatType) { + if (DefaultWebGpuExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: Webgpu does not support zero-sized tensor"; + } RunMatMulZeroKTest(); } TEST(MathOpTest, MatMulZeroKInt32Type) { + if (DefaultWebGpuExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: Webgpu does not support zero-sized tensor"; + } RunMatMulZeroKTest(); } diff --git a/onnxruntime/test/providers/cpu/math/softmax_test.cc b/onnxruntime/test/providers/cpu/math/softmax_test.cc index 1c6375ebdb0b1..03f5fdaab2780 100644 --- a/onnxruntime/test/providers/cpu/math/softmax_test.cc +++ b/onnxruntime/test/providers/cpu/math/softmax_test.cc @@ -422,7 +422,8 @@ TEST(SoftmaxOperator, GH15949_regression_test) { {0.00032932f, 0.01798029f, 0.9816904f}); // disable TRT as it does not support axis=0 as used by the model - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + // TODO: Fix the Softmax operator of WebGPU EP. + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kWebGpuExecutionProvider}); } } // namespace test diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index c14fc1fb62ae5..24cdb818b9e32 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK) || defined(USE_WEBGPU) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" @@ -280,7 +280,10 @@ TEST(PoolFp16Test, MaxPool_Dilation_Ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + + // TODO: Enable the case for WebGPU once ceil is supported. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kTensorrtExecutionProvider, kAclExecutionProvider, kWebGpuExecutionProvider}); } TEST(PoolTest, MaxPool_DilationPadding_3d) { @@ -484,7 +487,10 @@ TEST(PoolFp16Test, AveragePool_10_ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider}); + + // TODO: Enable the case for WebGPU once ceil is supported. 
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kTensorrtExecutionProvider, kAclExecutionProvider, kWebGpuExecutionProvider}); } TEST(PoolFp16Test, GlobalAveragePool) { diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index f1d612276174f..b6eb812e6a399 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -178,9 +178,10 @@ static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order = 0) { storage_order == 0 ? test.AddOutput("Indices", expected_dims, expected_indices_row) : test.AddOutput("Indices", expected_dims, expected_indices_col); } + // TODO: Enable the case for WebGPU once WGSL can support int64. test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kDnnlExecutionProvider, kTensorrtExecutionProvider, - kAclExecutionProvider, kArmNNExecutionProvider, kOpenVINOExecutionProvider}); + {kDnnlExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider, kArmNNExecutionProvider, + kOpenVINOExecutionProvider, kWebGpuExecutionProvider}); } TEST(PoolTest, MaxPool_8_With_Index) { @@ -268,8 +269,10 @@ static void MaxPool1D_8_WithIndexTest(int64_t storage_order) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); + + // TODO: Enable the case for WebGPU once WGSL can support int64. test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider, kWebGpuExecutionProvider}); } TEST(PoolTest, MaxPool1D_8_With_Index) { @@ -641,8 +644,10 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); + + // TODO: Enable the case for WebGPU once ceil is supported. test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider, kWebGpuExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_3d) { @@ -1000,8 +1005,10 @@ TEST(PoolTest, AveragePool_10_ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); + + // TODO: Enable the case for WebGPU once ceil is supported. test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider, kWebGpuExecutionProvider}); } TEST(PoolTest, AveragePool_19_dilation_2d) { @@ -1817,8 +1824,10 @@ TEST(PoolTest, MaxPoolDimWithZeroForN) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); + + // TODO: Fix WebGPU Transpose error: "Invalid dispatch group size (0, 1, 1)". 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kQnnExecutionProvider}); + {kTensorrtExecutionProvider, kQnnExecutionProvider, kWebGpuExecutionProvider}); } } // namespace test diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index d311b4b8517cf..5267ffcc65ab7 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -644,6 +644,87 @@ def test_matmulnbits(self): ] self._check_shapes(graph, inferred.graph, expected_shapes) + def test_qlinear_binary(self): + """ + Test ONNX QLinearAdd op ('com.microsoft' domain). . + Check that the output shape is propagated from the inputs to the op with broadcasting. + """ + initializers = [ + helper.make_tensor( + "A_scale", + TensorProto.FLOAT, + [], + [0.7], + ), + helper.make_tensor( + "A_zero_point", + TensorProto.UINT8, + [], + [158], + ), + helper.make_tensor( + "B_scale", + TensorProto.FLOAT, + [], + [0.02], + ), + helper.make_tensor( + "B_zero_point", + TensorProto.UINT8, + [], + [5], + ), + helper.make_tensor( + "C_scale", + TensorProto.FLOAT, + [], + [0.26], + ), + helper.make_tensor( + "C_zero_point", + TensorProto.UINT8, + [], + [0], + ), + ] + + nodes = [ + helper.make_node( + "QLinearAdd", + inputs=[ + "A", + "A_scale", + "A_zero_point", + "B", + "B_scale", + "B_zero_point", + "C_scale", + "C_zero_point", + ], + outputs=["C"], + domain="com.microsoft", + ), + ] + + inputs = [ + helper.make_tensor_value_info("A", TensorProto.UINT8, ["b", 4, 128]), + helper.make_tensor_value_info("B", TensorProto.UINT8, ["b", 1, 4, 1, 128]), + ] + + outputs = [ + helper.make_tensor_value_info("C", TensorProto.UNDEFINED, None), + ] + + graph = helper.make_graph(nodes, "QLinearAdd_Test", inputs, outputs, initializers) + model = helper.make_model(graph) + + inferred = SymbolicShapeInference.infer_shapes(model, auto_merge=True) + + expected_shapes = [ + helper.make_tensor_value_info("C", TensorProto.UINT8, ["b", 1, 4, 4, 128]), + ] + self._check_shapes(graph, inferred.graph, expected_shapes) + class TestSymbolicShapeInferenceForSlice(unittest.TestCase): def check_slice_of_concat(self, input_dims, start, end, step, expected_output_dim): diff --git a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py index 42682d67e94ec..2d29135726839 100644 --- a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py +++ b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py @@ -6,22 +6,28 @@ import unittest import numpy as np -from mpi4py import MPI from onnx import TensorProto, helper import onnxruntime -np.random.seed(3) +try: + from mpi4py import MPI + + comm = MPI.COMM_WORLD +except (ImportError, RuntimeError): + comm = None -comm = MPI.COMM_WORLD +has_mpi = comm is not None + +np.random.seed(3) def get_rank(): - return comm.Get_rank() + return comm.Get_rank() if comm else 0 def get_size(): - return comm.Get_size() + return comm.Get_size() if comm else 0 def print_out(*args): @@ -254,7 +260,7 @@ def run_ort_with_parity_check( ) -def test_moe_with_tensor_parallelism( +def run_moe_with_tensor_parallelism( hidden_size, inter_size, num_experts, @@ -327,7 +333,7 @@ def get_fc2_tensor_shards(expert_weights): ) -def test_moe_with_expert_parallelism( +def run_moe_with_expert_parallelism( hidden_size, inter_size, 
num_experts, @@ -390,19 +396,22 @@ def test_moe_with_expert_parallelism( class TestMoE(unittest.TestCase): def test_moe_parallelism(self): + if not has_mpi: + self.skipTest("No MPI support") + for hidden_size in [128, 1024]: for inter_size in [512, 2048]: for num_experts in [64]: for num_rows in [1024]: print_out("EP") - test_moe_with_expert_parallelism( + run_moe_with_expert_parallelism( hidden_size, inter_size, num_experts, num_rows, ) print_out("TP") - test_moe_with_tensor_parallelism( + run_moe_with_tensor_parallelism( hidden_size, inter_size, num_experts, diff --git a/onnxruntime/test/python/transformers/test_generation.py b/onnxruntime/test/python/transformers/test_generation.py index 7a94519c92bc8..c5cf8a07f557d 100644 --- a/onnxruntime/test/python/transformers/test_generation.py +++ b/onnxruntime/test/python/transformers/test_generation.py @@ -28,6 +28,10 @@ from onnxruntime.transformers.models.whisper.convert_to_onnx import main as run_whisper +def has_cuda_environment(): + return torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers() + + class TestBeamSearchGpt(unittest.TestCase): """Test BeamSearch for GPT-2 model""" @@ -49,7 +53,7 @@ def setUp(self): # "The selloff in tech shares deepened", # "Abortion rights take center stage", ] - self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers() + self.enable_cuda = has_cuda_environment() self.remove_onnx_files() def tearDown(self): @@ -176,112 +180,253 @@ def test_external_data(self): ) -class TestBeamSearchT5(unittest.TestCase): - """Test BeamSearch for T5 model""" +def get_tiny_t5_model_dir(): + """Get the path to the tiny T5 model directory.""" + # This function is used to get the path to the tiny T5 model directory. + # It is used in the TestBeamSearchT5 and TestBeamSearchT5Fp16 classes. - def setUp(self): - self.model_name = "t5-small" - self.decoder_onnx_path = os.path.join(".", "onnx_models", "t5-small_decoder.onnx") - self.encoder_onnx_path = os.path.join(".", "onnx_models", "t5-small_encoder_decoder_init.onnx") - self.beam_search_onnx_path = os.path.join(".", "onnx_models", "t5_small_beam_search.onnx") - self.default_arguments = [ - f"-m {self.model_name}", + # Path relative to the build\Release directory, where transformers test is launched in pipeline. + tiny_model_dir = os.path.join( + "testdata", + "transformers", + "tiny_t5", + ) + if os.path.exists(tiny_model_dir): + return os.path.normpath(tiny_model_dir) + + # The path is relative to the current file's directory. 
+ tiny_model_dir = os.path.join( + os.path.dirname(__file__), + "..", + "..", + "testdata", + "transformers", + "tiny_t5", + ) + return os.path.normpath(tiny_model_dir) + + +use_tiny_model = True + + +class TestBeamSearchT5(unittest.TestCase): + """Test BeamSearch for T5 model with fp32 in CPU""" + + @classmethod + def setUpClass(cls): + tiny_model_dir = get_tiny_t5_model_dir() + model_name = "tiny_t5" if use_tiny_model and os.path.exists(tiny_model_dir) else "t5-small" + cls.model_name = tiny_model_dir if model_name == "tiny_t5" else "t5-small" + cls.decoder_onnx_path = os.path.join(".", "t5_onnx_models", f"{model_name}_decoder.onnx") + cls.encoder_onnx_path = os.path.join(".", "t5_onnx_models", f"{model_name}_encoder.onnx") + cls.beam_search_onnx_path = os.path.join(".", "t5_onnx_models", f"{model_name}_beam_search.onnx") + cls.default_arguments = [ + f"-m {cls.model_name}", "--model_type t5", - f"--decoder_onnx {self.decoder_onnx_path}", - f"--encoder_decoder_init_onnx {self.encoder_onnx_path}", - f"--output {self.beam_search_onnx_path}", + f"--decoder_onnx {cls.decoder_onnx_path}", + f"--encoder_decoder_init_onnx {cls.encoder_onnx_path}", + f"--output {cls.beam_search_onnx_path}", "--output_sequences_score", "--repetition_penalty 2.0", ] - self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers() + # Remove onnx files if existed for any reason. + cls.remove_onnx_files() - export_t5_onnx_models( - self.model_name, + # This is in class setup so that we only export t5 model once. + paths = export_t5_onnx_models( + cls.model_name, os.path.join(".", "cache_models"), - os.path.join(".", "onnx_models"), + os.path.join(".", "t5_onnx_models"), use_gpu=False, use_external_data_format=False, optimize_onnx=False, - precision=Precision.FLOAT32, + precision=Precision.FLOAT32.value, verbose=False, use_decoder_start_token=False, - merge_encoder_and_decoder_init=True, overwrite=True, disable_auto_mixed_precision=False, use_int32_inputs=True, ) + assert len(paths) == 2 - self.sentences = [ + cls.sentences = [ "translate English to French: The product is released", "summarize: research continues to show that pets bring real health benefits to their owners. 
Having a dog around can lead to lower levels of stress for both adults and kids.", ] - if os.path.exists(self.beam_search_onnx_path): - os.remove(self.beam_search_onnx_path) + @classmethod + def remove_onnx_files(cls, beam_search_onnx_only: bool = False): + if os.path.exists(cls.beam_search_onnx_path): + os.remove(cls.beam_search_onnx_path) + if os.path.exists(cls.beam_search_onnx_path + ".data"): + os.remove(cls.beam_search_onnx_path + ".data") - def tearDown(self): - self.remove_onnx_files() + if not beam_search_onnx_only: + if os.path.exists(cls.encoder_onnx_path): + os.remove(cls.encoder_onnx_path) + if os.path.exists(cls.decoder_onnx_path): + os.remove(cls.decoder_onnx_path) - def remove_onnx_files(self): - if os.path.exists(self.beam_search_onnx_path): - os.remove(self.beam_search_onnx_path) + @classmethod + def tearDownClass(cls): + # cls.remove_onnx_files() + pass - if os.path.exists(self.decoder_onnx_path): - os.remove(self.decoder_onnx_path) + def setUp(self): + pass - if os.path.exists(self.encoder_onnx_path): - os.remove(self.encoder_onnx_path) + def tearDown(self): + # self.remove_onnx_files(beam_search_onnx_only=True) + pass - def run_beam_search(self, extra_arguments: str, sentences=None, append_arguments=True): - if append_arguments: - arguments = " ".join([*self.default_arguments, extra_arguments]).split() - else: - arguments = extra_arguments.split() + def run_beam_search(self, extra_arguments: str): + arguments = " ".join([*self.default_arguments, extra_arguments]).split() # Test CPU - result = run(arguments, sentences=self.sentences if sentences is None else sentences) + result = run(arguments) self.assertTrue(result["parity"], f"ORT and PyTorch result is different on CPU for arguments {arguments}") - # Test GPU - if self.enable_cuda: - if "--use_gpu" not in arguments: - arguments.append("--use_gpu") - result = run(arguments, sentences=self.sentences if sentences is None else sentences) - self.assertTrue(result["parity"], f"ORT and PyTorch result is different on GPU for arguments {arguments}") - - os.remove(self.beam_search_onnx_path) - - @pytest.mark.slow def test_return_sequences(self): for return_sequences in [1, 2]: self.run_beam_search(f"--num_return_sequences {return_sequences}") - @pytest.mark.slow def test_early_stopping(self): self.run_beam_search("--early_stopping") - @pytest.mark.slow def test_length_penalty(self): for length_penalty in [0.5, 2.0]: self.run_beam_search(f"--length_penalty {length_penalty}") - @pytest.mark.slow def test_no_repeat_ngram(self): for ngram_size in [1, 2]: self.run_beam_search(f"--no_repeat_ngram_size {ngram_size}") - @pytest.mark.slow def test_custom_attention_mask(self): self.run_beam_search("--custom_attention_mask") - @pytest.mark.slow def test_external_data(self): - self.run_beam_search( - f"-m t5-small --model_type t5 -e --output {self.beam_search_onnx_path}", - sentences=None, - append_arguments=False, - ) + self.run_beam_search("-e") + + +@unittest.skipUnless( + has_cuda_environment(), + "skip since there is no cuda environment.", +) +class TestBeamSearchT5Fp16(unittest.TestCase): + """Test BeamSearch for T5 model with fp16 in GPU""" + + @classmethod + def setUpClass(cls): + tiny_model_dir = get_tiny_t5_model_dir() + tiny_model_dir = os.path.normpath(tiny_model_dir) + cls.model_name = "tiny_t5" if use_tiny_model and os.path.exists(tiny_model_dir) else "t5-small" + cls.model_id = tiny_model_dir if cls.model_name == "tiny_t5" else "t5-small" + cls.beam_search_onnx_path = os.path.join(".", "onnx_models", 
f"{cls.model_name}_beam_search_fp16.onnx") + cls.default_arguments = [ + f"-m {cls.model_id}", + "--model_type t5", + f"--output {cls.beam_search_onnx_path}", + "--min_length 2", + "--max_length 16", + "--use_gpu", + "-p fp16", + ] + + cls.sentences = [ + "translate English to French: The product is released", + "summarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.", + ] + + cls.remove_onnx_files() + + @classmethod + def remove_onnx_files(cls): + model_name = cls.model_name + for file in [ + f"{model_name}_beam_search_fp16.onnx", + f"{model_name}_encoder.onnx", + f"{model_name}_encoder_fp16.onnx", + f"{model_name}_decoder.onnx", + f"{model_name}_decoder_fp16.onnx", + ]: + if os.path.exists(os.path.join(".", "onnx_models", file)): + os.remove(os.path.join(".", "onnx_models", file)) + if os.path.exists(os.path.join(".", "onnx_models", file + ".data")): + os.remove(os.path.join(".", "onnx_models", file + ".data")) + + def setUp(self): + pass + + def tearDown(self): + self.remove_onnx_files() + + def check_encoder_fusion(self): + model_name = self.model_name + onnx_path = os.path.join(".", "onnx_models", f"{model_name}_encoder_fp16.onnx") + + model = onnx.load_model(onnx_path, format=None, load_external_data=True) + from onnxruntime.transformers.onnx_model import OnnxModel + + onnx_model = OnnxModel(model) + op_counters = onnx_model.get_operator_statistics() + print("encoder ops", op_counters) + + expected_node_count = { + "RelativePositionBias": 1, + "SimplifiedLayerNormalization": 5 if use_tiny_model else 13, + "Attention": 2 if use_tiny_model else 6, + } + for key, value in expected_node_count.items(): + self.assertIn(key, op_counters, f"Expected {key} to be in op_counters") + self.assertEqual(op_counters[key], value, f"Expected {key} to be {value}, but got {op_counters[key]}") + + def check_decoder_fusion(self): + model_name = self.model_name + onnx_path = os.path.join(".", "onnx_models", f"{model_name}_decoder_fp16.onnx") + + model = onnx.load_model(onnx_path, format=None, load_external_data=True) + from onnxruntime.transformers.onnx_model import OnnxModel + + onnx_model = OnnxModel(model) + op_counters = onnx_model.get_operator_statistics() + print("decoder ops", op_counters) + + expected_node_count = { + "RelativePositionBias": 1, + "SimplifiedLayerNormalization": 7 if use_tiny_model else 19, + "MultiHeadAttention": 4 if use_tiny_model else 12, + } + for key, value in expected_node_count.items(): + self.assertIn(key, op_counters, f"Expected {key} to be in op_counters") + self.assertEqual(op_counters[key], value, f"Expected {key} to be {value}, but got {op_counters[key]}") + + def run_beam_search(self, extra_arguments: str): + arguments = " ".join([*self.default_arguments, extra_arguments]).split() + result = run(arguments) + self.assertTrue(result["parity"], f"ORT and PyTorch result is different on GPU for arguments {arguments}") + + def test_return_sequences(self): + for return_sequences in [1, 2]: + self.run_beam_search(f"--num_return_sequences {return_sequences}") + + def test_early_stopping(self): + self.run_beam_search("--early_stopping") + + def test_length_penalty(self): + for length_penalty in [0.5, 2.0]: + self.run_beam_search(f"--length_penalty {length_penalty}") + + def test_no_repeat_ngram(self): + for ngram_size in [1, 2]: + self.run_beam_search(f"--no_repeat_ngram_size {ngram_size}") + + def test_external_data(self): + self.run_beam_search("-e") + + # 
Ensure fusion is done correctly. + self.check_encoder_fusion() + self.check_decoder_fusion() class TestBeamSearchWhisper(unittest.TestCase): @@ -294,7 +439,7 @@ def setUp(self): self.decoder_onnx_path = os.path.join(".", self.onnx_folder, "whisper-tiny_decoder.onnx") self.encoder_onnx_path = os.path.join(".", self.onnx_folder, "whisper-tiny_encoder.onnx") self.beam_search_onnx_path = os.path.join(".", self.onnx_folder, "whisper-tiny_beamsearch.onnx") - self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers() + self.enable_cuda = has_cuda_environment() self.base_arguments = [ "-m", diff --git a/onnxruntime/test/python/transformers/test_flash_attn_cuda.py b/onnxruntime/test/python/transformers/test_gqa_cuda.py similarity index 79% rename from onnxruntime/test/python/transformers/test_flash_attn_cuda.py rename to onnxruntime/test/python/transformers/test_gqa_cuda.py index a74d5389e9047..3923b229a0bff 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn_cuda.py +++ b/onnxruntime/test/python/transformers/test_gqa_cuda.py @@ -17,7 +17,6 @@ import numpy import torch -from bert_padding import pad_input, unpad_input from einops import rearrange, repeat from onnx import TensorProto, helper from packaging import version @@ -39,20 +38,16 @@ class Formats: class Config: batch_size = 0 sequence_length = 0 - kv_sequence_length = 0 - past_sequence_length = 0 + kv_sequence_length = 0 # this is past sequence length when there is past state. num_heads = 0 kv_num_heads = 0 head_size = 0 ep = "CUDAExecutionProvider" - def __init__( - self, batch_size, sequence_length, kv_sequence_length, past_sequence_length, num_heads, kv_num_heads, head_size - ): + def __init__(self, batch_size, sequence_length, kv_sequence_length, num_heads, kv_num_heads, head_size): self.batch_size = batch_size self.sequence_length = sequence_length self.kv_sequence_length = kv_sequence_length - self.past_sequence_length = past_sequence_length self.num_heads = num_heads self.kv_num_heads = kv_num_heads self.head_size = head_size @@ -61,7 +56,7 @@ def __repr__(self): short_ep = self.ep[: -len("ExecutionProvider")].lower() return ( f"Config(batch_size={self.batch_size}, sequence_length={self.sequence_length}, " - f"kv_sequence_length={self.kv_sequence_length}, past_sequence_length={self.past_sequence_length}, " + f"kv_sequence_length={self.kv_sequence_length}, " f"num_heads={self.num_heads}, kv_num_heads={self.kv_num_heads}, head_size={self.head_size}, ep={short_ep})" ) @@ -103,118 +98,6 @@ def __repr__(self): ) -def create_packed_multihead_attention_graph(config): - nodes = [ - helper.make_node( - "PackedMultiHeadAttention", - [ - "query", - "", - "", - "", - "token_offset", - "cumulative_sequence_length", - ], - ["output"], - "PackedMultiHeadAttention_0", - num_heads=config.num_heads, - domain="com.microsoft", - ), - ] - - graph = helper.make_graph( - nodes, - "PackedMultiHeadAttention_Graph", - [ - helper.make_tensor_value_info( - "query", - TensorProto.FLOAT16, - [ - -1, - config.num_heads, - 3, - config.head_size, - ], - ), - helper.make_tensor_value_info( - "token_offset", TensorProto.INT32, [config.batch_size, config.sequence_length] - ), - helper.make_tensor_value_info("cumulative_sequence_length", TensorProto.INT32, [config.batch_size + 1]), - ], - [ - helper.make_tensor_value_info( - "output", - TensorProto.FLOAT16, - [-1, config.num_heads * config.head_size], - ), - ], - ) - - model = helper.make_model(graph) - return model.SerializeToString() - - -def 
create_multihead_attention_graph(config): - nodes = [ - helper.make_node( - "MultiHeadAttention", - [ - "query", - "key", - "value", - ], - ["output"], - "MultiHeadAttention_0", - num_heads=config.num_heads, - domain="com.microsoft", - ), - ] - - graph = helper.make_graph( - nodes, - "MultiHeadAttention_Graph", - [ - helper.make_tensor_value_info( - "query", - TensorProto.FLOAT16, - [ - config.batch_size, - config.sequence_length, - config.num_heads * config.head_size, - ], - ), - helper.make_tensor_value_info( - "key", - TensorProto.FLOAT16, - [ - config.batch_size, - config.kv_sequence_length, - config.num_heads * config.head_size, - ], - ), - helper.make_tensor_value_info( - "value", - TensorProto.FLOAT16, - [ - config.batch_size, - config.kv_sequence_length, - config.num_heads * config.head_size, - ], - ), - ], - [ - helper.make_tensor_value_info( - "output", - TensorProto.FLOAT16, - [config.batch_size, config.sequence_length, config.num_heads * config.head_size], - ), - ], - ) - - model = helper.make_model(graph) - return model.SerializeToString() - - def create_group_query_attention_graph_prompt( config, past_kv_format=Formats.BSNH, @@ -575,204 +458,6 @@ def create_group_query_attention_graph_past( return model.SerializeToString() -def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): - assert mode in ["full", "random", "third"] - if mode == "full": - lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) - elif mode == "random": - lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen, (batch_size, 1), device=device) - else: - lengths = torch.randint(max_seqlen // 3, max_seqlen, (batch_size, 1), device=device) - padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths - return padding_mask - - -def generate_qkv(q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False): - """ - Arguments: - q: (batch_size, seqlen_q, nheads, d) - k: (batch_size, seqlen_k, nheads_k, d) - v: (batch_size, seqlen_k, nheads_k, d) - query_padding_mask: (batch_size, seqlen), bool - key_padding_mask: (batch_size, seqlen), bool - """ - assert not (kvpacked and qkvpacked) - batch_size, seqlen_q, nheads, d = q.shape - _, seqlen_k, nheads_k, _ = k.shape - assert k.shape == (batch_size, seqlen_k, nheads_k, d) - assert v.shape == (batch_size, seqlen_k, nheads_k, d) - - if query_padding_mask is not None: - q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) - - def output_pad_fn(output_unpad): - return pad_input(output_unpad, indices_q, batch_size, seqlen_q) - - else: - q_unpad = rearrange(q, "b s h d -> (b s) h d") - cu_seqlens_q = torch.arange( - 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device - ) - max_seqlen_q = seqlen_q - - def output_pad_fn(output_unpad): - return rearrange(output_unpad, "(b s) h d -> b s h d", b=batch_size) - - if key_padding_mask is not None: - k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) - v_unpad, _, _, _ = unpad_input(v, key_padding_mask) - else: - k_unpad = rearrange(k, "b s h d -> (b s) h d") - v_unpad = rearrange(v, "b s h d -> (b s) h d") - cu_seqlens_k = torch.arange( - 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device - ) - max_seqlen_k = seqlen_k - - if qkvpacked: - assert (query_padding_mask == key_padding_mask).all() - assert nheads == nheads_k - qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], 
dim=1) - qkv = torch.stack([q, k, v], dim=2) - if query_padding_mask is not None: - - def dqkv_pad_fn(dqkv_unpad): - return pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) - - else: - - def dqkv_pad_fn(dqkv_unpad): - return rearrange(dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) - - return ( - qkv_unpad.detach().requires_grad_(), - cu_seqlens_q, - max_seqlen_q, - qkv.detach().requires_grad_(), - output_pad_fn, - dqkv_pad_fn, - ) - elif kvpacked: - kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) - kv = torch.stack([k, v], dim=2) - dq_pad_fn = output_pad_fn - if key_padding_mask is not None: - - def dkv_pad_fn(dkv_unpad): - return pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) - - else: - - def dkv_pad_fn(dkv_unpad): - return rearrange(dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) - - return ( - q_unpad.detach().requires_grad_(), - kv_unpad.detach().requires_grad_(), - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - q.detach().requires_grad_(), - kv.detach().requires_grad_(), - output_pad_fn, - dq_pad_fn, - dkv_pad_fn, - ) - else: - dq_pad_fn = output_pad_fn - if key_padding_mask is not None: - - def dk_pad_fn(dk_unpad): - return pad_input(dk_unpad, indices_k, batch_size, seqlen_k) - - else: - - def dk_pad_fn(dk_unpad): - return rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size) - - return ( - q_unpad.detach().requires_grad_(), - k_unpad.detach().requires_grad_(), - v_unpad.detach().requires_grad_(), - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - q.detach().requires_grad_(), - k.detach().requires_grad_(), - v.detach().requires_grad_(), - output_pad_fn, - dq_pad_fn, - dk_pad_fn, - ) - - -def create_inputs(config: Config, kv_packed=False, qkv_packed=True): - qkv = torch.randn( - config.batch_size, - config.sequence_length, - 3, - config.num_heads, - config.head_size, - device="cuda", - dtype=torch.float16, - requires_grad=False, - ) - key_padding_mask = generate_random_padding_mask( - config.sequence_length, config.batch_size, device="cuda", mode="random" - ) - qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn = generate_qkv( - *qkv.unbind(dim=2), key_padding_mask, key_padding_mask, kv_packed, qkv_packed - ) - return qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn, key_padding_mask - - -def generate_token_offset(cu_seqlens, max_seqlen): - token_offset = [] - token_padset = [] # These are the indices that contain padding tokens - for i in range(1, len(cu_seqlens)): - start = i - 1 - pre_seqlen = cu_seqlens[i - 1] - seqlen = cu_seqlens[i] - token_offset += range(start * max_seqlen, (start * max_seqlen) + (seqlen - pre_seqlen)) - token_padset += range((start * max_seqlen) + (seqlen - pre_seqlen), i * max_seqlen) - return numpy.asarray(token_offset + token_padset, dtype=numpy.int32) - - -def flash_attn_varlen_qkvpacked_func(qkv_unpad, cu_seqlens, token_offset, config, causal=False): - onnx_model_str = create_packed_multihead_attention_graph(config) - qkv_unpad = torch.swapdims(qkv_unpad, 1, 2) - ort_inputs = { - "query": qkv_unpad.detach().cpu().numpy(), - "token_offset": token_offset, - "cumulative_sequence_length": cu_seqlens.cpu().numpy(), - } - sess_options = SessionOptions() - ort_session = InferenceSession(onnx_model_str, sess_options, providers=[config.ep]) - ort_output = ort_session.run(None, ort_inputs) - output = torch.tensor(ort_output) - return output - - -def mha_func(q, k, v, config): - onnx_model_str = create_multihead_attention_graph(config) - q = torch.reshape(q, 
(config.batch_size, config.sequence_length, -1)) - k = torch.reshape(k, (config.batch_size, config.kv_sequence_length, -1)) - v = torch.reshape(v, (config.batch_size, config.kv_sequence_length, -1)) - ort_inputs = { - "query": q.detach().cpu().numpy(), - "key": k.detach().cpu().numpy(), - "value": v.detach().cpu().numpy(), - } - sess_options = SessionOptions() - ort_session = InferenceSession(onnx_model_str, sess_options, providers=[config.ep]) - ort_output = ort_session.run(None, ort_inputs) - ort_output = numpy.array(ort_output) - output = torch.tensor(ort_output) - return output - - def rotary_options_for_current_os(): # Reference implementation of rotary uses triton, which is not available in Windows. # So we only test rotary in Linux right now. @@ -1009,14 +694,6 @@ def gqa_past_func( return output, present_k, present_v -def construct_causal_mask(seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, device=None): - row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") - col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) - sk = seqlen_k if key_padding_mask is None else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") - sq = seqlen_q if query_padding_mask is None else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") - return col_idx > row_idx + sk - sq - - def construct_local_mask( seqlen_q, seqlen_k, @@ -1127,93 +804,6 @@ def attention_ref( return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) -def attention_qkvpacked_ref( - qkv, - key_padding_mask=None, - dropout_p=0.0, - dropout_mask=None, - causal=False, - upcast=True, - reorder_ops=False, - use_smooth_softmax=False, -): - return attention_ref( - qkv[:, :, 0], - qkv[:, :, 1], - qkv[:, :, 2], - key_padding_mask, - key_padding_mask, - dropout_p, - dropout_mask, - upcast=upcast, - causal=causal, - reorder_ops=reorder_ops, - use_smooth_softmax=use_smooth_softmax, - ) - - -def parity_check_mha( - config, - packed, - rtol=1e-3, - atol=1e-3, -): - if packed: - qkv_unpad, cu_seqlens, _, qkv, output_pad_fn, _, key_padding_mask = create_inputs(config) - token_offset = generate_token_offset(cu_seqlens, config.sequence_length).reshape( - (config.batch_size, config.sequence_length) - ) - # ORT Flash - out_unpad = flash_attn_varlen_qkvpacked_func(qkv_unpad, cu_seqlens, token_offset, config, causal=False) - out_unpad = torch.squeeze(out_unpad, 0) - out = torch.reshape( - output_pad_fn(out_unpad), (config.batch_size, config.sequence_length, config.num_heads, config.head_size) - ) - out = out.detach().cpu().numpy() - # Pytorch to compare - out_ref, _ = attention_qkvpacked_ref(qkv, key_padding_mask, 0.0, None, causal=False) - out_ref = out_ref.detach().cpu().numpy() - else: - q = torch.randn( - config.batch_size, - config.sequence_length, - config.num_heads, - config.head_size, - device="cuda", - dtype=torch.float16, - requires_grad=False, - ) - k = torch.randn( - config.batch_size, - config.kv_sequence_length, - config.kv_num_heads, - config.head_size, - device="cuda", - dtype=torch.float16, - requires_grad=False, - ) - v = torch.randn( - config.batch_size, - config.kv_sequence_length, - config.kv_num_heads, - config.head_size, - device="cuda", - dtype=torch.float16, - requires_grad=False, - ) - out = mha_func(q, k, v, config) - out = torch.squeeze(out, 0) - out = torch.reshape(out, (config.batch_size, config.sequence_length, config.num_heads, config.head_size)) - out = out.detach().cpu().numpy() - # Pytorch to compare - out_ref, _ = attention_ref(q, k, v, None, None, 
0.0, None, causal=False) - out_ref = out_ref.detach().cpu().numpy() - - numpy.testing.assert_allclose( - out, out_ref, rtol=rtol, atol=atol, equal_nan=True, err_msg=f" with {config} packed={packed}" - ) - - def rotary_embedding(*args, **kwargs): # Use local import since triton is not available in Windows. from rotary_flash import apply_rotary_emb @@ -1222,7 +812,7 @@ def rotary_embedding(*args, **kwargs): def parity_check_gqa_prompt( - config, + config: PromptConfig, causal=True, local=False, past_format=Formats.BNSH, @@ -1420,7 +1010,7 @@ def parity_check_gqa_prompt( def parity_check_gqa_prompt_no_buff( - config, + config: PromptConfig, causal=True, local=False, past_format=Formats.BNSH, @@ -1595,7 +1185,7 @@ def parity_check_gqa_prompt_no_buff( def parity_check_gqa_past( - config, + config: Config, causal=True, local=False, past_format=Formats.BNSH, @@ -1788,7 +1378,7 @@ def parity_check_gqa_past( def parity_check_gqa_past_no_buff( - config, + config: Config, causal=True, local=False, past_format=Formats.BNSH, @@ -2019,67 +1609,6 @@ def has_memory_efficient(): return True -def packed_mha_test_cases(): - batches = [2] if pipeline_mode else [1, 5] - seqs = [1024, 1025] if pipeline_mode else [1024, 1025, 2048] - num_h = [1, 3] if pipeline_mode else [1, 6, 16] - h_sizes = [16, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] - - for b in batches: - for s in seqs: - for n in num_h: - for h in h_sizes: - config = Config(b, s, s, 0, n, n, h) - yield str(config), config - - -def mha_test_cases(): - batches = [2] if pipeline_mode else [1, 5] - seqs = ( - [(1, 128), (113, 211), (2048, 2048)] - if pipeline_mode - else [ - (113, 203), - (128, 217), - (113, 211), - (108, 256), - (256, 512), - (512, 256), - (1024, 1024), - (1023, 1024), - (1024, 1023), - (2048, 2048), - ] - ) - num_h = [3] if pipeline_mode else [1, 6, 16] - h_sizes = [64] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] - - for b in batches: - for s, s2 in seqs: - for n in num_h: - for h in h_sizes: - config = Config(b, s, s2, 0, n, n, h) - yield str(config), config - - -class TestMHA(unittest.TestCase): - @parameterized.expand(packed_mha_test_cases()) - def test_packed_mha(self, _, config): - if not has_flash_attention(): - return - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" - print("-------- TEST PACKED MHA ---------") - parity_check_mha(config, True) - - @parameterized.expand(mha_test_cases()) - def test_mha(self, _, config): - if not has_flash_attention(): - return - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" - print("-------- TEST MHA ---------") - parity_check_mha(config, False) - - def gqa_no_past_memory_efficient_test_cases(): batches = [3] if pipeline_mode else [1, 3, 5] seqs = ( @@ -2103,18 +1632,22 @@ def gqa_no_past_memory_efficient_test_cases(): for sq, skv in seqs: for n, n2 in num_h: for h in h_sizes: - for rotary, rotary_interleaved in rotary_options_for_current_os(): - for packed in [False, True]: - for softcap in [0.0, 50.0]: - config = PromptConfig(b, sq, skv, sq + skv + 8, n, n2, h) - yield ( - str(config) + f"{rotary}_{rotary_interleaved}_{packed}", - config, - rotary, - rotary_interleaved, - packed, - softcap, - ) + for local in [False, True]: + for rotary, rotary_interleaved in rotary_options_for_current_os(): + for packed in [False, True]: + for softcap in [0.0, 50.0]: + config = PromptConfig(b, sq, skv, sq + skv + 8, n, n2, h) + if rotary and h % 16 > 0: + continue + yield ( + str(config) + f"{local}_{rotary}_{rotary_interleaved}_{packed}", + config, + 
local, + rotary, + rotary_interleaved, + packed, + softcap, + ) def gqa_no_past_flash_attention_test_cases(): @@ -2144,9 +1677,12 @@ def gqa_no_past_flash_attention_test_cases(): for rotary, rotary_interleaved in rotary_options_for_current_os(): for packed in [False, True]: for softcap in [0.0, 50.0]: + if rotary and h % 16 > 0: + continue + config = PromptConfig(b, sq, skv, sq + skv + 8, n, n2, h) yield ( - str(config) + f"{local}_{rotary}_{rotary_interleaved}_{packed}", + str(config) + f"{local}_{rotary}_{rotary_interleaved}_{packed}_{softcap}", config, local, rotary, @@ -2183,19 +1719,22 @@ def gqa_past_memory_efficient_test_cases(): for s, s2 in seqs: for n, n2 in num_h: for h in h_sizes: - for rotary, rotary_interleaved in rotary_options_for_current_os(): - for packed in [False, True]: - for softcap in [0.0, 50.0]: - sp = random.randint(1, s2 - s) if s2 - s > 0 else 0 - config = Config(b, s, s2, sp, n, n2, h) - yield ( - str(config) + f"{rotary}_{rotary_interleaved}_{packed}", - config, - rotary, - rotary_interleaved, - packed, - softcap, - ) + for local in [False, True]: + for rotary, rotary_interleaved in rotary_options_for_current_os(): + for packed in [False, True]: + for softcap in [0.0, 50.0]: + if rotary and h % 16 > 0: + continue + config = Config(b, s, s2, n, n2, h) + yield ( + str(config) + f"{local}_{rotary}_{rotary_interleaved}_{packed}_{softcap}", + config, + local, + rotary, + rotary_interleaved, + packed, + softcap, + ) def gqa_past_flash_attention_test_cases(): @@ -2229,10 +1768,12 @@ def gqa_past_flash_attention_test_cases(): for rotary, rotary_interleaved in rotary_options_for_current_os(): for packed in [False, True]: for softcap in [0.0, 50.0]: - sp = random.randint(1, s2 - s) if s2 - s > 0 else 0 - config = Config(b, s, s2, sp, n, n2, h) + if rotary and h % 16 > 0: + continue + + config = Config(b, s, s2, n, n2, h) yield ( - str(config) + f"{local}_{rotary}_{rotary_interleaved}_{packed}", + str(config) + f"{local}_{rotary}_{rotary_interleaved}_{packed}_{softcap}", config, local, rotary, @@ -2272,7 +1813,10 @@ def gqa_interactive_one_batch_flash_attention_test_cases(): for local in [False, True]: for rotary, rotary_interleaved in rotary_options_for_current_os(): for packed in [False, True]: - config = Config(b, s, s2, -1, n, n2, h) + if rotary and h % 16 > 0: + continue + + config = Config(b, s, s2, n, n2, h) yield ( str(config) + f"{local}_{rotary}_{rotary_interleaved}_{packed}", config, @@ -2312,7 +1856,10 @@ def gqa_interactive_one_batch_memory_efficient_attention_test_cases(): for h in h_sizes: for rotary, rotary_interleaved in rotary_options_for_current_os(): for packed in [False, True]: - config = Config(b, s, s2, -1, n, n2, h) + if rotary and h % 16 > 0: + continue + + config = Config(b, s, s2, n, n2, h) yield ( str(config) + f"{rotary}_{rotary_interleaved}_{packed}", config, @@ -2410,12 +1957,13 @@ def test_gqa_interactive_one_batch_flash_attention(self, _, config, local, rotar @unittest.skipIf(not has_memory_efficient(), reason="Memory efficient FMHA is not available, skipping tests.") class TestMemoryEfficientGQA(unittest.TestCase): @parameterized.expand(gqa_no_past_memory_efficient_test_cases()) - def test_gqa_no_past_memory_efficient(self, _, config, rotary, rotary_interleaved, packed, softcap): + def test_gqa_no_past_memory_efficient(self, _, config, local, rotary, rotary_interleaved, packed, softcap): os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" print("------- MEMORY EFFICIENT ATTENTION (PROMPT CASE) ---------") parity_check_gqa_prompt( config, + 
local=local, rtol=5e-3, atol=5e-3, past_format=Formats.BNSH, @@ -2427,6 +1975,7 @@ def test_gqa_no_past_memory_efficient(self, _, config, rotary, rotary_interleave ) parity_check_gqa_prompt_no_buff( config, + local=local, rtol=5e-3, atol=5e-3, past_format=Formats.BNSH, @@ -2438,12 +1987,13 @@ def test_gqa_no_past_memory_efficient(self, _, config, rotary, rotary_interleave ) @parameterized.expand(gqa_past_memory_efficient_test_cases()) - def test_gqa_past_memory_efficient(self, _, config, rotary, rotary_interleaved, packed, softcap): + def test_gqa_past_memory_efficient(self, _, config, local, rotary, rotary_interleaved, packed, softcap): os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" print("-------- MEMORY EFFICIENT (TOKEN GEN) --------") parity_check_gqa_past( config, + local=local, past_format=Formats.BNSH, rtol=1e-3, atol=1e-3, @@ -2455,6 +2005,7 @@ def test_gqa_past_memory_efficient(self, _, config, rotary, rotary_interleaved, ) parity_check_gqa_past_no_buff( config, + local=local, past_format=Formats.BNSH, rtol=1e-3, atol=1e-3, diff --git a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py b/onnxruntime/test/python/transformers/test_gqa_rocm.py similarity index 98% rename from onnxruntime/test/python/transformers/test_flash_attn_rocm.py rename to onnxruntime/test/python/transformers/test_gqa_rocm.py index a5910c28c2975..29ae1b6e44a78 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py +++ b/onnxruntime/test/python/transformers/test_gqa_rocm.py @@ -3,7 +3,7 @@ import torch from parameterized import parameterized -from test_flash_attn_cuda import ( +from test_gqa_cuda import ( Formats, gqa_no_past_flash_attention_test_cases, gqa_past_flash_attention_test_cases, @@ -38,6 +38,7 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte rtol=0.001, atol=0.005, ) + parity_check_gqa_prompt_no_buff( config, local=local, diff --git a/onnxruntime/test/python/transformers/test_mha_flash_attn.py b/onnxruntime/test/python/transformers/test_mha_flash_attn.py new file mode 100644 index 0000000000000..f87370e37d21a --- /dev/null +++ b/onnxruntime/test/python/transformers/test_mha_flash_attn.py @@ -0,0 +1,452 @@ +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# ------------------------------------------------------------------------- +import os +import unittest + +import numpy +import torch +from bert_padding import pad_input, unpad_input +from einops import rearrange, repeat +from onnx import TensorProto, helper +from parameterized import parameterized +from test_gqa_cuda import attention_ref, has_flash_attention + +from onnxruntime import InferenceSession, SessionOptions + +torch.manual_seed(0) + +pipeline_mode = True # Reduces number of tests so pipeline doesn't time out + + +class Formats: + BSNH = 0 + BNSH = 1 + + +class Config: + batch_size = 0 + sequence_length = 0 + kv_sequence_length = 0 # this is past sequence length when there is past state. 
+ num_heads = 0 + kv_num_heads = 0 + head_size = 0 + ep = "CUDAExecutionProvider" + + def __init__(self, batch_size, sequence_length, kv_sequence_length, num_heads, kv_num_heads, head_size): + self.batch_size = batch_size + self.sequence_length = sequence_length + self.kv_sequence_length = kv_sequence_length + self.num_heads = num_heads + self.kv_num_heads = kv_num_heads + self.head_size = head_size + + def __repr__(self): + short_ep = self.ep[: -len("ExecutionProvider")].lower() + return ( + f"Config(batch_size={self.batch_size}, sequence_length={self.sequence_length}, " + f"kv_sequence_length={self.kv_sequence_length}, " + f"num_heads={self.num_heads}, kv_num_heads={self.kv_num_heads}, head_size={self.head_size}, ep={short_ep})" + ) + + +def create_packed_multihead_attention_graph(config: Config): + nodes = [ + helper.make_node( + "PackedMultiHeadAttention", + [ + "query", + "", + "", + "", + "token_offset", + "cumulative_sequence_length", + ], + ["output"], + "PackedMultiHeadAttention_0", + num_heads=config.num_heads, + domain="com.microsoft", + ), + ] + + graph = helper.make_graph( + nodes, + "PackedMultiHeadAttention_Graph", + [ + helper.make_tensor_value_info( + "query", + TensorProto.FLOAT16, + [ + -1, + config.num_heads, + 3, + config.head_size, + ], + ), + helper.make_tensor_value_info( + "token_offset", TensorProto.INT32, [config.batch_size, config.sequence_length] + ), + helper.make_tensor_value_info("cumulative_sequence_length", TensorProto.INT32, [config.batch_size + 1]), + ], + [ + helper.make_tensor_value_info( + "output", + TensorProto.FLOAT16, + [-1, config.num_heads * config.head_size], + ), + ], + ) + + model = helper.make_model(graph) + return model.SerializeToString() + + +def create_multihead_attention_graph(config: Config): + nodes = [ + helper.make_node( + "MultiHeadAttention", + [ + "query", + "key", + "value", + ], + ["output"], + "MultiHeadAttention_0", + num_heads=config.num_heads, + domain="com.microsoft", + ), + ] + + graph = helper.make_graph( + nodes, + "MultiHeadAttention_Graph", + [ + helper.make_tensor_value_info( + "query", + TensorProto.FLOAT16, + [ + config.batch_size, + config.sequence_length, + config.num_heads * config.head_size, + ], + ), + helper.make_tensor_value_info( + "key", + TensorProto.FLOAT16, + [ + config.batch_size, + config.kv_sequence_length, + config.num_heads * config.head_size, + ], + ), + helper.make_tensor_value_info( + "value", + TensorProto.FLOAT16, + [ + config.batch_size, + config.kv_sequence_length, + config.num_heads * config.head_size, + ], + ), + ], + [ + helper.make_tensor_value_info( + "output", + TensorProto.FLOAT16, + [config.batch_size, config.sequence_length, config.num_heads * config.head_size], + ), + ], + ) + + model = helper.make_model(graph) + return model.SerializeToString() + + +def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): + assert mode in ["full", "random", "third"] + if mode == "full": + lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) + elif mode == "random": + lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen, (batch_size, 1), device=device) + else: + lengths = torch.randint(max_seqlen // 3, max_seqlen, (batch_size, 1), device=device) + padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths + return padding_mask + + +def generate_packed_qkv(q, k, v, query_padding_mask=None, key_padding_mask=None): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, d) + k: (batch_size, seqlen_k, 
nheads_k, d) + v: (batch_size, seqlen_k, nheads_k, d) + query_padding_mask: (batch_size, seqlen), bool + key_padding_mask: (batch_size, seqlen), bool + """ + batch_size, seqlen_q, nheads, d = q.shape + _, seqlen_k, nheads_k, _ = k.shape + assert k.shape == (batch_size, seqlen_k, nheads_k, d) + assert v.shape == (batch_size, seqlen_k, nheads_k, d) + + if query_padding_mask is not None: + q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) + + def output_pad_fn(output_unpad): + return pad_input(output_unpad, indices_q, batch_size, seqlen_q) + + else: + q_unpad = rearrange(q, "b s h d -> (b s) h d") + cu_seqlens_q = torch.arange( + 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device + ) + max_seqlen_q = seqlen_q + + def output_pad_fn(output_unpad): + return rearrange(output_unpad, "(b s) h d -> b s h d", b=batch_size) + + if key_padding_mask is not None: + k_unpad, _, _, _ = unpad_input(k, key_padding_mask) + v_unpad, _, _, _ = unpad_input(v, key_padding_mask) + else: + k_unpad = rearrange(k, "b s h d -> (b s) h d") + v_unpad = rearrange(v, "b s h d -> (b s) h d") + + assert (query_padding_mask == key_padding_mask).all() + assert nheads == nheads_k + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + qkv = torch.stack([q, k, v], dim=2) + if query_padding_mask is not None: + + def dqkv_pad_fn(dqkv_unpad): + return pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) + + else: + + def dqkv_pad_fn(dqkv_unpad): + return rearrange(dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) + + return ( + qkv_unpad.detach().requires_grad_(), + cu_seqlens_q, + max_seqlen_q, + qkv.detach().requires_grad_(), + output_pad_fn, + dqkv_pad_fn, + ) + + +def create_inputs(config: Config): + qkv = torch.randn( + config.batch_size, + config.sequence_length, + 3, + config.num_heads, + config.head_size, + device="cuda", + dtype=torch.float16, + requires_grad=False, + ) + padding_mask = generate_random_padding_mask(config.sequence_length, config.batch_size, device="cuda", mode="random") + qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn = generate_packed_qkv( + *qkv.unbind(dim=2), padding_mask, padding_mask + ) + return qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn, padding_mask + + +def generate_token_offset(cu_seqlens, max_seqlen): + token_offset = [] + token_padset = [] # These are the indices that contain padding tokens + for i in range(1, len(cu_seqlens)): + start = i - 1 + pre_seqlen = cu_seqlens[i - 1] + seqlen = cu_seqlens[i] + token_offset += range(start * max_seqlen, (start * max_seqlen) + (seqlen - pre_seqlen)) + token_padset += range((start * max_seqlen) + (seqlen - pre_seqlen), i * max_seqlen) + return numpy.asarray(token_offset + token_padset, dtype=numpy.int32) + + +def flash_attn_varlen_qkvpacked_func(qkv_unpad, cu_seqlens, token_offset, config): + onnx_model_str = create_packed_multihead_attention_graph(config) + qkv_unpad = torch.swapdims(qkv_unpad, 1, 2) + ort_inputs = { + "query": qkv_unpad.detach().cpu().numpy(), + "token_offset": token_offset, + "cumulative_sequence_length": cu_seqlens.cpu().numpy(), + } + sess_options = SessionOptions() + ort_session = InferenceSession(onnx_model_str, sess_options, providers=[config.ep]) + ort_output = ort_session.run(None, ort_inputs) + output = torch.tensor(ort_output) + return output + + +def mha_func(q, k, v, config): + onnx_model_str = create_multihead_attention_graph(config) + q = torch.reshape(q, (config.batch_size, 
config.sequence_length, -1)) + k = torch.reshape(k, (config.batch_size, config.kv_sequence_length, -1)) + v = torch.reshape(v, (config.batch_size, config.kv_sequence_length, -1)) + ort_inputs = { + "query": q.detach().cpu().numpy(), + "key": k.detach().cpu().numpy(), + "value": v.detach().cpu().numpy(), + } + sess_options = SessionOptions() + ort_session = InferenceSession(onnx_model_str, sess_options, providers=[config.ep]) + ort_output = ort_session.run(None, ort_inputs) + ort_output = numpy.array(ort_output) + output = torch.tensor(ort_output) + return output + + +def attention_qkvpacked_ref( + qkv, + key_padding_mask=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + upcast=True, + reorder_ops=False, + use_smooth_softmax=False, +): + return attention_ref( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + key_padding_mask, + key_padding_mask, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + reorder_ops=reorder_ops, + use_smooth_softmax=use_smooth_softmax, + ) + + +def parity_check_mha( + config, + packed, + rtol=1e-3, + atol=1e-3, +): + if packed: + qkv_unpad, cu_seqlens, _, qkv, output_pad_fn, _, key_padding_mask = create_inputs(config) + token_offset = generate_token_offset(cu_seqlens, config.sequence_length).reshape( + (config.batch_size, config.sequence_length) + ) + # ORT Flash + out_unpad = flash_attn_varlen_qkvpacked_func(qkv_unpad, cu_seqlens, token_offset, config) + out_unpad = torch.squeeze(out_unpad, 0) + out = torch.reshape( + output_pad_fn(out_unpad), (config.batch_size, config.sequence_length, config.num_heads, config.head_size) + ) + out = out.detach().cpu().numpy() + # Pytorch to compare + out_ref, _ = attention_qkvpacked_ref(qkv, key_padding_mask, 0.0, None, causal=False) + out_ref = out_ref.detach().cpu().numpy() + else: + q = torch.randn( + config.batch_size, + config.sequence_length, + config.num_heads, + config.head_size, + device="cuda", + dtype=torch.float16, + requires_grad=False, + ) + k = torch.randn( + config.batch_size, + config.kv_sequence_length, + config.kv_num_heads, + config.head_size, + device="cuda", + dtype=torch.float16, + requires_grad=False, + ) + v = torch.randn( + config.batch_size, + config.kv_sequence_length, + config.kv_num_heads, + config.head_size, + device="cuda", + dtype=torch.float16, + requires_grad=False, + ) + out = mha_func(q, k, v, config) + out = torch.squeeze(out, 0) + out = torch.reshape(out, (config.batch_size, config.sequence_length, config.num_heads, config.head_size)) + out = out.detach().cpu().numpy() + # Pytorch to compare + out_ref, _ = attention_ref(q, k, v, None, None, 0.0, None, causal=False) + out_ref = out_ref.detach().cpu().numpy() + + numpy.testing.assert_allclose( + out, out_ref, rtol=rtol, atol=atol, equal_nan=True, err_msg=f" with {config} packed={packed}" + ) + + +def packed_mha_test_cases(): + batch_sizes = [2] if pipeline_mode else [1, 5] + sequence_lengths = [1024, 1025] if pipeline_mode else [1024, 1025, 2048] + num_heads = [1, 3] if pipeline_mode else [1, 6, 16] + head_sizes = [16, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + + for b in batch_sizes: + for s in sequence_lengths: + for n in num_heads: + for h in head_sizes: + config = Config(b, s, s, n, n, h) + yield str(config), config + + +def mha_test_cases(): + batch_sizes = [2] if pipeline_mode else [1, 5] + sequence_lengths = ( + [(1, 128), (113, 211), (2048, 2048)] + if pipeline_mode + else [ + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + 
(1023, 1024), + (1024, 1023), + (2048, 2048), + ] + ) + num_heads = [3] if pipeline_mode else [1, 6, 16] + head_sizes = [64] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + + for b in batch_sizes: + for s, kv_sequence_length in sequence_lengths: + for n in num_heads: + for h in head_sizes: + config = Config(b, s, kv_sequence_length, n, n, h) + yield str(config), config + + +class TestMHA(unittest.TestCase): + @parameterized.expand(packed_mha_test_cases()) + def test_packed_mha(self, _, config): + if not has_flash_attention(): + return + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" + print("-------- TEST PACKED MHA ---------") + parity_check_mha(config, True) + + @parameterized.expand(mha_test_cases()) + def test_mha(self, _, config): + if not has_flash_attention(): + return + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" + print("-------- TEST MHA ---------") + parity_check_mha(config, False) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/added_tokens.json b/onnxruntime/test/testdata/transformers/tiny_t5/added_tokens.json new file mode 100644 index 0000000000000..3f5132007c4fc --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_t5/added_tokens.json @@ -0,0 +1,102 @@ +{ + "": 32099, + "": 32089, + "": 32088, + "": 32087, + "": 32086, + "": 32085, + "": 32084, + "": 32083, + "": 32082, + "": 32081, + "": 32080, + "": 32098, + "": 32079, + "": 32078, + "": 32077, + "": 32076, + "": 32075, + "": 32074, + "": 32073, + "": 32072, + "": 32071, + "": 32070, + "": 32097, + "": 32069, + "": 32068, + "": 32067, + "": 32066, + "": 32065, + "": 32064, + "": 32063, + "": 32062, + "": 32061, + "": 32060, + "": 32096, + "": 32059, + "": 32058, + "": 32057, + "": 32056, + "": 32055, + "": 32054, + "": 32053, + "": 32052, + "": 32051, + "": 32050, + "": 32095, + "": 32049, + "": 32048, + "": 32047, + "": 32046, + "": 32045, + "": 32044, + "": 32043, + "": 32042, + "": 32041, + "": 32040, + "": 32094, + "": 32039, + "": 32038, + "": 32037, + "": 32036, + "": 32035, + "": 32034, + "": 32033, + "": 32032, + "": 32031, + "": 32030, + "": 32093, + "": 32029, + "": 32028, + "": 32027, + "": 32026, + "": 32025, + "": 32024, + "": 32023, + "": 32022, + "": 32021, + "": 32020, + "": 32092, + "": 32019, + "": 32018, + "": 32017, + "": 32016, + "": 32015, + "": 32014, + "": 32013, + "": 32012, + "": 32011, + "": 32010, + "": 32091, + "": 32009, + "": 32008, + "": 32007, + "": 32006, + "": 32005, + "": 32004, + "": 32003, + "": 32002, + "": 32001, + "": 32000, + "": 32090 +} diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/config.json b/onnxruntime/test/testdata/transformers/tiny_t5/config.json new file mode 100644 index 0000000000000..d649732da246f --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_t5/config.json @@ -0,0 +1,60 @@ +{ + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 16, + "d_kv": 4, + "d_model": 8, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 2, + "num_heads": 2, + "num_layers": 2, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + 
"early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.42.4", + "use_cache": true, + "vocab_size": 1024 +} diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/generation_config.json b/onnxruntime/test/testdata/transformers/tiny_t5/generation_config.json new file mode 100644 index 0000000000000..6f2a63c77c1b9 --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_t5/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.42.4" +} diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/model.safetensors b/onnxruntime/test/testdata/transformers/tiny_t5/model.safetensors new file mode 100644 index 0000000000000..1b90602ed0709 Binary files /dev/null and b/onnxruntime/test/testdata/transformers/tiny_t5/model.safetensors differ diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/special_tokens_map.json b/onnxruntime/test/testdata/transformers/tiny_t5/special_tokens_map.json new file mode 100644 index 0000000000000..17ade346a1042 --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_t5/special_tokens_map.json @@ -0,0 +1,125 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/spiece.model b/onnxruntime/test/testdata/transformers/tiny_t5/spiece.model new file mode 100644 index 0000000000000..16ff05c4dd0f9 Binary files /dev/null and b/onnxruntime/test/testdata/transformers/tiny_t5/spiece.model differ diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/tiny_t5.py b/onnxruntime/test/testdata/transformers/tiny_t5/tiny_t5.py new file mode 100644 index 0000000000000..6a25cb89f6327 --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_t5/tiny_t5.py @@ -0,0 +1,85 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +import os + +from sentencepiece import SentencePieceProcessor, SentencePieceTrainer +from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer + +hidden_size = 8 + +vocab_size = 1024 +save_directory = "tiny_t5" +model_name = "google-t5/t5-small" + +config = T5Config.from_pretrained(model_name) + +config.num_heads = 2 + +if vocab_size: + config.vocab_size = 1024 + +config.d_model = hidden_size +config.d_kv = hidden_size // config.num_heads +config.d_ff = hidden_size * 2 +config.num_layers = 2 +config.num_decoder_layers = config.num_layers + +model = T5ForConditionalGeneration(config) + +model.save_pretrained(save_directory) + +tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False) +tokenizer.save_pretrained(save_directory) + + +def update_tokenizer(sp_model_path: str, vocab_size: int): + sp = SentencePieceProcessor() + sp.Load(sp_model_path) + + # Export the vocabulary + with open("vocab.txt", "w", encoding="utf-8") as f: + for id in range(sp.GetPieceSize()): + piece = sp.IdToPiece(id) + score = sp.GetScore(id) + f.write(f"{piece}\t{score}\n") + + with open("vocab.txt", encoding="utf-8") as f: + vocab = [line.strip().split("\t") for line in f] + + # Sort by score in descending order and select top tokens + vocab_sorted = sorted(vocab, key=lambda x: float(x[1]), reverse=True) + pruned_vocab = vocab_sorted[:vocab_size] + + # Write the pruned vocabulary to a new file + with open("pruned_vocab.txt", "w", encoding="utf-8") as f: + for piece, score in pruned_vocab: + f.write(f"{piece}\t{score}\n") + + # Train a new SentencePiece model using the pruned vocabulary as a seed. + # Example corpus.txt can be found by searching "corpus.txt download" in search engine. + SentencePieceTrainer.Train( + f"--input=corpus.txt --model_prefix=spiece --vocab_size={vocab_size} --user_defined_symbols=pruned_vocab.txt" + ) + + # Load the new model + sp_new = SentencePieceProcessor() + sp_new.Load("spiece.model") + + # Test encoding and decoding + text = "This is an example sentence." + tokens = sp_new.EncodeAsPieces(text) + print(tokens) + + detokenized_text = sp_new.DecodePieces(tokens) + print(detokenized_text) + + # Replace the original model. 
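    # An optional sanity check (not required for the script to work): the retrained
    # SentencePiece model was trained with --vocab_size above, so it should expose
    # exactly that many pieces before it overwrites the tokenizer's spiece.model.
    assert sp_new.GetPieceSize() == vocab_size, (
        f"retrained model has {sp_new.GetPieceSize()} pieces, expected {vocab_size}"
    )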
+ os.replace("spiece.model", sp_model_path) + + +if vocab_size: + original_path = os.path.join(save_directory, "spiece.model") + update_tokenizer(original_path, vocab_size) diff --git a/onnxruntime/test/testdata/transformers/tiny_t5/tokenizer_config.json b/onnxruntime/test/testdata/transformers/tiny_t5/tokenizer_config.json new file mode 100644 index 0000000000000..da3a2f5a033d6 --- /dev/null +++ b/onnxruntime/test/testdata/transformers/tiny_t5/tokenizer_config.json @@ -0,0 +1,940 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32002": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32003": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32004": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32005": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32006": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32007": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32008": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32009": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32010": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32011": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32012": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32013": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32014": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32015": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32017": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32018": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "32019": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32020": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32021": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32022": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32023": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32024": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32025": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32026": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32027": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32028": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32029": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32030": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32031": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32032": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32033": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32034": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32035": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32036": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32037": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32038": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32039": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32040": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32041": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32042": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32043": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32044": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "32045": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32046": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32047": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32048": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32049": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32050": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32051": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32052": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32053": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32054": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32055": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32056": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32057": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32058": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32059": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32060": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32061": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32062": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32063": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32064": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32065": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32066": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32067": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32068": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32069": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32070": { + "content": "", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32071": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32072": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32073": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32074": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32075": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32076": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32077": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32078": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32079": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32080": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32081": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32082": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32083": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32084": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32085": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32086": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32087": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32088": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32089": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32090": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32091": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32092": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32093": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32094": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32095": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32096": { + 
"content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32097": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32098": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32099": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "clean_up_tokenization_spaces": true, + "eos_token": "", + "extra_ids": 100, + "legacy": false, + "model_max_length": 512, + "pad_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "T5Tokenizer", + "unk_token": "" +} diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index c1564997c42b8..83fb548968d77 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -303,6 +303,10 @@ std::unique_ptr DefaultWebGpuExecutionProvider() { ORT_ENFORCE(config_options.AddConfigEntry(webgpu::options::kStorageBufferCacheMode, webgpu::options::kBufferCacheMode_Disabled) .IsOK()); + // Disable device auto collect + ORT_ENFORCE(config_options.AddConfigEntry(webgpu::options::kPreserveDevice, + webgpu::options::kPreserveDevice_ON) + .IsOK()); return WebGpuProviderFactoryCreator::Create(config_options)->CreateProvider(); #else return nullptr; diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 1ad35b51bb1c1..147eab7116d94 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -223,6 +223,113 @@ int OrtGetInputOutputCount(OrtSession* session, size_t* input_count, size_t* out return ORT_OK; } +int OrtGetInputOutputMetadata(ort_session_handle_t session, size_t index, char** name_cstr_ptr, void** type_info_ptr) { + OrtAllocator* allocator = nullptr; + RETURN_ERROR_CODE_IF_ERROR(GetAllocatorWithDefaultOptions, &allocator); + + size_t input_count, output_count; + int error_code = OrtGetInputOutputCount(session, &input_count, &output_count); + if (error_code != ORT_OK) { + return error_code; + } + + if (index >= input_count + output_count) { + std::ostringstream ostr; + ostr << "Invalid index: " << index << ", input count: " << input_count << ", output count: " << output_count; + return CheckStatus(Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, ostr.str().c_str())); + } + + char* name_cstr; + if (index < input_count) { + RETURN_ERROR_CODE_IF_ERROR(SessionGetInputName, session, index, allocator, &name_cstr); + } else { + RETURN_ERROR_CODE_IF_ERROR(SessionGetOutputName, session, index - input_count, allocator, &name_cstr); + } + REGISTER_AUTO_RELEASE_BUFFER(char, name_cstr, allocator); + + OrtTypeInfo* type_info; + if (index < input_count) { + RETURN_ERROR_CODE_IF_ERROR(SessionGetInputTypeInfo, session, index, &type_info); + } 
else { + RETURN_ERROR_CODE_IF_ERROR(SessionGetOutputTypeInfo, session, index - input_count, &type_info); + } + REGISTER_AUTO_RELEASE_HANDLE(TypeInfo, type_info); + + const OrtTensorTypeAndShapeInfo* tensor_info; + RETURN_ERROR_CODE_IF_ERROR(CastTypeInfoToTensorInfo, type_info, &tensor_info); + + size_t type_info_size = 4; + ONNXTensorElementDataType element_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + size_t dim_count = 0; + if (tensor_info != nullptr) { + RETURN_ERROR_CODE_IF_ERROR(GetTensorElementType, tensor_info, &element_type); + RETURN_ERROR_CODE_IF_ERROR(GetDimensionsCount, tensor_info, &dim_count); + + // byte [0, 4): [i32] element type + // byte [4, 8): [u32] dimension count + // byte [8, 8 + dim_count * ptr_size): [ptr] symbolic dimension names for dim[0], dim[1], ..., dim[dim_count - 1] + // byte [8 + dim_count * ptr_size, 8 + dim_count * ptr_size * 2): [size_t] dimension values for dim[0], dim[1], ..., dim[dim_count - 1] + // from byte 8 + dim_count * ptr_size * 2: optional string copies for symbolic dimension names + type_info_size = 8 + dim_count * (sizeof(size_t) * 2); + } + + std::vector<int64_t> dim_values(dim_count); + std::vector<const char*> dim_params(dim_count); + std::vector<size_t> dim_params_str_len(dim_count); + if (dim_count > 0) { + size_t str_len_total = 0; + RETURN_ERROR_CODE_IF_ERROR(GetDimensions, tensor_info, dim_values.data(), dim_count); + RETURN_ERROR_CODE_IF_ERROR(GetSymbolicDimensions, tensor_info, dim_params.data(), dim_count); + for (size_t i = 0; i < dim_count; ++i) { + size_t str_size = dim_params[i] ? strlen(dim_params[i]) : 0; + if (str_size > 0) { + str_len_total += str_size + 1; + dim_params_str_len[i] = str_size + 1; + } else { + dim_params_str_len[i] = 0; + } + } + type_info_size += str_len_total; + } + + uint8_t* type_info_buffer = reinterpret_cast<uint8_t*>(allocator->Alloc(allocator, type_info_size)); + // write to buffer @ byte [0, 4) + int32_t* p_type_info_element_type = reinterpret_cast<int32_t*>(type_info_buffer); + *p_type_info_element_type = static_cast<int32_t>(element_type); + + if (tensor_info != nullptr) { + // write to buffer @ byte [4, 8) + uint32_t* p_type_info_dim_count = reinterpret_cast<uint32_t*>(type_info_buffer + 4); + *p_type_info_dim_count = static_cast<uint32_t>(dim_count); + + if (dim_count > 0) { + // write to buffer @ byte [8, 8 + dim_count * ptr_size) + const char** p_dim_params = reinterpret_cast<const char**>(type_info_buffer + 8); + char* p_str_copy_dest = reinterpret_cast<char*>(type_info_buffer + 8 + dim_count * sizeof(size_t) * 2); + for (size_t i = 0; i < dim_count; ++i) { + if (dim_params_str_len[i] > 0) { + p_dim_params[i] = p_str_copy_dest; + memcpy(p_str_copy_dest, dim_params[i], dim_params_str_len[i]); + p_str_copy_dest += dim_params_str_len[i]; + } else { + p_dim_params[i] = nullptr; + } + } + + // write to buffer @ byte [8 + dim_count * ptr_size, 8 + dim_count * ptr_size + dim_count * 4 + dim_count * 4) + size_t* p_dim_values = reinterpret_cast<size_t*>(type_info_buffer + 8 + dim_count * sizeof(size_t)); + for (size_t i = 0; i < dim_count; ++i) { + p_dim_values[i] = static_cast<size_t>(dim_values[i]); + } + } + } + + UNREGISTER_AUTO_RELEASE(name_cstr); + *name_cstr_ptr = name_cstr; + *type_info_ptr = type_info_buffer; + return ORT_OK; +} + char* OrtGetInputName(OrtSession* session, size_t index) { OrtAllocator* allocator = nullptr; RETURN_NULLPTR_IF_ERROR(GetAllocatorWithDefaultOptions, &allocator); diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index 9ff1eb55ecedc..c488c16f4a60c 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -145,21 +145,17 @@ int EMSCRIPTEN_KEEPALIVE
OrtGetInputOutputCount(ort_session_handle_t session, size_t* output_count); /** - * get the model's input name. + * get the metadata of the specified input or output of the model. * @param session handle of the specified session - * @param index the input index - * @returns a pointer to a buffer which contains C-style string. Caller must release the C style string after use by + * @param index the input index or output index. index should be in range [0, input_count + output_count). if the index + * is in range [0, input_count), it's an input index. otherwise, it's an output index. + * @param name_cstr_ptr [out] a pointer to a buffer which contains C-style string of the name of the input or output. Caller must release the C style string after use by * calling OrtFree(). - */ -char* EMSCRIPTEN_KEEPALIVE OrtGetInputName(ort_session_handle_t session, size_t index); -/** - * get the model's output name. - * @param session handle of the specified session - * @param index the output index - * @returns a pointer to a buffer which contains C-style string. Caller must release the C style string after use by + * @param type_info_ptr [out] a pointer to a buffer which contains the type information of the input or output. Caller must release the buffer after use by * calling OrtFree(). + * @returns ORT error code. If not zero, call OrtGetLastError() to get detailed error message. */ -char* EMSCRIPTEN_KEEPALIVE OrtGetOutputName(ort_session_handle_t session, size_t index); +int EMSCRIPTEN_KEEPALIVE OrtGetInputOutputMetadata(ort_session_handle_t session, size_t index, char** name_cstr_ptr, void** type_info_ptr); /** * free the specified buffer. diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index d5903c9d1d211..2493887dfc347 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -704,6 +704,7 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--armnn_home", help="Path to ArmNN home dir") parser.add_argument("--armnn_libs", help="Path to ArmNN libraries") parser.add_argument("--build_micro_benchmarks", action="store_true", help="Build ONNXRuntime micro-benchmarks.") + parser.add_argument("--no_kleidiai", action="store_true", help="Disable KleidiAI integration on Arm platforms.") # options to reduce binary size parser.add_argument( @@ -1628,6 +1629,8 @@ def generate_build_tree( if args.use_snpe: cmake_args += ["-Donnxruntime_USE_SNPE=ON"] + cmake_args += ["-Donnxruntime_USE_KLEIDIAI=" + ("OFF" if args.no_kleidiai else "ON")] + if args.macos or args.ios or args.visionos or args.tvos: # Note: Xcode CMake generator doesn't have a good support for Mac Catalyst yet. 
if args.macos == "Catalyst" and args.cmake_generator == "Xcode": diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index cbf70f32996db..f4658f3a22c33 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -41,11 +41,11 @@ parameters: variables: - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: linux_trt_version - value: 10.3.0.26-1.cuda11.8 + value: 10.9.0.34-1.cuda12.8 - name: Repository - value: 'onnxruntimecuda11manylinuxbuild' + value: 'onnxruntimecuda12manylinuxbuild' stages: - stage: Build_Onnxruntime_Cuda @@ -93,6 +93,7 @@ stages: $(Repository) \ /bin/bash -c ' set -ex; \ + ls /usr/local/; \ PATH=/opt/python/cp310-cp310/bin:$PATH /opt/python/cp310-cp310/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ --config Release --update --build \ @@ -100,9 +101,9 @@ stages: --build_shared_lib \ --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ + --enable_onnx_tests --use_cuda --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 \ --enable_cuda_profiling \ - --enable_pybind --build_java \ + --enable_pybind \ --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;86" ' workingDirectory: $(Build.SourcesDirectory) @@ -164,7 +165,7 @@ stages: DockerBuildArgs: " --build-arg BUILD_UID=$( id -u ) " - Repository: onnxruntimeubuntupackagestest_cuda11 + Repository: onnxruntimeubuntupackagestest_cuda12 UseImageCacheContainerRegistry: false - task: Cache@2 @@ -182,14 +183,14 @@ stages: -v $(Build.BinariesDirectory)/Release:/Release \ -v $(STABLE_DIFFUSION_MODEL_CACHE):/model_cache:rw \ -v $(GenerateImage_DIR):/images:rw \ - onnxruntimeubuntupackagestest_cuda11 \ + onnxruntimeubuntupackagestest_cuda12 \ bash -c ' \ set -ex; \ python3 --version; \ python3 -m pip install --upgrade pip; \ python3 -m pip install /Release/*.whl; \ pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion; \ - python3 -m pip install -r requirements/cuda11/requirements.txt; \ + python3 -m pip install -r requirements/cuda12/requirements.txt; \ python3 -m pip install numpy==1.22.2; \ python3 -m pip install --upgrade polygraphy onnx-graphsurgeon ; \ echo Generate an image guided by a text prompt; \ @@ -221,7 +222,7 @@ stages: - script: | docker run -e SYSTEM_COLLECTIONURI --rm --gpus all -v $PWD:/workspace \ -v $(CLIP_MODEL_CACHE):/model_cache:rw \ - onnxruntimeubuntupackagestest_cuda11 \ + onnxruntimeubuntupackagestest_cuda12 \ bash -c ' set -x; \ python3 --version; \ @@ -248,7 +249,7 @@ stages: - script: | docker run -e SYSTEM_COLLECTIONURI --rm --gpus all -v $PWD:/workspace \ -v $(CLIP_MODEL_CACHE):/model_cache:rw \ - onnxruntimeubuntupackagestest_cuda11 \ + onnxruntimeubuntupackagestest_cuda12 \ bash -c ' set -ex; \ python3 --version; \ @@ -445,7 +446,7 @@ stages: popd ; \ python3 -m pip install /ort-artifact/*.whl ; \ python3 -m pip uninstall -y torch ; \ - python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ + python3 -m pip install torch --index-url 
https://download.pytorch.org/whl/cu124 ; \ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output wtiny-fp32-cpu-hf --precision fp32 --provider cpu --overwrite --use_external_data_format --optimize_onnx --no_beam_search_op --output_cross_qk ; \ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output wtiny-fp32-cpu-hf --precision fp32 --provider cpu --overwrite --use_external_data_format --optimize_onnx ; \ python3 -m models.whisper.convert_to_onnx -m openai/whisper-tiny --output wtiny-fp32-cpu-hf --precision fp32 --provider cpu --overwrite --use_external_data_format --optimize_onnx --no_beam_search_op --output_cross_qk --separate_encoder_and_decoder_init ; \ @@ -489,7 +490,7 @@ stages: popd ; \ python3 -m pip install /ort-artifact/*.whl ; \ python3 -m pip uninstall -y torch ; \ - python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ + python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124 ; \ python3 -m models.whisper.convert_to_onnx -m /whisper_large_v3 --output whisperlargev3 --use_external_data_format ; \ popd ; \ ' @@ -510,7 +511,7 @@ stages: popd ; \ python3 -m pip install /ort-artifact/*.whl ; \ python3 -m pip uninstall -y torch ; \ - python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ + python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124 ; \ ls whisperlargev3; \ export LD_LIBRARY_PATH=/tmp/ompffmpeg:${LD_LIBRARY_PATH}; \ ffmpeg -version; \ diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index c44d3cff09e96..2dc597fcb2351 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,10 +8,10 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 10.8_cuda12.6_cudnn9 + default: 10.9_cuda12.8_cudnn9 values: - - 10.8_cuda11.8_cudnn8 - - 10.8_cuda12.6_cudnn9 + - 10.9_cuda11.8_cudnn8 + - 10.9_cuda12.8_cudnn9 - BIN - name: UseTensorrtOssParser diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_android.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_android.yml new file mode 100644 index 0000000000000..c988a97b6a56c --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_android.yml @@ -0,0 +1,63 @@ +parameters: + AgentPool : 'Win-CPU' + ArtifactSuffix: '' + SpecificArtifact: false + BuildId: '' + +stages: +- stage: NuGet_Test_Android + jobs: + - job: NuGet_Test_Android + workspace: + clean: all + pool: "${{ parameters.AgentPool }}" + + variables: + - name: OnnxRuntimeBuildDirectory + value: '$(Build.BinariesDirectory)' + + steps: + - task: NuGetToolInstaller@0 + displayName: Use Nuget 6.10.x + inputs: + versionSpec: 6.10.x + + - template: ../../templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Pipeline Artifact' + ArtifactName: drop-signed-nuget-${{ parameters.ArtifactSuffix }} + TargetPath: '$(Build.BinariesDirectory)\nuget-artifact' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: get-nuget-package-version-as-variable.yml + parameters: + packageFolder: '$(Build.BinariesDirectory)\nuget-artifact' + + - task: PowerShell@2 + displayName: Install MAUI workloads + inputs: + targetType: 'inline' + script: | + dotnet 
workload install maui maui-android android + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: PowerShell@2 + displayName: Publish Android MAUI APK + inputs: + targetType: 'inline' + script: | + dotnet nuget add source $(Build.BinariesDirectory)\nuget-artifact --name local-nuget + dotnet publish -c Release --property:UsePrebuiltNativePackage=true --property:CurrentOnnxRuntimeVersion=$(NuGetPackageVersionNumber) -f net8.0-android + workingDirectory: '$(Build.SourcesDirectory)\csharp\test\Microsoft.ML.OnnxRuntime.Tests.MAUI' + + - task: PowerShell@2 + displayName: Run BrowserStack test + inputs: + targetType: 'inline' + script: | + dotnet test + workingDirectory: '$(Build.SourcesDirectory)\csharp\test\Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android' + env: + BROWSERSTACK_USERNAME: $(browserstack_username) + BROWSERSTACK_ACCESS_KEY: $(browserstack_access_key) diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml index ed7d77246e862..5f5628a4326d3 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/react-natvie-andriod-e2e-test-job.yml @@ -44,9 +44,8 @@ jobs: versionSpec: '20.x' - script: | - sudo apt install coreutils ninja-build nodejs npm yarn - npm install --global yarn - displayName: Install coreutils, ninja, npm, and yarn + sudo apt install ninja-build + displayName: Install ninja - task: DownloadPipelineArtifact@2 inputs: @@ -62,44 +61,7 @@ jobs: targetFolder: $(Build.SourcesDirectory)/js/react_native/android/libs displayName: Copy Android package to React Native directory - - script: | - npm ci - workingDirectory: '$(Build.SourcesDirectory)/js' - displayName: npm ci js - - - script: | - npm ci - workingDirectory: '$(Build.SourcesDirectory)/js/common' - displayName: npm ci js/common - - - script: | - yarn - workingDirectory: '$(Build.SourcesDirectory)/js/react_native' - displayName: yarn js/react_native - - - task: PowerShell@2 - inputs: - filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/js/pack-npm-packages.ps1' - arguments: '"-dev.$(Get-Date -Format yyyyMMdd)-$(git rev-parse --short HEAD)" $(Build.SourcesDirectory) react_native' - workingDirectory: '$(Build.SourcesDirectory)' - errorActionPreference: stop - env: - ORT_JS_PACK_MODE: e2e - displayName: Pack NPM packages - - - script: | - mv $(Build.SourcesDirectory)/js/common/onnxruntime-common*.tgz onnxruntime-common.tgz - yarn add --no-lockfile file:./onnxruntime-common.tgz - mv $(Build.SourcesDirectory)/js/react_native/onnxruntime-react-native*.tgz onnxruntime-react-native.tgz - yarn add --no-lockfile file:./onnxruntime-react-native.tgz - yarn - workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' - displayName: Bootstrap Android and iOS e2e tests - - - script: | - yarn add --dev jest-junit - workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' - displayName: install jest junit reporter js/react_native/e2e + - template: steps/react-native-bootstrap-steps.yml - script: | keytool -genkey -v -keystore debug.keystore -alias androiddebugkey -storepass android \ @@ -114,15 +76,6 @@ jobs: targetFolder: $(Build.SourcesDirectory)/js/react_native/e2e/android/app/libs displayName: Copy Android package to Android e2e test directory - - script: | - yarn global add detox-cli - echo "Path: $PATH" - echo "##vso[task.prependpath]$(yarn global bin)" - echo 
"Updated PATH: $PATH" - echo "Detox bin directory: $(yarn global bin)" - ls $(yarn global bin) - displayName: Install detox cli tools and prepend to PATH - - script: | detox build --configuration android.emu.release workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' @@ -199,6 +152,7 @@ jobs: contents: onnxruntime-react-native*.tgz targetFolder: $(Build.ArtifactStagingDirectory) displayName: Create Artifacts onnxruntime-react-native + - ${{ if eq(parameters.is1ES, true) }}: - task: 1ES.PublishPipelineArtifact@1 inputs: diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/react-native-bootstrap-steps.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/react-native-bootstrap-steps.yml new file mode 100644 index 0000000000000..d0deb57cbf303 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/react-native-bootstrap-steps.yml @@ -0,0 +1,25 @@ +steps: +- script: + npm install -g detox-cli + displayName: Install detox cli tools + +- script: | + npm ci + workingDirectory: '$(Build.SourcesDirectory)/js' + displayName: npm ci js + +- script: | + npm ci + workingDirectory: '$(Build.SourcesDirectory)/js/common' + displayName: npm ci js/common + +- script: | + npm ci + npm run bootstrap-no-pods + workingDirectory: '$(Build.SourcesDirectory)/js/react_native' + displayName: bootstrap react_native + +- script: | + npm install --save-dev jest-junit + workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' + displayName: install jest junit reporter js/react_native/e2e \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml index 366ee3fcf4e92..7b257a4cdcdd5 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml @@ -24,8 +24,6 @@ jobs: pool: 'onnxruntime-Ubuntu2204-AMD-CPU' workspace: clean: all - pool: - vmImage: 'macOS-14' variables: runCodesignValidationInjection: false ANDROID_AVD_HOME: $(Agent.TempDirectory) @@ -114,4 +112,3 @@ jobs: - template: component-governance-component-detection-steps.yml parameters : condition : 'succeeded' - diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 7a46bdc7cde0a..d9b38e3eaba2e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -778,6 +778,13 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} +- template: ../nuget/templates/test_android.yml + parameters: + AgentPool : 'onnxruntime-Win-CPU-2022' + ArtifactSuffix: 'CPU' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + - template: ../nuget/templates/test_linux.yml parameters: AgentPool : onnxruntime-Ubuntu2204-AMD-CPU diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index eb22d003c462e..39a958e848784 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,5 +1,5 @@ variables: - common_trt_version: '10.8.0.43' + common_trt_version: '10.9.0.34' # As for Debian installation, replace '-1.' 
by '-1+' when assigning trt version below linux_trt_version_cuda11: ${{ variables.common_trt_version }}-1.cuda11.8 linux_trt_version_cuda12: ${{ variables.common_trt_version }}-1.cuda12.8 diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index ba1373fa6e338..674f16d8e9332 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.8.0.43' + default: '10.9.0.34' values: - 8.6.1.6 - - 10.8.0.43 + - 10.9.0.34 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,7 +42,7 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.8.0.43')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.9.0.34')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.8" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 1a0b8d7f867a6..142b76ee43b99 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -15,10 +15,10 @@ parameters: default: '11.8' - name: win_trt_folder_cuda11 type: string - default: 'TensorRT-10.8.0.43.Windows10.x86_64.cuda-11.8' + default: 'TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8' - name: win_trt_folder_cuda12 type: string - default: 'TensorRT-10.8.0.43.Windows10.x86_64.cuda-12.8' + default: 'TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8' steps: - ${{ if eq(parameters.DownloadCUDA, 'true') }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml index dffcea9500393..d7850153698e1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml @@ -125,11 +125,6 @@ jobs: WithCache: ${{ parameters.WithCache }} - ${{ if eq(parameters.BuildWebGPU, true) }}: - # This step only verifies whether the build is successful. - # currently, we uses EMSDK 3.1.59, which is not compatible with Dawn's changes in its Emscripten fork. Closure compiler will not work for WebGPU build. - # Only enables in DEBUG build. - # - # TODO: when upgrading to a newer Emscripten version, we should fix this step. 
- template: build-linux-wasm-step.yml parameters: Today: $(Today) @@ -138,31 +133,50 @@ jobs: ${{ else }}: AdditionalKey: wasm_inferencing_webgpu_exp | ${{ parameters.BuildConfig }} CacheDir: $(ORT_CACHE_DIR)/wasm_inferencing_webgpu - Arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)/wasm_inferencing_webgpu --use_webgpu --target onnxruntime_webassembly --skip_tests' + Arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)/wasm_inferencing_webgpu --use_webgpu --use_jsep --use_webnn --target onnxruntime_webassembly --skip_tests' DisplayName: 'Build (simd + threads + WebGPU experimental)' WithCache: ${{ parameters.WithCache }} - ${{ if eq(parameters.SkipPublish, false) }}: - script: | - cp $(Build.BinariesDirectory)/wasm_inferencing/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.wasm $(Build.ArtifactStagingDirectory) - cp $(Build.BinariesDirectory)/wasm_inferencing/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.mjs $(Build.ArtifactStagingDirectory) + mkdir -p $(Build.ArtifactStagingDirectory)/wasm/ + cp $(Build.BinariesDirectory)/wasm_inferencing/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.wasm $(Build.ArtifactStagingDirectory)/wasm/ + cp $(Build.BinariesDirectory)/wasm_inferencing/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.mjs $(Build.ArtifactStagingDirectory)/wasm/ if [ -d $(Build.BinariesDirectory)/wasm_inferencing_jsep ]; then - cp $(Build.BinariesDirectory)/wasm_inferencing_jsep/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.jsep.wasm $(Build.ArtifactStagingDirectory) - cp $(Build.BinariesDirectory)/wasm_inferencing_jsep/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.jsep.mjs $(Build.ArtifactStagingDirectory) + cp $(Build.BinariesDirectory)/wasm_inferencing_jsep/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.jsep.wasm $(Build.ArtifactStagingDirectory)/wasm/ + cp $(Build.BinariesDirectory)/wasm_inferencing_jsep/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.jsep.mjs $(Build.ArtifactStagingDirectory)/wasm/ fi displayName: 'Create Artifacts' + - ${{ if eq(parameters.BuildWebGPU, true) }}: + - script: | + mkdir -p $(Build.ArtifactStagingDirectory)/wasm_webgpu/ + cp $(Build.BinariesDirectory)/wasm_inferencing_webgpu/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.jsep.wasm $(Build.ArtifactStagingDirectory)/wasm_webgpu/ + cp $(Build.BinariesDirectory)/wasm_inferencing_webgpu/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.jsep.mjs $(Build.ArtifactStagingDirectory)/wasm_webgpu/ + displayName: 'Create Artifacts (WebGPU EP)' - ${{ if eq(parameters.is1ES, false) }}: - task: PublishPipelineArtifact@1 displayName: 'Publish Pipeline Artifact' inputs: artifactName: '${{ parameters.BuildConfig }}_wasm' - targetPath: '$(Build.ArtifactStagingDirectory)' + targetPath: '$(Build.ArtifactStagingDirectory)/wasm' + - ${{ if eq(parameters.BuildWebGPU, true) }}: + - task: PublishPipelineArtifact@1 + displayName: 'Publish Pipeline Artifact (WebGPU EP)' + inputs: + artifactName: '${{ parameters.BuildConfig }}_wasm_webgpu' + targetPath: '$(Build.ArtifactStagingDirectory)/wasm_webgpu' - ${{ if eq(parameters.is1ES, true) }}: - task: 1ES.PublishPipelineArtifact@1 displayName: 'Publish Pipeline Artifact' inputs: artifactName: '${{ parameters.BuildConfig }}_wasm' - targetPath: '$(Build.ArtifactStagingDirectory)' + targetPath: '$(Build.ArtifactStagingDirectory)/wasm' + - ${{ if eq(parameters.BuildWebGPU, true) }}: + - task: 1ES.PublishPipelineArtifact@1 + displayName: 'Publish Pipeline Artifact (WebGPU EP)' + inputs: + 
artifactName: '${{ parameters.BuildConfig }}_wasm_webgpu' + targetPath: '$(Build.ArtifactStagingDirectory)/wasm_webgpu' - task: PublishTestResults@2 displayName: 'Publish unit test results' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index c1309d345d819..c4de3271f5ca9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -120,11 +120,6 @@ stages: - script: brew install coreutils ninja npm displayName: Install coreutils, ninja, npm - - - script: - npm install -g detox-cli - displayName: Install detox cli tools - - script: brew tap wix/brew displayName: brew tap wix/brew @@ -133,50 +128,7 @@ stages: brew install applesimutils displayName: Install applesimutils tools required by detox ios - - script: | - npm ci - workingDirectory: '$(Build.SourcesDirectory)/js' - displayName: npm ci js - - - script: | - npm ci - workingDirectory: '$(Build.SourcesDirectory)/js/common' - displayName: npm ci js/common - - - script: | - npm install - workingDirectory: '$(Build.SourcesDirectory)/js/react_native' - displayName: npm install js/react_native - - - task: PowerShell@2 - inputs: - filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/js/pack-npm-packages.ps1' - arguments: '"-dev.$(Get-Date -Format yyyyMMdd)-$(git rev-parse --short HEAD)" $(Build.SourcesDirectory) react_native' - workingDirectory: '$(Build.SourcesDirectory)' - errorActionPreference: stop - env: - ORT_JS_PACK_MODE: e2e - displayName: Pack NPM packages - - - script: | - set -e -x - npm install - ls node_modules - mv $(Build.SourcesDirectory)/js/common/onnxruntime-common*.tgz onnxruntime-common.tgz - mv $(Build.SourcesDirectory)/js/react_native/onnxruntime-react-native*.tgz onnxruntime-react-native.tgz - npm install ./onnxruntime-common.tgz - ls node_modules/onnxruntime* - npm install ./onnxruntime-react-native.tgz - ls node_modules/onnxruntime* - - - workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' - displayName: Bootstrap Android and iOS e2e tests - - - script: | - npm install --save-dev jest-junit - workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' - displayName: install jest junit reporter js/react_native/e2e + - template: ../stages/jobs/steps/react-native-bootstrap-steps.yml - script: | ORT_C_LOCAL_POD_PATH=$(Build.BinariesDirectory)/ios-full-pod/onnxruntime-c \ diff --git a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml index 2e3589ee87c29..e6d86b8802148 100644 --- a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml @@ -147,7 +147,7 @@ stages: ExtraBuildArgs: '--target onnxruntime_webassembly --skip_tests --enable_wasm_api_exception_catching --disable_rtti ${{ parameters.ExtraBuildArgs }}' PoolName: ${{ parameters.PoolName }} BuildJsep: ${{ parameters.BuildJsep }} - BuildWebGPU: false + BuildWebGPU: ${{ parameters.BuildWebGPU }} WithCache: ${{ parameters.WithCache }} is1ES: ${{ parameters.is1ES }} diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 6868043f64d81..4ae7423803ddf 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -81,12 +81,12 @@ jobs: 
versionSpec: '20.x' - task: DownloadPipelineArtifact@2 inputs: - patterns: '${{ parameters.BuildConfig }}_*/**/*' - path: $(Pipeline.Workspace)\artifacts + patterns: '${{ parameters.BuildConfig }}_wasm/**/*' + path: $(Pipeline.Workspace)\artifacts_wasm displayName: 'Download WebAssembly artifacts' - task: CopyFiles@2 inputs: - sourceFolder: $(Pipeline.Workspace)\artifacts + sourceFolder: $(Pipeline.Workspace)\artifacts_wasm contents: | **\ort-*.wasm targetFolder: $(Build.SourcesDirectory)\js\web\dist @@ -94,7 +94,7 @@ jobs: displayName: 'Binplace dist files (.wasm)' - task: CopyFiles@2 inputs: - sourceFolder: $(Pipeline.Workspace)\artifacts + sourceFolder: $(Pipeline.Workspace)\artifacts_wasm contents: | **\ort-*.mjs targetFolder: $(Build.SourcesDirectory)\js\web\dist @@ -221,6 +221,47 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests - WebAssembly: proxy' condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) + + # === Start of experimental WebGPU EP tests === + + - ${{ if eq(parameters.RunWebGpuTests, true) }}: + - task: DownloadPipelineArtifact@2 + inputs: + patterns: '${{ parameters.BuildConfig }}_wasm_webgpu/**/*' + path: $(Pipeline.Workspace)\artifacts_wasm_webgpu + displayName: 'Download WebAssembly artifacts' + - task: CopyFiles@2 + inputs: + sourceFolder: $(Pipeline.Workspace)\artifacts_wasm_webgpu + contents: | + **\ort-*.wasm + targetFolder: $(Build.SourcesDirectory)\js\web\dist + flattenFolders: true + overWrite: true + displayName: 'Binplace dist files (.wasm)' + - task: CopyFiles@2 + inputs: + sourceFolder: $(Pipeline.Workspace)\artifacts_wasm_webgpu + contents: | + **\ort-*.mjs + targetFolder: $(Build.SourcesDirectory)\js\web\dist + flattenFolders: true + overWrite: true + displayName: 'Binplace dist files (.mjs)' + - script: | + powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" + displayName: 'Check active Chrome processes (before test)' + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) + - script: | + mkdir $(Agent.TempDirectory)\web\test\07 + dir $(Agent.TempDirectory)\web\test\07 + npm test --webgpu-ep -- -b=webgpu -e=chrome $(webgpuCommandlineExtraFlags) --user-data-dir=$(Agent.TempDirectory)\web\test\07 --chromium-flags=--enable-logging --chromium-flags=--v=1 + workingDirectory: '$(Build.SourcesDirectory)\js\web' + displayName: 'Run ort-web tests - WebGPU EP' + continueOnError: true # we allow WebGPU EP tests to fail for now + + # === End of experimental WebGPU EP tests === + - script: | npm run test:e2e -- --browser=Chrome_default workingDirectory: '$(Build.SourcesDirectory)\js\web' @@ -256,12 +297,13 @@ jobs: displayName: 'Publish Pipeline Artifact' condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) - script: | - if exist 01 (echo ------------------- BEGIN 01 -------------------&&type 01\chrome_debug.log&&echo ------------------- END 01 ------------------- ) - if exist 02 (echo ------------------- BEGIN 02 -------------------&&type 02\chrome_debug.log&&echo ------------------- END 02 ------------------- ) - if exist 03 (echo ------------------- BEGIN 03 -------------------&&type 03\chrome_debug.log&&echo ------------------- END 03 ------------------- ) - if exist 04 (echo ------------------- BEGIN 04 -------------------&&type 04\chrome_debug.log&&echo ------------------- END 04 ------------------- ) - if exist 05 (echo ------------------- BEGIN 05 -------------------&&type 05\chrome_debug.log&&echo 
------------------- END 05 ------------------- ) - if exist 06 (echo ------------------- BEGIN 06 -------------------&&type 06\chrome_debug.log&&echo ------------------- END 06 ------------------- ) + for %%i in (01 02 03 04 05 06 07) do ( + if exist %%i ( + echo ------------------- BEGIN %%i ------------------- + type %%i\chrome_debug.log + echo ------------------- END %%i ------------------- + ) + ) displayName: 'Log Chrome processes (after test)' workingDirectory: '$(Agent.TempDirectory)\web\test' condition: always() diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index c42042b0ec639..6552c423617b5 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION +ARG TRT_VERSION=10.9.0.34-1.cuda12.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "$TRT_VERSION" ]; then \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index 7abc7a6d35ec3..1933fd371d3bc 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 -ARG TRT_VERSION=10.8.0.43-1.cuda12.8 +ARG TRT_VERSION=10.9.0.34-1.cuda12.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 2df43197b7d39..62562705c92b2 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.8.0.43-1.cuda11.8 +ARG TRT_VERSION=10.9.0.34-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 64186e83f001a..9b392fa0e3a68 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.8.0.43-1+cuda11.8 +ARG TRT_VERSION=10.9.0.34-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg index a563a1926ed39..bf3b50880a252 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg @@ -5,8 +5,8 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.8.0.43-1+cuda11.8 +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 +ARG TRT_VERSION=10.9.0.34-1+cuda12.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION @@ -28,7 +28,7 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip # Install TensorRT -RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ libnvinfer-dev=${TRT_VERSION} \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv index 7f5e8f871f415..c6931147f96f9 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv @@ -5,8 +5,8 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.8.0.43-1+cuda11.8 +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 +ARG TRT_VERSION=10.9.0.34-1+cuda12.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION @@ -30,7 +30,7 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip # Install TensorRT -RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ libnvinfer-dev=${TRT_VERSION} \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index f454d21164ac4..f68f488a9d8b8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN TRT_VERSION="10.8.0.43-1+cuda11.8" &&\ +RUN TRT_VERSION="10.9.0.34-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index a9dbefc6faee0..1d3575411a692 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS base +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code @@ 
-31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil # Install TensorRT -RUN TRT_VERSION="10.8.0.43-1+cuda12.8" &&\ +RUN TRT_VERSION="10.9.0.34-1+cuda12.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin index 7dd302a6b03da..03f14732b70f8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT installed from provided binaries # Build base image with required system packages -FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04 AS base +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index 0c105daa38ac8..d1df74e2a4506 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.8.0.43-1.cuda11.8 +ARG TRT_VERSION=10.9.0.34-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index fd9890fa12fb3..ecadab5d3f8a3 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.8.0.43.Windows10.x86_64.cuda-12.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH% @REM The default version is still cuda v12.2, because set cuda v11.8 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.8.0.43.Windows10.x86_64.cuda-11.8\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-11.8\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index f598d25353c4a..45e0d970fb541 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.8.0.43.Windows10.x86_64.cuda-12.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set CUDA_MODULE_LOADING=LAZY diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt 
b/tools/ci_build/requirements/transformers-test/requirements.txt index 14aeff3df9c62..0fb37e3a1550a 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -8,5 +8,6 @@ torch coloredlogs==15.0 transformers==4.46.3 parameterized>=0.8.1 +sentencepiece psutil einops